Mise à jour terminée. Pour connaître les apports de la version 13.8.4 par rapport à notre ancienne version vous pouvez lire les "Release Notes" suivantes :
https://about.gitlab.com/releases/2021/02/11/security-release-gitlab-13-8-4-released/
https://about.gitlab.com/releases/2021/02/05/gitlab-13-8-3-released/

segment.cpp 29.5 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
  This file is part of Vidjil <http://www.vidjil.org>
3
  Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016 by Bonsai bioinformatics
4 5 6 7 8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24 25 26 27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
29
#include <cstring>
Marc Duez's avatar
Marc Duez committed
30
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
31

32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49

AlignBox::AlignBox() {
  del_left = 0 ;
  start = 0 ;
  end = 0 ;
  del_right = 0 ;

  ref_nb = 0 ;
  ref = "";
  ref_label = "";
}

string AlignBox::getSequence(string sequence) {
  return sequence.substr(start, end-start+1);
}



Mathieu Giraud's avatar
Mathieu Giraud committed
50
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
51

Mathieu Giraud's avatar
Mathieu Giraud committed
52
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
73
  return box_V->end;
Mikaël Salson's avatar
Mikaël Salson committed
74 75 76
}
  
int Segmenter::getRight() const {
77
  return box_J->start;
Mikaël Salson's avatar
Mikaël Salson committed
78 79 80
}

int Segmenter::getLeftD() const {
81
  return box_D->start;
Mikaël Salson's avatar
Mikaël Salson committed
82 83 84
}
  
int Segmenter::getRightD() const {
85
  return box_D->end;
Mikaël Salson's avatar
Mikaël Salson committed
86 87 88 89 90 91 92 93 94 95 96 97 98 99
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118

// E-values

void Segmenter::checkLeftRightEvaluesThreshold(double threshold, int strand)
{
  if (threshold == NO_LIMIT_VALUE)
    return ;

  if (evalue_left >= threshold && evalue_right >= threshold)
    because = UNSEG_TOO_FEW_ZERO ;
  else if ((strand == 1 ? evalue_left : evalue_right) >= threshold)
    because = UNSEG_TOO_FEW_V ;
  else if ((strand == 1 ? evalue_right : evalue_left) >= threshold)
    because = UNSEG_TOO_FEW_J ;
  else if (evalue >= threshold) // left and right are <= threshold, but their sum is > threshold
    because = UNSEG_TOO_FEW_ZERO ;
}


Mikaël Salson's avatar
Mikaël Salson committed
119 120 121 122 123 124 125 126
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

127
  if (box_V->end >= box_J->start)
Mikaël Salson's avatar
Mikaël Salson committed
128
    {
129 130 131 132
      int middle = (box_V->end + box_J->start) / 2 ;
      chevauchement = " !ov " + string_of_int (box_V->end - box_J->start + 1);
      box_V->end = middle ;
      box_J->start = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
133 134 135 136 137 138 139 140 141 142 143 144 145 146
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
147 148 149 150 151
  seg_V = seq.substr(0, box_V->end+1) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(box_J->start) ;
  box_D->start=0;
  box_D->end=0;
Mikaël Salson's avatar
Mikaël Salson committed
152 153 154 155 156 157 158 159 160 161 162 163 164

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

165 166 167 168
  seg_V = seq.substr(0, box_V->end+1) ; // From pos. 0 to box_V->end
  seg_J = seq.substr(box_J->start) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_D  = seq.substr(box_D->start, box_D->end-box_D->start+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
169
  
170 171 172 173
  info = "VDJ \t0 " + string_of_int(box_V->end) +
                " " + string_of_int(box_D->start) +
		" " + string_of_int(box_D->end) +
		" " + string_of_int(box_J->start) +
Mikaël Salson's avatar
Mikaël Salson committed
174 175 176 177 178 179 180 181 182
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

183 184
string Segmenter::getInfoLine() const
{
185
  string s = "" ;
186 187 188 189 190

  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
191 192 193 194

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

195 196 197 198 199
  if (evalue_left > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue_left);
  if (evalue_right > NO_LIMIT_VALUE)
    s += "/" + scientific_string_of_double(evalue_right);

200 201 202
  return s ;
}

203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
string KmerSegmenter::getInfoLineWithAffects() const
{
   stringstream ss;

   ss << "# "
      << right << setw(3) << score << " "
      << left << setw(30)
      << getInfoLine() ;

   if (getSegmentationStatus() != UNSEG_TOO_SHORT)
     ss << getKmerAffectAnalyser()->toString();

   return ss.str();
}


Mikaël Salson's avatar
Mikaël Salson committed
219 220
ostream &operator<<(ostream &out, const Segmenter &s)
{
221
  out << ">" << s.label << " " ;
222
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
223 224 225 226 227 228 229 230 231

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
232
      out << s.getSequence().sequence << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
233 234 235 236 237 238 239 240
    }

  return out ;
}


// KmerSegmenter (Cheap)

241 242
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

243
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
244
{
245 246 247 248
  box_V = new AlignBox();
  box_D = new AlignBox();
  box_J = new AlignBox();

Mikaël Salson's avatar
Mikaël Salson committed
249 250 251
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
252
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
253
  segmented = false;
254
  segmented_germline = germline ;
255
  system = germline->code; // useful ?
256
  reversed = false;
257
  because = NOT_PROCESSED ; // Cause of unsegmentation
258
  score = 0 ;
259
  evalue = NO_LIMIT_VALUE;
260 261
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
262

263
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
264
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
265

266
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
267
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
268 269
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
270
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
271 272
    }
 
273
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
274 275
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
276 277 278 279 280 281 282 283 284
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
285 286 287
    }
  }

288 289
  score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines

290 291
  reversed = (nb_strand[0] > nb_strand[1]) ;

292 293
  if ((germline->seg_method == SEG_METHOD_MAX12)
      || (germline->seg_method == SEG_METHOD_MAX1U))
294 295 296 297
    { // Pseudo-germline, MAX12 and MAX1U
      pair <KmerAffect, KmerAffect> max12 ;
      CountKmerAffectAnalyser ckaa(*(germline->index), sequence);

298 299 300 301 302

      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());

303
      if (germline->seg_method == SEG_METHOD_MAX12)
304 305 306
        // MAX12: two maximum k-mers (no unknown)
        {
          max12 = ckaa.max12(forbidden);
307

308 309 310 311 312 313
          if (max12.first.isUnknown() || max12.second.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
        }
314

315 316
      else
        // MAX1U: the maximum k-mers (no unknown) + unknown
317
        {
318 319 320 321 322 323 324 325 326
          CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
          KmerAffect max = ckaa.max(forbidden);

          if (max.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
          max12 = make_pair(max, KmerAffect::getUnknown());
327 328
        }

329 330 331 332 333
      pair <KmerAffect, KmerAffect> before_after =  ckaa.sortLeftRight(max12);

      before = before_after.first ;
      after = before_after.second ;

334 335 336
      // This strand computation is only a heuristic, especially for chimera +/- reads
      // Anyway, it allows to gather such reads and their reverse complement into a unique window...
      // ... except when the read is quite different outside the window
337
      strand = reversed ? -1 : 1 ;
338 339 340 341 342
    }

  else
    { // Regular germline

343
  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
344
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
345
    because = UNSEG_TOO_FEW_ZERO ;
346
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
347 348
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
349 350
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
351 352
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
353 354
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
355 356
  } else {
    // Ambiguous information: we have positive and negative strands
357 358 359 360 361
    // and there is not enough difference to put them apart.
    if (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD_STRAND)
      because = UNSEG_STRAND_NOT_CONSISTENT ;
    else
      because = UNSEG_TOO_FEW_ZERO ;
362
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
363 364
  }

365
    } // endif Pseudo-germline
366
 
367
  computeSegmentation(strand, before, after, threshold, multiplier);
Mathieu Giraud's avatar
Mathieu Giraud committed
368
}
Mikaël Salson's avatar
Mikaël Salson committed
369

Mathieu Giraud's avatar
Mathieu Giraud committed
370
KmerSegmenter::~KmerSegmenter() {
371 372
  if (kaa)
    delete kaa;
373 374
}

375 376
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
                                       double threshold, int nb_reads_for_evalue)
377
{
378 379
  bool found_seg = false ; // Found a segmentation
  double best_evalue_seg = NO_LIMIT_VALUE ; // Best evalue, segmented sequences
380
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
381
  the_kseg = NULL;
382
  multi_germline = multigermline;
383
  threshold_nb_expected = threshold;
384 385 386

  // E-value multiplier
  int multiplier = multi_germline->germlines.size() * nb_reads_for_evalue;
387
  
388 389 390 391 392
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

393
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, threshold, multiplier);
394
      bool keep_seg = false;
395

396 397 398
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
399
          *out_unsegmented << kseg->getInfoLineWithAffects() << endl ;
400 401
        }

402 403
      // Always remember the first kseg
      if (the_kseg == NULL)
404
        keep_seg = true;
405
      
406
      if (kseg->isSegmented())
407 408
        {
          // Yes, it is segmented
409
          // Should we keep the kseg ?
410
          if (!found_seg || (kseg->evalue < best_evalue_seg))
411
            {
412
              keep_seg = true;
413 414 415
              best_evalue_seg = kseg->evalue ;

              found_seg = true;
416
            }
417
        }
418 419 420 421 422 423 424
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
425
              if (!found_seg)
426 427 428 429
                keep_seg = true;
            }
        }
      
430
      if (keep_seg) {
431 432
        if (the_kseg)
          delete the_kseg;
433 434 435 436
        the_kseg = kseg;
      } else {
        delete kseg;
      }
437 438 439
    } // end for (Germlines)
}

440
KmerMultiSegmenter::~KmerMultiSegmenter() {
441 442
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
443 444
}

445 446
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after,
                                        double threshold, int multiplier) {
447
  // Try to segment, computing 'box_V->end' and 'box_J->start'
448 449
  // If not segmented, put the cause of unsegmentation in 'because'

450
  affect_infos max;
451
  max = kaa->getMaximum(before, after);
Mikaël Salson's avatar
Mikaël Salson committed
452

453 454 455
  // We did not find a good segmentation point
  if (!max.max_found) {
    // We labeled it detected if there were both enough affect_5 and enough affect_3
456 457 458 459 460 461 462 463 464
    bool detected_before = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD);
    bool detected_after = (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);

    if (detected_before && detected_after)
      because = UNSEG_AMBIGUOUS ;
    else if ((strand == 1 && detected_before) || (strand == -1 && detected_after))
      because = UNSEG_TOO_FEW_J ;
    else if ((strand == 1 && detected_after) || (strand == -1 && detected_before))
      because = UNSEG_TOO_FEW_V ;
465
    else
466
      because = UNSEG_TOO_FEW_ZERO ;
467 468 469 470 471

    return ;
  }


472
  // E-values
473 474 475
  pair <double, double> pvalues = kaa->getLeftRightProbabilityAtLeastOrAbove();
  evalue_left = pvalues.first * multiplier ;
  evalue_right = pvalues.second * multiplier ;
476
  evalue = evalue_left + evalue_right ;
477

478
  checkLeftRightEvaluesThreshold(threshold, strand);
479

480
  if (because != NOT_PROCESSED)
481 482
    return ;

483 484
   // There was a good segmentation point

485 486
   box_V->end = max.first_pos_max;
   box_J->start = max.last_pos_max + 1;
487
   if (strand == -1) {
488 489 490
     int tmp = sequence.size() - box_V->end - 1;
     box_V->end = sequence.size() - box_J->start - 1;
     box_J->start = tmp;
491 492 493 494 495 496
   }

  // Yes, it is segmented
  segmented = true;
  because = reversed ? SEG_MINUS : SEG_PLUS ;

497
  info = string_of_int(box_V->end + FIRST_POS) + " " + string_of_int(box_J->start + FIRST_POS)  ;
498 499 500 501 502 503

  // removeChevauchement is called once info was already computed: it is only to output info_extra
  info_extra += removeChevauchement();
  finishSegmentation();

  return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
504
}
Mikaël Salson's avatar
Mikaël Salson committed
505

506
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
507 508
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
509

510
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
511
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
512 513
}

514 515 516 517 518
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
519 520
// FineSegmenter

521

522
string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
523 524
                                 AlignBox *box_left, AlignBox *box_right,
                                 Cost segment_cost)
Mikaël Salson's avatar
Mikaël Salson committed
525
{
526
  // Overlap size
527
  int overlap = box_left->end - box_right->start + 1;
528 529 530

  if (overlap > 0)
    {
531 532
      string seq_left = seq.substr(seq_begin, box_left->end - seq_begin + 1);
      string seq_right = seq.substr(box_right->start, seq_end - box_right->start + 1);
533

Mikaël Salson's avatar
Mikaël Salson committed
534 535 536 537
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
538
      DynProg dp_l = DynProg(seq_left, box_left->ref,
Mikaël Salson's avatar
Mikaël Salson committed
539
			   DynProg::Local, segment_cost);
540
      score_l[0] = dp_l.compute();
541

Mikaël Salson's avatar
Mikaël Salson committed
542 543
      
      //RIGHT
544
      // reverse right sequence
545
      string ref_right=string(box_right->ref.rbegin(), box_right->ref.rend());
546 547 548
      seq_right=string(seq_right.rbegin(), seq_right.rend());


549
      DynProg dp_r = DynProg(seq_right, ref_right,
Mikaël Salson's avatar
Mikaël Salson committed
550
			   DynProg::Local, segment_cost);
551
      score_r[0] = dp_r.compute();
552 553 554 555 556 557 558 559 560



      int trim_l[overlap+1];
      int trim_r[overlap+1];

      for(int i=0; i<=overlap; i++) {
        score_l[i] = i < seq_left.size()  ? dp_l.best_score_on_i(seq_left.size()  - i, trim_l + i) : MINUS_INF ;
        score_r[i] = i < seq_right.size() ? dp_r.best_score_on_i(seq_right.size() - i, trim_r + i) : MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
561
      }
562

563 564 565 566 567 568

// #define DEBUG_OVERLAP
#ifdef DEBUG_OVERLAP
      cout << dp_l ;
      cout << dp_r ;

569 570
      cout << "seq:" << seq_left << "\t\t" << seq_right << endl;
      cout << "ref:" << ref_left << "\t\t" << ref_right << endl;
571 572 573 574 575 576 577 578
      for(int i=0; i<=overlap; i++)
        cout << i << "  left: " << score_l[i] << "/" << trim_l[i] << "     right: " << score_r[i] << "/" << trim_r[i] << endl;
#endif

      int score = MINUS_INF;
      int best_i = 0 ;
      int best_j = 0 ;

Mikaël Salson's avatar
Mikaël Salson committed
579

580
      // Find (i, j), with i+j >= overlap,
581
      // maximizing score_l[j] + score_r[i]
Mikaël Salson's avatar
Mikaël Salson committed
582 583
      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
584
          int score_ij = score_l[i] + score_r[j];
585

586 587 588
	  if (score_ij > score) {
            best_i = i ;
            best_j = j ;
589 590
            box_left->del_right = box_left->ref.size() - trim_l[i];
	    box_right->del_left = box_right->ref.size() - trim_r[j];
591
	    score = score_ij;
Mikaël Salson's avatar
Mikaël Salson committed
592 593 594
	  }
	}
      }
595

596 597
      box_left->end -= best_i ;
      box_right->start += best_j ;
598 599

#ifdef DEBUG_OVERLAP
600
      cout << "overlap: " << overlap << ", " << "best_overlap_split: " << score
601 602
           << "    left: " << best_i << "-" << box_left->del_right << " @" << box_left->end
           << "    right:" << best_j << "-" << box_right->del_left << " @" << box_right->start
603
           << endl;
604
#endif
605 606
    } // end if (overlap > 0)

607 608
  // From box_left->end + 1 to box_right->start - 1
  return seq.substr(box_left->end + 1, box_right->start - box_left->end - 1);
Mikaël Salson's avatar
Mikaël Salson committed
609 610
}

Mikaël Salson's avatar
Mikaël Salson committed
611 612 613 614 615
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

616 617 618 619 620

/**
 * Align a read against a collection of sequences, maximizing the alignment 'score'
 * @param read:         the read
 * @param rep:          a collection of reference sequences
621
 * @param reverse_ref:  if true, reverse the reference sequences (VkVk)
622
 * @param reverse_both: if true, reverse both the read and the reference sequences (J segment)
623
 * @param local:        if true, Local alignment (D segment), otherwise LocalEndWithSomeDeletions and onlyBottomTriangle (V and J segments)
624 625 626 627 628 629 630 631 632 633
 * @param segment_cost: the cost used by the dynamic programing
 * @return score:       the maximized score, together with the following values:
 * @return tag:           - the name of the sequence
 * @return del:           - the number of deletions at the end of the reference sequence (or at the start if 'reverse_both')
 * @return del2:          - the number of deletions at the start of the reference sequence (used with 'local')
 * @return begin:         - the start position of the aligned segment in the read
 * @return length:        - the length of the alignmed segment in the read
 * @return best_best_i: ?
 */

634 635
int align_against_collection(string &read, Fasta &rep, bool reverse_ref, bool reverse_both, bool local,
                             AlignBox *box, int *length, Cost segment_cost)
Mikaël Salson's avatar
Mikaël Salson committed
636 637 638
{
  
  int best_score = MINUS_INF ;
639
  box->ref_nb = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
640 641 642 643
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
644

Mikaël Salson's avatar
Mikaël Salson committed
645
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
646 647 648

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
649 650 651

  // With reverse_ref, the read is reversed to prevent calling revcomp on each reference sequence
  string sequence_or_rc = revcomp(read, reverse_ref);
Mikaël Salson's avatar
Mikaël Salson committed
652 653 654
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
655
      DynProg dp = DynProg(sequence_or_rc, rep.sequence(r),
Mikaël Salson's avatar
Mikaël Salson committed
656 657 658
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
659 660 661

      bool onlyBottomTriangle = !local ;
      int score = dp.compute(onlyBottomTriangle, BOTTOM_TRIANGLE_SHIFT);
Mikaël Salson's avatar
Mikaël Salson committed
662
      
Mikaël Salson's avatar
Mikaël Salson committed
663 664 665 666 667 668 669 670 671 672 673
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
674 675
	  box->ref_nb = r ;
	  box->ref_label = rep.label(r) ;
Mikaël Salson's avatar
Mikaël Salson committed
676
	}
Mikaël Salson's avatar
Mikaël Salson committed
677 678
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
679 680 681 682 683 684

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
685 686

    }
Mikaël Salson's avatar
Mikaël Salson committed
687
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
688

689 690 691 692 693 694
  box->ref = rep.sequence(box->ref_nb);
  box->del_right = reverse_both ? best_best_j : box->ref.size() - best_best_j - 1;
  box->del_left = best_first_j;
  box->start = best_first_i;

  *length -= box->del_right ;
Mikaël Salson's avatar
Mikaël Salson committed
695
  
696
  box->score = score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
697 698 699

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
700
  cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl;
Mathieu Giraud's avatar
Mathieu Giraud committed
701 702
  cout << endl;
#endif
703 704 705 706

  if (reverse_ref)
    // Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ;
    best_best_i = read.length() - best_best_i - 1 ;
Mikaël Salson's avatar
Mikaël Salson committed
707 708 709 710 711 712 713 714 715
  
  return best_best_i ;
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

716
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,  double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
717
{
718 719 720 721
  box_V = new AlignBox();
  box_D = new AlignBox();
  box_J = new AlignBox();

722 723
  segmented = false;
  dSegmented = false;
724
  because = NOT_PROCESSED ;
725
  segmented_germline = germline ;
726
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
727 728 729
  label = seq.label ;
  sequence = seq.sequence ;
  segment_cost=segment_c;
730
  evalue = NO_LIMIT_VALUE;
731 732
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
733

734 735
  CDR3start = -1;
  CDR3end = -1;
736

737 738 739
  bool reverse_V = false ;
  bool reverse_J = false ;

740
  if ((germline->seg_method == SEG_METHOD_MAX12) || (germline->seg_method == SEG_METHOD_MAX1U))
741
    {
742
      // We check whether this sequence is segmented with MAX12 or MAX1U (with default e-value parameters)
743
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
744
      if (kseg->isSegmented())
745
        {
746 747 748 749 750
          reversed = kseg->isReverse();

          KmerAffect left = reversed ? KmerAffect(kseg->after, true) : kseg->before ;
          KmerAffect right = reversed ? KmerAffect(kseg->before, true) : kseg->after ;

751 752
          delete kseg ;

753 754 755
          reverse_V = (left.getStrand() == -1);
          reverse_J = (right.getStrand() == -1);

756
          code = "Unexpected ";
757

758 759 760 761
          code += left.toStringSigns() + germline->index->getLabel(left).name;
          code += "/";
          code += right.toStringSigns() + germline->index->getLabel(right).name;
          info_extra += " " + left.toString() + "/" + right.toString() + " (" + code + ")";
762 763 764 765 766

          if (germline->seg_method == SEG_METHOD_MAX1U)
            return ;

          germline->override_rep5_rep3_from_labels(left, right);
767
        }
768
      else
769 770 771 772
        {
          delete kseg ;
          return ;
        }
773
    }
774

775 776 777 778 779 780 781
  // Strand determination, with KmerSegmenter (with default e-value parameters)
  // Note that we use only the 'strand' component
  // When the KmerSegmenter fails, continue with positive strand
  // TODO: flag to force a strand / to test both strands ?

  KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
  reversed = kseg->isReverse();
782
  delete kseg ;
Mikaël Salson's avatar
Mikaël Salson committed
783
  
784 785
  string sequence_or_rc = revcomp(sequence, reversed); // sequence, possibly reversed

Mathieu Giraud's avatar
Mathieu Giraud committed
786 787

  /* Segmentation */
Mikaël Salson's avatar
Mikaël Salson committed
788
  int plus_length = 0 ;
789 790 791 792 793 794
  box_V->end = align_against_collection(sequence_or_rc, germline->rep_5, reverse_V, reverse_V, false,
                                        box_V, &plus_length, segment_cost);

  box_J->start = align_against_collection(sequence_or_rc, germline->rep_3, reverse_J, !reverse_J, false,
                                          box_J, &plus_length, segment_cost);
  box_J->del_left = box_J->del_right; // should be directly in align_against_collection() ?
795

796
  plus_length += box_J->start - box_V->end ;
Mikaël Salson's avatar
Mikaël Salson committed
797

798
  /* E-values */
799 800
  evalue_left  = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
  evalue_right = multiplier * sequence.size() * germline->rep_3.totalSize() * segment_cost.toPValue(box_J->score[0].first);
801
  evalue = evalue_left + evalue_right ;
802 803

  /* Unsegmentation causes */
804
  if (box_V->end == (int) string::npos)
805
    {
806
      evalue_left = BAD_EVALUE ;
807
    }
808
      
809
  if (box_J->start == (int) string::npos)
810
    {
811
      evalue_right = BAD_EVALUE ;
812 813
    }

814 815
  checkLeftRightEvaluesThreshold(threshold, reversed ? -1 : 1);

816 817 818
  if (because != NOT_PROCESSED)
    {
      segmented = false;
819
      info = " @" + string_of_int (box_V->end + FIRST_POS) + "  @" + string_of_int(box_J->start + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
820 821
      return ;
    }
822 823 824 825

  /* The sequence is segmented */
  segmented = true ;
  because = reversed ? SEG_MINUS : SEG_PLUS ;
826

Mikaël Salson's avatar
Mikaël Salson committed
827
    //overlap VJ
828
  seg_N = check_and_resolve_overlap(sequence_or_rc, 0, sequence_or_rc.length(),
829
                                    box_V, box_J, segment_cost);
830

831
  // Why could this happen ?
832 833
      if (box_J->start>=(int) sequence.length())
	  box_J->start=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
834

835
  // seg_N will be recomputed in finishSegmentation()
Mikaël Salson's avatar
Mikaël Salson committed
836

837 838
  code = box_V->ref_label +
    " "+ string_of_int(box_V->del_right) +
Mikaël Salson's avatar
Mikaël Salson committed
839 840
    "/" + seg_N + 
    // chevauchement +
841 842
    "/" + string_of_int(box_J->del_left) +
    " " + box_J->ref_label;
Mikaël Salson's avatar
Mikaël Salson committed
843

844
  info = string_of_int(box_V->end + FIRST_POS) + " " + string_of_int(box_J->start + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
845 846 847
  finishSegmentation();
}

848
bool FineSegmenter::FineSegmentD(Germline *germline,
849
                                 AlignBox *box_Y, AlignBox *box_DD, AlignBox *box_Z,
850
                                 double evalue_threshold, int multiplier){
Mikaël Salson's avatar
Mikaël Salson committed
851 852 853
    int length = 0 ;
    
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
854
    int l = box_Y->end - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
855 856 857
    if (l<0) 
      l=0 ;

858
    int r = box_Z->start + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
859

860 861 862 863
    string seq = getSequence().sequence; // segmented sequence, possibly rev-comped

    if (r > (int) seq.length())
      r = seq.length();
Mikaël Salson's avatar
Mikaël Salson committed
864
      
865
    string str = seq.substr(l, r-l);
Mikaël Salson's avatar
Mikaël Salson committed
866 867

    // Align
868 869
    box_DD->end = align_against_collection(str, germline->rep_4, false, false, true,
                                           box_DD, &length, segment_cost);
870

871 872 873 874
    box_DD->start += l ;
    box_DD->end += l ;

    float evalue_D = multiplier * (r-l) * germline->rep_4.totalSize() * segment_cost.toPValue(box_DD->score[0].first);
875

876
    if (evalue_D > evalue_threshold)
877
      return false;
Mikaël Salson's avatar
Mikaël Salson committed
878 879
    
    //overlap VD
880 881
    seg_N1 = check_and_resolve_overlap(seq, 0, box_DD->end,
                                       box_Y, box_DD, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
882 883
    
    //overlap DJ
884 885
    seg_N2 = check_and_resolve_overlap(seq, box_DD->start, seq.length(),
                                       box_DD, box_Z, segment_cost);
886 887 888 889 890 891 892 893 894

    return true;
}

void FineSegmenter::FineSegmentD(Germline *germline, double evalue_threshold, int multiplier){

  if (segmented){

    dSegmented = FineSegmentD(germline,
895
                              box_V, box_D, box_J,
896 897 898 899
                              evalue_threshold, multiplier);

    if (!dSegmented)
      return ;
900

901 902
    code = box_V->ref_label +
    " "+ string_of_int(box_V->del_right) +
Mikaël Salson's avatar
Mikaël Salson committed
903 904
    "/" + seg_N1 + 
    
905 906 907
    "/" + string_of_int(box_D->del_left) +
    " " + box_D->ref_label +
    " " + string_of_int(box_D->del_right) +
Mikaël Salson's avatar
Mikaël Salson committed
908 909
    
    "/" + seg_N2 +
910 911
    "/" + string_of_int(box_J->del_left) +
    " " + box_J->ref_label;
Mikaël Salson's avatar
Mikaël Salson committed
912
    
Mikaël Salson's avatar
Mikaël Salson committed
913 914 915
    finishSegmentationD();
  }
}
Mikaël Salson's avatar
Mikaël Salson committed
916

Marc Duez's avatar
Marc Duez committed
917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935
void FineSegmenter::findCDR3(){
    string str = getSequence().sequence;
    
    list<string> codon_start;
    codon_start.push_back("TGT");
    codon_start.push_back("TGC");
    
    list<string> codon_end;
    codon_end.push_back("TTT");
    codon_end.push_back("TTC");
    codon_end.push_back("TGG");
    
    list<int> p_start;
    list<int> p_end;

    size_t loc;
    std::list<string>::const_iterator it;
    for (it = codon_start.begin(); it != codon_start.end(); ++it) {//filter 1 : start codon must be in V
        loc = 0;
936
        while ( loc != string::npos && loc < (size_t)box_V->end){
Marc Duez's avatar
Marc Duez committed
937
            loc = str.find(*it, loc+3);
938
            if (loc != string::npos && loc < (size_t)box_V->end) {