Mise à jour terminée. Pour connaître les apports de la version 13.8.4 par rapport à notre ancienne version vous pouvez lire les "Release Notes" suivantes :
https://about.gitlab.com/releases/2021/02/11/security-release-gitlab-13-8-4-released/
https://about.gitlab.com/releases/2021/02/05/gitlab-13-8-3-released/

segment.cpp 30.6 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24 25 26 27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
29
#include <cstring>
Marc Duez's avatar
Marc Duez committed
30
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
31

Mathieu Giraud's avatar
Mathieu Giraud committed
32
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
33

Mathieu Giraud's avatar
Mathieu Giraud committed
34
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
55
  return Vend;
Mikaël Salson's avatar
Mikaël Salson committed
56 57 58
}
  
int Segmenter::getRight() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
59
  return Jstart;
Mikaël Salson's avatar
Mikaël Salson committed
60 61 62
}

int Segmenter::getLeftD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
63
  return Dstart;
Mikaël Salson's avatar
Mikaël Salson committed
64 65 66
}
  
int Segmenter::getRightD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
67
  return Dend;
Mikaël Salson's avatar
Mikaël Salson committed
68 69 70 71 72 73 74 75 76 77 78 79 80 81
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100

// E-values

void Segmenter::checkLeftRightEvaluesThreshold(double threshold, int strand)
{
  if (threshold == NO_LIMIT_VALUE)
    return ;

  if (evalue_left >= threshold && evalue_right >= threshold)
    because = UNSEG_TOO_FEW_ZERO ;
  else if ((strand == 1 ? evalue_left : evalue_right) >= threshold)
    because = UNSEG_TOO_FEW_V ;
  else if ((strand == 1 ? evalue_right : evalue_left) >= threshold)
    because = UNSEG_TOO_FEW_J ;
  else if (evalue >= threshold) // left and right are <= threshold, but their sum is > threshold
    because = UNSEG_TOO_FEW_ZERO ;
}


Mikaël Salson's avatar
Mikaël Salson committed
101 102 103 104 105 106 107 108
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
109
  if (Vend >= Jstart)
Mikaël Salson's avatar
Mikaël Salson committed
110
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
111
      int middle = (Vend + Jstart) / 2 ;
112
      chevauchement = " !ov " + string_of_int (Vend - Jstart + 1);
Mathieu Giraud's avatar
Mathieu Giraud committed
113 114
      Vend = middle ;
      Jstart = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
115 116 117 118 119 120 121 122 123 124 125 126 127 128
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
Mathieu Giraud's avatar
Mathieu Giraud committed
129 130 131 132 133
  seg_V = seq.substr(0, Vend+1) ;
  seg_N = seq.substr(Vend+1, Jstart-Vend-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(Jstart) ;
  Dstart=0;
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
134 135 136 137 138 139 140 141 142 143 144 145 146

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

Mathieu Giraud's avatar
Mathieu Giraud committed
147 148
  seg_V = seq.substr(0, Vend+1) ; // From pos. 0 to Vend
  seg_J = seq.substr(Jstart) ;
149
  seg_N = seq.substr(Vend+1, Jstart-Vend-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
Mathieu Giraud's avatar
Mathieu Giraud committed
150
  seg_D  = seq.substr(Dstart, Dend-Dstart+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
151
  
Mathieu Giraud's avatar
Mathieu Giraud committed
152 153 154 155
  info = "VDJ \t0 " + string_of_int(Vend) +
		" " + string_of_int(Dstart) + 
		" " + string_of_int(Dend) +
		" " + string_of_int(Jstart) +
Mikaël Salson's avatar
Mikaël Salson committed
156 157 158 159 160 161 162 163 164
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

165 166
string Segmenter::getInfoLine() const
{
167
  string s = "" ;
168 169 170 171 172

  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
173 174 175 176

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

177 178 179 180 181
  if (evalue_left > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue_left);
  if (evalue_right > NO_LIMIT_VALUE)
    s += "/" + scientific_string_of_double(evalue_right);

182 183 184
  return s ;
}

185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
string KmerSegmenter::getInfoLineWithAffects() const
{
   stringstream ss;

   ss << "# "
      << right << setw(3) << score << " "
      << left << setw(30)
      << getInfoLine() ;

   if (getSegmentationStatus() != UNSEG_TOO_SHORT)
     ss << getKmerAffectAnalyser()->toString();

   return ss.str();
}


Mikaël Salson's avatar
Mikaël Salson committed
201 202
ostream &operator<<(ostream &out, const Segmenter &s)
{
203
  out << ">" << s.label << " " ;
204
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
205 206 207 208 209 210 211 212 213

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
214
      out << s.getSequence().sequence << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
215 216 217 218 219 220 221 222
    }

  return out ;
}


// KmerSegmenter (Cheap)

223 224
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

225
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
226 227 228 229
{
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
230
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
231
  segmented = false;
232
  segmented_germline = germline ;
233
  system = germline->code; // useful ?
234
  reversed = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
235
  Dend=0;
236
  because = NOT_PROCESSED ; // Cause of unsegmentation
237
  score = 0 ;
238
  evalue = NO_LIMIT_VALUE;
239 240
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
241

242
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
243
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
244

245
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
246
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
247 248
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
249
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
250 251
    }
 
252
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
253 254
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
255 256 257 258 259 260 261 262 263
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
264 265 266
    }
  }

267 268
  score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines

269 270
  if ((germline->seg_method == SEG_METHOD_MAX12)
      || (germline->seg_method == SEG_METHOD_MAX1U))
271 272 273 274
    { // Pseudo-germline, MAX12 and MAX1U
      pair <KmerAffect, KmerAffect> max12 ;
      CountKmerAffectAnalyser ckaa(*(germline->index), sequence);

275 276 277 278 279

      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());

280
      if (germline->seg_method == SEG_METHOD_MAX12)
281 282 283
        // MAX12: two maximum k-mers (no unknown)
        {
          max12 = ckaa.max12(forbidden);
284

285 286 287 288 289 290
          if (max12.first.isUnknown() || max12.second.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
        }
291

292 293
      else
        // MAX1U: the maximum k-mers (no unknown) + unknown
294
        {
295 296 297 298 299 300 301 302 303
          CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
          KmerAffect max = ckaa.max(forbidden);

          if (max.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
          max12 = make_pair(max, KmerAffect::getUnknown());
304 305
        }

306 307 308 309 310
      pair <KmerAffect, KmerAffect> before_after =  ckaa.sortLeftRight(max12);

      before = before_after.first ;
      after = before_after.second ;

311 312 313
      // This strand computation is only a heuristic, especially for chimera +/- reads
      // Anyway, it allows to gather such reads and their reverse complement into a unique window...
      // ... except when the read is quite different outside the window
314 315 316 317 318 319
      strand = nb_strand[0] > nb_strand[1] ? -1 : 1 ;
    }

  else
    { // Regular germline

320
  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
321
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
322
    because = UNSEG_TOO_FEW_ZERO ;
323
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
324 325
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
326 327
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
328 329
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
330 331
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
332 333
  } else {
    // Ambiguous information: we have positive and negative strands
334 335 336 337 338
    // and there is not enough difference to put them apart.
    if (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD_STRAND)
      because = UNSEG_STRAND_NOT_CONSISTENT ;
    else
      because = UNSEG_TOO_FEW_ZERO ;
339
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
340 341
  }

342
    } // endif Pseudo-germline
343
 
344
  computeSegmentation(strand, before, after, threshold, multiplier);
Mathieu Giraud's avatar
Mathieu Giraud committed
345
}
Mikaël Salson's avatar
Mikaël Salson committed
346

Mathieu Giraud's avatar
Mathieu Giraud committed
347
KmerSegmenter::~KmerSegmenter() {
348 349
  if (kaa)
    delete kaa;
350 351
}

352 353
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
                                       double threshold, int nb_reads_for_evalue)
354
{
355 356
  bool found_seg = false ; // Found a segmentation
  double best_evalue_seg = NO_LIMIT_VALUE ; // Best evalue, segmented sequences
357
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
358
  the_kseg = NULL;
359
  multi_germline = multigermline;
360
  threshold_nb_expected = threshold;
361 362 363

  // E-value multiplier
  int multiplier = multi_germline->germlines.size() * nb_reads_for_evalue;
364
  
365 366 367 368 369
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

370
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, threshold, multiplier);
371
      bool keep_seg = false;
372

373 374 375
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
376
          *out_unsegmented << kseg->getInfoLineWithAffects() << endl ;
377 378
        }

379 380
      // Always remember the first kseg
      if (the_kseg == NULL)
381
        keep_seg = true;
382
      
383
      if (kseg->isSegmented())
384 385
        {
          // Yes, it is segmented
386
          // Should we keep the kseg ?
387
          if (!found_seg || (kseg->evalue < best_evalue_seg))
388
            {
389
              keep_seg = true;
390 391 392
              best_evalue_seg = kseg->evalue ;

              found_seg = true;
393
            }
394
        }
395 396 397 398 399 400 401
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
402
              if (!found_seg)
403 404 405 406
                keep_seg = true;
            }
        }
      
407
      if (keep_seg) {
408 409
        if (the_kseg)
          delete the_kseg;
410 411 412 413
        the_kseg = kseg;
      } else {
        delete kseg;
      }
414 415 416
    } // end for (Germlines)
}

417
KmerMultiSegmenter::~KmerMultiSegmenter() {
418 419
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
420 421
}

422 423
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after,
                                        double threshold, int multiplier) {
424
  // Try to segment, computing 'Vend' and 'Jstart'
425 426
  // If not segmented, put the cause of unsegmentation in 'because'

427
  affect_infos max;
428
  max = kaa->getMaximum(before, after);
Mikaël Salson's avatar
Mikaël Salson committed
429

430 431 432 433 434 435
  // We did not find a good segmentation point
  if (!max.max_found) {
    // We labeled it detected if there were both enough affect_5 and enough affect_3
    bool detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
      && (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);

436 437 438
    if (max.nb_before_left + max.nb_before_right + max.nb_after_left + max.nb_after_right == 0)
      because = UNSEG_TOO_FEW_ZERO ;
    else if ((strand == 1 && max.nb_before_left == 0) || (strand == -1 && max.nb_after_right == 0))
439 440 441 442 443 444 445 446 447 448
      because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_V ;
    else if ((strand == 1 && max.nb_after_right == 0)|| (strand == -1 && max.nb_before_left == 0))
      because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_J ;
    else
      because = UNSEG_AMBIGUOUS;

    return ;
  }


449
  // E-values
450 451 452
  pair <double, double> pvalues = kaa->getLeftRightProbabilityAtLeastOrAbove();
  evalue_left = pvalues.first * multiplier ;
  evalue_right = pvalues.second * multiplier ;
453
  evalue = evalue_left + evalue_right ;
454

455
  checkLeftRightEvaluesThreshold(threshold, strand);
456

457
  if (because != NOT_PROCESSED)
458 459
    return ;

460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481
   // There was a good segmentation point

   Vend = max.first_pos_max;
   Jstart = max.last_pos_max + 1;
   if (strand == -1) {
     int tmp = sequence.size() - Vend - 1;
     Vend = sequence.size() - Jstart - 1;
     Jstart = tmp;
   }

  // Yes, it is segmented
  segmented = true;
  reversed = (strand == -1);
  because = reversed ? SEG_MINUS : SEG_PLUS ;

  info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS)  ;

  // removeChevauchement is called once info was already computed: it is only to output info_extra
  info_extra += removeChevauchement();
  finishSegmentation();

  return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
482
}
Mikaël Salson's avatar
Mikaël Salson committed
483

484
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
485 486
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
487

488
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
489
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
490 491
}

492 493 494 495 496
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
497 498
// FineSegmenter

499 500 501

void best_overlap_split(int overlap, string seq_left, string seq_right,
                        string ref_left, string ref_right,
502 503
                        int *pos_seq_left, int *pos_seq_right,
                        int *trim_ref_left, int *trim_ref_right, Cost segment_cost)
Mikaël Salson's avatar
Mikaël Salson committed
504 505 506 507 508
{
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
509
      DynProg dp_l = DynProg(seq_left, ref_left,
Mikaël Salson's avatar
Mikaël Salson committed
510
			   DynProg::Local, segment_cost);
511
      score_l[0] = dp_l.compute();
512

Mikaël Salson's avatar
Mikaël Salson committed
513 514
      
      //RIGHT
515 516 517 518 519
      // reverse right sequence
      ref_right=string(ref_right.rbegin(), ref_right.rend());
      seq_right=string(seq_right.rbegin(), seq_right.rend());


520
      DynProg dp_r = DynProg(seq_right, ref_right,
Mikaël Salson's avatar
Mikaël Salson committed
521
			   DynProg::Local, segment_cost);
522
      score_r[0] = dp_r.compute();
523 524 525 526 527 528 529 530 531



      int trim_l[overlap+1];
      int trim_r[overlap+1];

      for(int i=0; i<=overlap; i++) {
        score_l[i] = i < seq_left.size()  ? dp_l.best_score_on_i(seq_left.size()  - i, trim_l + i) : MINUS_INF ;
        score_r[i] = i < seq_right.size() ? dp_r.best_score_on_i(seq_right.size() - i, trim_r + i) : MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
532
      }
533

534 535 536 537 538 539

// #define DEBUG_OVERLAP
#ifdef DEBUG_OVERLAP
      cout << dp_l ;
      cout << dp_r ;

540 541
      cout << "seq:" << seq_left << "\t\t" << seq_right << endl;
      cout << "ref:" << ref_left << "\t\t" << ref_right << endl;
542 543 544 545 546 547 548 549
      for(int i=0; i<=overlap; i++)
        cout << i << "  left: " << score_l[i] << "/" << trim_l[i] << "     right: " << score_r[i] << "/" << trim_r[i] << endl;
#endif

      int score = MINUS_INF;
      int best_i = 0 ;
      int best_j = 0 ;

Mikaël Salson's avatar
Mikaël Salson committed
550

551
      // Find (i, j), with i+j >= overlap,
552
      // maximizing score_l[j] + score_r[i]
Mikaël Salson's avatar
Mikaël Salson committed
553 554
      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
555
          int score_ij = score_l[i] + score_r[j];
556

557 558 559 560 561
	  if (score_ij > score) {
            best_i = i ;
            best_j = j ;
            *trim_ref_left  = ref_left.size() - trim_l[i];
	    *trim_ref_right = ref_right.size() - trim_r[j];
562
	    score = score_ij;
Mikaël Salson's avatar
Mikaël Salson committed
563 564 565
	  }
	}
      }
566 567 568 569 570

      *pos_seq_left -= best_i ;
      *pos_seq_right += best_j ;

#ifdef DEBUG_OVERLAP
571 572 573 574
      cout << "overlap: " << overlap << ", " << "best_overlap_split: " << score
           << "    left: " << best_i << "-" << *trim_ref_left << " @" << *pos_seq_right
           << "    right:" << best_j << "-" << *trim_ref_right << " @" << *pos_seq_right
           << endl;
575
#endif
Mikaël Salson's avatar
Mikaël Salson committed
576 577
}

Mikaël Salson's avatar
Mikaël Salson committed
578 579 580 581 582
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599

/**
 * Align a read against a collection of sequences, maximizing the alignment 'score'
 * @param read:         the read
 * @param rep:          a collection of reference sequences
 * @param reverse_both: if true, reverse both the read and the reference sequences (J segment)
 * @param local:        if true, Local alignment (D segment), otherwise LocalEndWithSomeDeletions (V and J segments)
 * @param segment_cost: the cost used by the dynamic programing
 * @return score:       the maximized score, together with the following values:
 * @return tag:           - the name of the sequence
 * @return del:           - the number of deletions at the end of the reference sequence (or at the start if 'reverse_both')
 * @return del2:          - the number of deletions at the start of the reference sequence (used with 'local')
 * @return begin:         - the start position of the aligned segment in the read
 * @return length:        - the length of the alignmed segment in the read
 * @return best_best_i: ?
 */

Mathieu Giraud's avatar
Mathieu Giraud committed
600
int align_against_collection(string &read, Fasta &rep, bool reverse_both, bool local, int *tag, 
Mikaël Salson's avatar
Mikaël Salson committed
601
			     int *del, int *del2, int *begin, int *length, vector<pair<int, int> > *score
Mikaël Salson's avatar
Mikaël Salson committed
602 603 604 605 606 607 608 609 610 611
			    , Cost segment_cost)
{
  
  int best_score = MINUS_INF ;
  int best_r = MINUS_INF ;
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
  string best_label = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
612
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
613 614 615 616 617 618 619 620 621 622 623

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
      DynProg dp = DynProg(read, rep.sequence(r),
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
      int score = dp.compute();
Mikaël Salson's avatar
Mikaël Salson committed
624
      
Mikaël Salson's avatar
Mikaël Salson committed
625 626 627 628 629 630 631 632 633 634 635 636 637 638
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
	  best_r = r ;
	  best_label = rep.label(r) ;
	}
Mikaël Salson's avatar
Mikaël Salson committed
639 640
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
641 642 643 644 645 646

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
647 648

    }
Mikaël Salson's avatar
Mikaël Salson committed
649
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
650 651 652 653

  *del = reverse_both ? best_best_j : rep.sequence(best_r).size() - best_best_j - 1;
  *del2 = best_first_j;
  *begin = best_first_i;
Mathieu Giraud's avatar
Mathieu Giraud committed
654
  *tag = best_r ; 
Mikaël Salson's avatar
Mikaël Salson committed
655 656
  
  *length -= *del ;
Mikaël Salson's avatar
Mikaël Salson committed
657 658
  
  *score=score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
659 660 661 662 663 664

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
  cout << "del/del2/begin:" << (*del) << "/" << (*del2) << "/" << (*begin) << endl;
  cout << endl;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
665 666 667 668 669 670 671 672 673
  
  return best_best_i ;
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

674
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,  double threshold, int multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
675
{
676 677
  segmented = false;
  dSegmented = false;
678
  because = NOT_PROCESSED ;
679
  segmented_germline = germline ;
680
  info_extra = "" ;
681
  code_short = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
682 683
  label = seq.label ;
  sequence = seq.sequence ;
Mathieu Giraud's avatar
Mathieu Giraud committed
684
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
685
  segment_cost=segment_c;
686
  evalue = NO_LIMIT_VALUE;
687 688
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
689

690 691 692
  CDR3start = -1;
  CDR3end = -1;
  
693
  if (!germline->rep_5.size() || !germline->rep_3.size())
694
    {
695
      // We check whether this sequence is segmented with MAX12 or MAX1U (with default e-value parameters)
696
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
697 698
      if (kseg->isSegmented() && ((germline->seg_method == SEG_METHOD_MAX12)
                                  || (germline->seg_method == SEG_METHOD_MAX1U)))
699
        {
700 701 702 703 704
          reversed = kseg->isReverse();

          KmerAffect left = reversed ? KmerAffect(kseg->after, true) : kseg->before ;
          KmerAffect right = reversed ? KmerAffect(kseg->before, true) : kseg->after ;

705
          code_short = "Unexpected ";
706 707

          code_short += left.toStringSigns() + germline->index->getLabel(left);
708
          code_short += "/";
709 710
          code_short += right.toStringSigns() + germline->index->getLabel(right);
          info_extra += " " + left.toString() + "/" + right.toString() + " (" + code_short + ")";
711 712 713
        }
      return ;
    }
714

Mikaël Salson's avatar
Mikaël Salson committed
715 716 717 718 719
  // TODO: factoriser tout cela, peut-etre en lancant deux segmenteurs, un +, un -, puis un qui chapote
  
  // Strand +
  
  int plus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
720
  int tag_plus_V, tag_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
721 722 723 724
  int plus_length = 0 ;
  int del_plus_V, del_plus_J ;
  int del2=0;
  int beg=0;
Mikaël Salson's avatar
Mikaël Salson committed
725 726 727 728
  
  vector<pair<int, int> > score_plus_V;
  vector<pair<int, int> > score_plus_J;
  
729
  int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg, 
Mikaël Salson's avatar
Mikaël Salson committed
730 731
					   &plus_length, &score_plus_V
					   , segment_cost);
732
  int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
733 734
					    &plus_length, &score_plus_J
					    , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
735 736
  plus_length += plus_right - plus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
737 738
  plus_score=score_plus_V[0].first + score_plus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
739 740 741
  // Strand -
  string rc = revcomp(sequence) ;
  int minus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
742
  int tag_minus_V, tag_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
743 744
  int minus_length = 0 ;
  int del_minus_V, del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
745 746 747 748
  
  vector<pair<int, int> > score_minus_V;
  vector<pair<int, int> > score_minus_J;
  
749
  int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
750 751
					    &minus_length, &score_minus_V
					    ,  segment_cost);
752
  int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
753 754
					     &minus_length, &score_minus_J
					     , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
755 756
  minus_length += minus_right - minus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
757 758
  minus_score=score_minus_V[0].first + score_minus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
759 760 761 762
  reversed = (minus_score > plus_score) ;

  if (!reversed)
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
763 764 765 766
      Vend = plus_left ;
      Jstart = plus_right ;
      best_V = tag_plus_V ;
      best_J = tag_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
767 768
      del_V = del_plus_V ;
      del_J = del_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
769 770
      score_V=score_plus_V;
      score_J=score_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
771 772 773
    }
  else
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
774 775 776 777
      Vend = minus_left ;
      Jstart = minus_right ;
      best_V = tag_minus_V ;
      best_J = tag_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
778 779
      del_V = del_minus_V ;
      del_J = del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
780 781
      score_V=score_minus_V;
      score_J=score_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
782 783
    }

784
  /* E-values */
Mathieu Giraud's avatar
Mathieu Giraud committed
785 786
  evalue_left  = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(score_V[0].first);
  evalue_right = multiplier * sequence.size() * germline->rep_3.totalSize() * segment_cost.toPValue(score_J[0].first);
787
  evalue = evalue_left + evalue_right ;
788 789 790 791

  /* Unsegmentation causes */
  if (Vend == (int) string::npos)
    {
792
      evalue_left = BAD_EVALUE ;
793
    }
794
      
795 796
  if (Jstart == (int) string::npos)
    {
797
      evalue_right = BAD_EVALUE ;
798 799
    }

800 801
  checkLeftRightEvaluesThreshold(threshold, reversed ? -1 : 1);

802 803 804 805
  if (because != NOT_PROCESSED)
    {
      segmented = false;
      info = " @" + string_of_int (Vend + FIRST_POS) + "  @" + string_of_int(Jstart + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
806 807
      return ;
    }
808 809 810 811

  /* The sequence is segmented */
  segmented = true ;
  because = reversed ? SEG_MINUS : SEG_PLUS ;
812 813 814

  string sequence_or_rc = getSequence().sequence; // segmented sequence, possibly rev-comped

Mikaël Salson's avatar
Mikaël Salson committed
815
    //overlap VJ
Mathieu Giraud's avatar
Mathieu Giraud committed
816 817
    if(Jstart-Vend <=0){
      int overlap=Vend-Jstart+1;
818 819 820

      string seq_left = sequence_or_rc.substr(0, Vend+1);
      string seq_right = sequence_or_rc.substr(Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
821

822 823
      best_overlap_split(overlap, seq_left, seq_right,
                         germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J),
824 825
                         &Vend, &Jstart, &del_V, &del_J, segment_cost);

826
      if (Jstart>=(int) sequence.length())
Mathieu Giraud's avatar
Mathieu Giraud committed
827
	  Jstart=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
828 829 830
    }

    // string chevauchement = removeChevauchement();
Mikaël Salson's avatar
Mikaël Salson committed
831 832

    /// used only below, then recomputed in finishSegmentation() ;
833
    seg_N = sequence_or_rc.substr(Vend+1, Jstart-Vend-1);
Mikaël Salson's avatar
Mikaël Salson committed
834

835
  code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
836 837 838 839
    " "+ string_of_int(del_V) + 
    "/" + seg_N + 
    // chevauchement +
    "/" + string_of_int(del_J) +
840
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
841

Mikaël Salson's avatar
Mikaël Salson committed
842
    stringstream code_s;
843
   code_s<< germline->rep_5.label(best_V) <<
Mikaël Salson's avatar
Mikaël Salson committed
844 845 846 847
    " -" << string_of_int(del_V) << "/" 
    << seg_N.size()
    // chevauchement +
    << "/-" << string_of_int(del_J)
848
    <<" " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
849 850
    code_short=code_s.str();
    
851 852
  code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
853 854

 
Mathieu Giraud's avatar
Mathieu Giraud committed
855
  info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
856 857 858 859
  finishSegmentation();
}


860
void FineSegmenter::FineSegmentD(Germline *germline, double evalue_threshold, int multiplier){
Mikaël Salson's avatar
Mikaël Salson committed
861 862 863 864
  
  if (segmented){
    
    int end = (int) string::npos ;
Mathieu Giraud's avatar
Mathieu Giraud committed
865
    int tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
866 867 868 869
    int length = 0 ;
    int begin = 0;
    
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
Mathieu Giraud's avatar
Mathieu Giraud committed
870
    int l = Vend - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
871 872 873
    if (l<0) 
      l=0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
874
    int r = Jstart + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
875

876 877 878 879
    string seq = getSequence().sequence; // segmented sequence, possibly rev-comped

    if (r > (int) seq.length())
      r = seq.length();
Mikaël Salson's avatar
Mikaël Salson committed
880
      
881
    string str = seq.substr(l, r-l);
Mikaël Salson's avatar
Mikaël Salson committed
882 883

    // Align
884
    end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
Mikaël Salson's avatar
Mikaël Salson committed
885 886
				&length, &score_D, segment_cost);
    
Mathieu Giraud's avatar
Mathieu Giraud committed
887
    best_D = tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
888
    
Mathieu Giraud's avatar
Mathieu Giraud committed
889 890
    Dstart = l + begin;
    Dend = l + end;
Mikaël Salson's avatar
Mikaël Salson committed
891
	
892
    float evalue_D = multiplier * (r-l) * germline->rep_4.totalSize() * segment_cost.toPValue(score_D[0].first);
893 894 895



896 897
    if (evalue_D > evalue_threshold)
      return ;
898 899

    dSegmented=true;
Mikaël Salson's avatar
Mikaël Salson committed
900 901
    
    //overlap VD
Mathieu Giraud's avatar
Mathieu Giraud committed
902 903 904 905
    if(Dstart-Vend <=0){
      int overlap=Vend-Dstart+1;
      string seq_left = seq.substr(0, Vend+1);
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
Mikaël Salson's avatar
Mikaël Salson committed
906

907 908
      best_overlap_split(overlap, seq_left, seq_right,
                         germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D),
909
                         &Vend, &Dstart, &del_V, &del_D_left, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
910
    }
911

Mathieu Giraud's avatar
Mathieu Giraud committed
912
    seg_N1 = seq.substr(Vend+1, Dstart-Vend-1) ; // From Vend+1 to Dstart-1
Mikaël Salson's avatar
Mikaël Salson committed
913 914
    
    //overlap DJ
Mathieu Giraud's avatar
Mathieu Giraud committed
915 916
    if(Jstart-Dend <=0){
      int overlap=Dend-Jstart+1;
917 918
      string seq_left = seq.substr(Dstart, Dend-Dstart+1);
      string seq_right = seq.substr(Jstart, seq.length()-Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
919

920 921
      best_overlap_split(overlap, seq_left, seq_right,
                         germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J),
922
                         &Dend, &Jstart, &del_D_right, &del_J, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
923
    }
924

Mathieu Giraud's avatar
Mathieu Giraud committed
925
    seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
926
    code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
927 928 929 930
    " "+ string_of_int(del_V) + 
    "/" + seg_N1 + 
    
    "/" + string_of_int(del_D_left) +
931
    " " + germline->rep_4.label(best_D) +
Mikaël Salson's avatar
Mikaël Salson committed
932 933 934 935
    " " + string_of_int(del_D_right) +
    
    "/" + seg_N2 +
    "/" + string_of_int(del_J) +
936
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
937

Mikaël Salson's avatar
Mikaël Salson committed
938
    stringstream code_s;
939
    code_s << germline->rep_5.label(best_V) 
Mikaël Salson's avatar
Mikaël Salson committed
940 941 942 943
    << " -" << string_of_int(del_V) << "/" 
    << seg_N1.size()
    
    << "/-" << string_of_int(del_D_left) 
944
    << " " << germline->rep_4.label(best_D) 
Mikaël Salson's avatar
Mikaël Salson committed
945 946 947 948
    << " -" << string_of_int(del_D_right) << "/"
    
    << seg_N2.size()
    << "/-" << string_of_int(del_J) 
949
    << " " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
950 951 952
    code_short=code_s.str();
    
    
953 954 955
    code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_4.label(best_D) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
956 957 958 959
    
    finishSegmentationD();
  }
}