Attention une mise à jour du service Gitlab va être effectuée le mardi 14 décembre entre 13h30 et 14h00. Cette mise à jour va générer une interruption du service dont nous ne maîtrisons pas complètement la durée mais qui ne devrait pas excéder quelques minutes.

segment.cpp 43.3 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
  This file is part of Vidjil <http://www.vidjil.org>
Mathieu Giraud's avatar
Mathieu Giraud committed
3
  Copyright (C) 2011-2020 by VidjilNet consortium and Bonsai bioinformatics
4 5 6 7 8
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24 25 26
#include <cassert>
#include "segment.h"
#include "tools.h"
27
#include "output.h"
28
#include "../lib/json.hpp"
Mikaël Salson's avatar
Mikaël Salson committed
29
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
30
#include <sstream>
31
#include <cstring>
Marc Duez's avatar
Marc Duez committed
32
#include <string>
33
#include "windowExtractor.h"
34 35 36 37
#include <set>
#include "automaton.h"
#include <map>
#include <string>
38
#define NO_FORBIDDEN_ID (-1)
39

40
AlignBox::AlignBox(string _key, string _color) {
41
  key = _key;
42
  color = _color;
43

44 45 46 47 48 49 50 51 52 53
  del_left = 0 ;
  start = 0 ;
  end = 0 ;
  del_right = 0 ;

  ref_nb = 0 ;
  ref = "";
  ref_label = "";
}

54 55 56 57 58 59 60 61 62 63
void AlignBox::reverse() {
  int start_ = start;
  start = seq_length - end - 1;
  end = seq_length - start_ - 1;

  int del_left_ = del_left;
  del_left = del_right;
  del_right = del_left_;
}

64 65 66 67
int AlignBox::getLength() {
  return end - start + 1 ;
}

68 69 70 71 72 73 74 75 76 77 78 79
char AlignBox::getInitial() {

  // TRGV -> V, IGHD -> D...
  if (ref_label.size() > 4)
    return ref_label[3] ;

  if (key.size())
    return key[0] ;

  return '?' ;
}

80 81 82 83
string AlignBox::getSequence(string sequence) {
  return sequence.substr(start, end-start+1);
}

84 85 86 87 88 89 90 91 92 93
bool AlignBox::CoverFirstPos()
{
  return (start <= 0);
}

bool AlignBox::CoverLastPos()
{
  return (end >= seq_length - 1);
}

94
void AlignBox::addToOutput(CloneOutput *clone, int alternative_genes) {
95

96 97 98
  json j;
  j["name"] = ref_label;

99
  if (key != "3" || !CoverLastPos()) // end information for J
100
    {
101
      j["stop"] = end + 1;
102
      j["delRight"] = del_right;
103
    }
104

105
  if (key != "5" || !CoverFirstPos()) // start information for V
106
    {
107
      j["start"] = start + 1;
108
      j["delLeft"] = del_left;
109
    }
110

111
  clone->setSeg(key, j) ;
112

113
  /*Export the N best genes if threshold parameter is specified*/
114
  if(rep && !this->score.empty() && rep->size() <= (int)this->score.size() && alternative_genes > 0){
115
    json jalt = json::array();
116 117 118
    int last_score = this->score[0].first;
    for(int i = 0; i < (int)this->score.size() &&
          (i < alternative_genes || last_score == this->score[i].first);++i){
Mikaël Salson's avatar
Mikaël Salson committed
119 120
      int r = this->score[i].second;
      jalt.push_back(json::object({{"name",rep->label(r)}}));
121
      last_score = this->score[i].first;
122
    }
123
    clone->setSeg(key + "alt", jalt);
124
  }
125
}
126

127
int AlignBox::posInRef(int i) const {
128 129 130 131 132 133 134 135 136 137 138
  // Works now only for V/J boxes

  if (del_left >= 0) // J
    return i - start + del_left + 1; // Why +1 ?

  if (del_right >= 0) // V
    return i + (ref.size() - del_right) - end ;

  return -99;
}

139
string AlignBox::refToString(int from, int to) const {
140 141 142

  stringstream s;

143
  s << left << setw(SHOW_NAME_WIDTH) << ref_label << " " ;
144 145 146

  int j = posInRef(from);

147
  s << right << setw(4) << j << " " ;
148 149 150 151 152 153 154 155 156

  if (from > start)
    s << color;

  for (int i=from; i<=to; i++) {

    if (i == start)
      s << color;

157
    if (j > 0 && (size_t)j <= ref.size())
158 159 160 161 162 163 164 165 166 167 168 169 170 171
      s << ref[j-1] ;
    else
      s << ".";

    if (i == end)
      s << NO_COLOR;

    // Related position. To improve
    j++ ;
  }

  if (to < end)
    s << NO_COLOR;

172
  s << right << setw(4) << j  ;
173 174 175 176

  return s.str();
}

177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
void show_colored_read(ostream &out, Sequence seq, const AlignBox *box_V, const AlignBox *box_J, int start_5, int end_3)
{
  out << left << setw(SHOW_NAME_WIDTH) << seq.label.substr(0,SHOW_NAME_WIDTH) << " "
      << right << setw(4) << start_5 << " " ;

  out << V_COLOR << seq.sequence.substr(start_5, box_V->end - start_5 + 1)
      << NO_COLOR
      << seq.sequence.substr(box_V->end+1, (box_J->start - 1) - (box_V->end + 1) +1)
      << J_COLOR
      << seq.sequence.substr(box_J->start, end_3 - box_J->start + 1)
      << NO_COLOR ;
  
  out << right << setw(4) << end_3 << endl ;
}

void show_colored_read_germlines(ostream &out, Sequence seq, const AlignBox *box_V, const AlignBox *box_J, int max_gene_align)
{
  int align_V_length = min(max_gene_align, box_V->end - box_V->start + 1);
  int align_J_length = min(max_gene_align, (int)seq.sequence.size() - box_J->start + 1);
  int start_V = box_V->end - align_V_length + 1;
  int end_J = box_J->start + align_J_length - 1;

  show_colored_read(out, seq, box_V, box_J, start_V, end_J);
  out << box_V->refToString(start_V, end_J) << "   " << *box_V << endl ;
  out << box_J->refToString(start_V, end_J) << "   " << *box_J << endl ;
}
203 204


205 206 207 208 209 210 211 212 213 214 215
ostream &operator<<(ostream &out, const AlignBox &box)
{
  out << "[/" << box.del_left << " " ;
  out << "@" << box.start << " " ;
  out << box.ref_label << "(" << box.ref_nb << ") " ;
  out << "@" << box.end << " " ;
  out << box.del_right << "/]" ;

  return out ;
}

216 217 218 219 220 221 222 223 224 225
string codeFromBoxes(vector <AlignBox*> boxes, string sequence)
{
  string code = "";

  int n = boxes.size();

  for (int i=0; i<n; i++) {

    if (i>0) {
      code += " " + string_of_int(boxes[i-1]->del_right) + "/"
Mathieu Giraud's avatar
Mathieu Giraud committed
226
        // From box_left->end + 1 to box_right->start - 1, both positions included
227 228 229 230 231 232 233 234 235
        + sequence.substr(boxes[i-1]->end + 1, boxes[i]->start - boxes[i-1]->end - 1)
        + "/" + string_of_int(boxes[i]->del_left) + " " ;
    }

    code += boxes[i]->ref_label ;
  }

  return code;
}
236

237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254
string posFromBoxes(vector <AlignBox*> boxes)
{
  string poss = "";
  string initials = "";

  int n = boxes.size();


  for (int i=0; i<n; i++) {
    initials += boxes[i]->getInitial() ;

    poss += " " + string_of_int(boxes[i]->start + FIRST_POS) ;
    poss += " " + string_of_int(boxes[i]->end + FIRST_POS) ;
  }

  return initials + "\t" + poss;
}

255

Mathieu Giraud's avatar
Mathieu Giraud committed
256
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
257

Mathieu Giraud's avatar
Mathieu Giraud committed
258
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
259 260
  Sequence s ;
  s.label_full = info ;
261 262 263 264 265 266
  if (segmented) {
    s.label = label + " " + (reversed ? "-" : "+");
    s.sequence = revcomp(sequence, reversed);
  } else {
    s.sequence = sequence;
  }
Mikaël Salson's avatar
Mikaël Salson committed
267 268 269 270

  return s ;
}

271
string Segmenter::getJunction(int l, int shift) {
Mikaël Salson's avatar
Mikaël Salson committed
272 273
  assert(isSegmented());

274
  junctionChanged = false;
275 276 277 278 279
  // '-w all'
  if (l == NO_LIMIT_VALUE)
    return getSequence().sequence;

  // Regular '-w'
280 281 282 283 284
  int central_pos = (getLeft() + getRight())/2 + shift;

  pair<int, int> length_shift = WindowExtractor::get_best_length_shifts(getSequence().sequence.size(),
                                                                        l, central_pos,
                                                                        DEFAULT_WINDOW_SHIFT);
Mathieu Giraud's avatar
Mathieu Giraud committed
285
  // Yield UNSEG_TOO_SHORT_FOR_WINDOW into windowExtractor
286 287
  if (length_shift.first < MINIMAL_WINDOW_LENGTH && length_shift.first < l) {
    info += " w" + string_of_int(length_shift.first) + "/" + string_of_int(length_shift.second);
Mikaël Salson's avatar
Mikaël Salson committed
288
    return "" ;
289
  }
Mikaël Salson's avatar
Mikaël Salson committed
290

291 292
  if (length_shift.first < l || length_shift.second != 0) {
    info += " w" + string_of_int(length_shift.first) + "/" + string_of_int(length_shift.second);
293
    junctionChanged = true;
294
  }
295

Mathieu Giraud's avatar
Mathieu Giraud committed
296
  // Window succesfully extracted
297
  return getSequence().sequence.substr(central_pos + length_shift.second - length_shift.first / 2, length_shift.first);
Mikaël Salson's avatar
Mikaël Salson committed
298 299 300
}

int Segmenter::getLeft() const {
301
  return box_V->end;
Mikaël Salson's avatar
Mikaël Salson committed
302 303 304
}
  
int Segmenter::getRight() const {
305
  return box_J->start;
Mikaël Salson's avatar
Mikaël Salson committed
306 307
}

308 309 310 311
int Segmenter::getMidLength() const {
  return box_J->start - box_V->end - 1;
}

Mikaël Salson's avatar
Mikaël Salson committed
312
int Segmenter::getLeftD() const {
313
  return box_D->start;
Mikaël Salson's avatar
Mikaël Salson committed
314 315 316
}
  
int Segmenter::getRightD() const {
317
  return box_D->end;
Mikaël Salson's avatar
Mikaël Salson committed
318 319 320 321 322 323 324 325 326 327 328 329 330 331
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

332 333
bool Segmenter::isJunctionChanged() const {
  return junctionChanged;
334
}
335 336 337 338 339 340 341 342 343 344
// E-values

void Segmenter::checkLeftRightEvaluesThreshold(double threshold, int strand)
{
  if (threshold == NO_LIMIT_VALUE)
    return ;

  if (evalue_left >= threshold && evalue_right >= threshold)
    because = UNSEG_TOO_FEW_ZERO ;
  else if ((strand == 1 ? evalue_left : evalue_right) >= threshold)
345
    because = UNSEG_ONLY_J ;
346
  else if ((strand == 1 ? evalue_right : evalue_left) >= threshold)
347
    because = UNSEG_ONLY_V ;
348 349 350 351 352
  else if (evalue >= threshold) // left and right are <= threshold, but their sum is > threshold
    because = UNSEG_TOO_FEW_ZERO ;
}


Mikaël Salson's avatar
Mikaël Salson committed
353 354 355 356 357 358 359 360
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

361
  if (box_V->end >= box_J->start)
Mikaël Salson's avatar
Mikaël Salson committed
362
    {
363 364 365 366
      int middle = (box_V->end + box_J->start) / 2 ;
      chevauchement = " !ov " + string_of_int (box_V->end - box_J->start + 1);
      box_V->end = middle ;
      box_J->start = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
367 368 369 370 371 372 373 374 375 376 377 378 379 380
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
381 382 383 384 385
  seg_V = seq.substr(0, box_V->end+1) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(box_J->start) ;
  box_D->start=0;
  box_D->end=0;
Mikaël Salson's avatar
Mikaël Salson committed
386

387
  info = (reversed ? "- " : "+ ") + info + "\t" + code ;
Mikaël Salson's avatar
Mikaël Salson committed
388 389 390 391 392 393 394 395

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

396 397 398 399
  seg_V = seq.substr(0, box_V->end+1) ; // From pos. 0 to box_V->end
  seg_J = seq.substr(box_J->start) ;
  seg_N = seq.substr(box_V->end+1, box_J->start-box_V->end-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_D  = seq.substr(box_D->start, box_D->end-box_D->start+1) ; // From Dstart to Dend
400 401

  info = (reversed ? "- " : "+ ") + info + "\t" + code ;
Mikaël Salson's avatar
Mikaël Salson committed
402 403 404 405

  return true ;
}

406 407
string Segmenter::getInfoLine() const
{
408
  string s = "" ;
409

410
  s += (segmented ? "" : "\t ! ") + info ;
411 412 413
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
414 415 416 417

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

418 419 420 421 422
  if (evalue_left > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue_left);
  if (evalue_right > NO_LIMIT_VALUE)
    s += "/" + scientific_string_of_double(evalue_right);

423
  if (CDR3start > 0)
424 425
    s += " {" + string_of_int(JUNCTIONstart) + "(" + string_of_int(JUNCTIONend-JUNCTIONstart+1) + ")" + string_of_int(JUNCTIONend) + " "
      + "up"[JUNCTIONproductive] + " " + JUNCTIONaa + "}";
426

427 428 429
  return s ;
}

430 431 432 433
string KmerSegmenter::getInfoLineWithAffects() const
{
   stringstream ss;

434
   ss << "= " << right << setw(10) << segmented_germline->code << " "
435 436
      << right << setw(3) << score << " "
      << left << setw(30)
437
      << getInfoLine();
438 439

   if (getSegmentationStatus() != UNSEG_TOO_SHORT)
440 441 442 443 444 445 446 447
   {
     ss << endl;
     ss << "# " << right << setw(10) << segmented_germline->code << " "
        << getKmerAffectAnalyser()->toStringValues();
     ss << endl;
     ss << "$ " << right << setw(10) << segmented_germline->code << " "
        << getKmerAffectAnalyser()->toStringSigns();
   }
448 449 450 451 452

   return ss.str();
}


Mikaël Salson's avatar
Mikaël Salson committed
453 454
ostream &operator<<(ostream &out, const Segmenter &s)
{
455
  out << ">" << s.label << " " ;
456
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
457 458 459 460 461 462 463 464 465

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
466
      out << s.getSequence().sequence << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
467 468 469 470 471 472 473 474
    }

  return out ;
}


// KmerSegmenter (Cheap)

475 476
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

477
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold, double multiplier)
Mikaël Salson's avatar
Mikaël Salson committed
478
{
479
  box_V = new AlignBox("5", V_COLOR);
480
  box_D = new AlignBox();
481
  box_J = new AlignBox("3", J_COLOR);
482

483 484 485
  CDR3start = -1;
  CDR3end = -1;

486 487 488
  JUNCTIONstart = -1;
  JUNCTIONend = -1;

Mikaël Salson's avatar
Mikaël Salson committed
489 490 491
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
492
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
493
  segmented = false;
494
  segmented_germline = germline ;
495
  reversed = false;
496
  because = NOT_PROCESSED ; // Cause of unsegmentation
497
  score = 0 ;
498
  evalue = NO_LIMIT_VALUE;
499 500
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
501

502
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
503
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
504

505
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
506
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
507 508
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
509
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
510 511
    }
 
512
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
513 514
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
515 516 517 518 519 520 521 522 523
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
524 525 526
    }
  }

527 528
  score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines

529 530
  reversed = (nb_strand[0] > nb_strand[1]) ;

531 532 533 534 535 536


  if (germline->seg_method == SEG_METHOD_ONE) {

    KmerAffectAnalyser kaa(*(germline->index), sequence);

537
    KmerAffect kmer = KmerAffect(germline->affect_4, 1, germline->seed_4.size());
538 539 540 541 542 543 544 545 546 547 548 549
    int c = kaa.count(kmer);

    // E-value
    double pvalue = kaa.getProbabilityAtLeastOrAbove(kmer, c);
    evalue = pvalue * multiplier ;

    if (evalue >= threshold)
      {
        because = UNSEG_TOO_FEW_ZERO ;
        return ;
      }

550 551
    int pos = kaa.minimize(kmer, DEFAULT_MINIMIZE_ONE_MARGIN, DEFAULT_MINIMIZE_WIDTH);

552
    if (pos == NO_MINIMIZING_POSITION)
553 554 555 556 557
      {
        because = UNSEG_TOO_SHORT_FOR_WINDOW;
        return ;
      }

558 559 560 561 562 563 564 565 566 567 568 569 570 571
    segmented = true ;
    because = reversed ? SEG_MINUS : SEG_PLUS ;

    info = "=" + string_of_int(c) + " @" + string_of_int(pos) ;

    // getJunction() will be centered on pos
    box_V->end = pos;
    box_J->start = pos;
    finishSegmentation();

    return ;
  }
  
  
572 573
  if ((germline->seg_method == SEG_METHOD_MAX12)
      || (germline->seg_method == SEG_METHOD_MAX1U))
574 575
    { // Pseudo-germline, MAX12 and MAX1U
      pair <KmerAffect, KmerAffect> max12 ;
576
      KmerAffectAnalyser ckaa = *kaa;
577

578 579 580 581 582

      set<KmerAffect> forbidden;
      forbidden.insert(KmerAffect::getAmbiguous());
      forbidden.insert(KmerAffect::getUnknown());

583
      if (germline->seg_method == SEG_METHOD_MAX12)
584 585 586
        // MAX12: two maximum k-mers (no unknown)
        {
          max12 = ckaa.max12(forbidden);
587

588 589 590 591 592 593
          if (max12.first.isUnknown() || max12.second.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
        }
594

595 596
      else
        // MAX1U: the maximum k-mers (no unknown) + unknown
597
        {
598 599 600 601 602 603 604 605 606
          CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
          KmerAffect max = ckaa.max(forbidden);

          if (max.isUnknown())
            {
              because = UNSEG_TOO_FEW_ZERO ;
              return ;
            }
          max12 = make_pair(max, KmerAffect::getUnknown());
607 608
        }

609 610 611 612 613
      pair <KmerAffect, KmerAffect> before_after =  ckaa.sortLeftRight(max12);

      before = before_after.first ;
      after = before_after.second ;

614 615 616
      // This strand computation is only a heuristic, especially for chimera +/- reads
      // Anyway, it allows to gather such reads and their reverse complement into a unique window...
      // ... except when the read is quite different outside the window
617
      strand = reversed ? -1 : 1 ;
618 619 620 621 622
    }

  else
    { // Regular germline

623
  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
624
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
625
    because = UNSEG_TOO_FEW_ZERO ;
626
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
627 628
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
629 630
    before = KmerAffect(germline->affect_3, -1, germline->seed_3.size());
    after = KmerAffect(germline->affect_5, -1, germline->seed_5.size());
Mikaël Salson's avatar
Mikaël Salson committed
631 632
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
633 634
    before = KmerAffect(germline->affect_5, 1, germline->seed_5.size());
    after = KmerAffect(germline->affect_3, 1, germline->seed_3.size());
Mikaël Salson's avatar
Mikaël Salson committed
635 636
  } else {
    // Ambiguous information: we have positive and negative strands
637 638 639 640 641
    // and there is not enough difference to put them apart.
    if (nb_strand[0] + nb_strand[1] >= DETECT_THRESHOLD_STRAND)
      because = UNSEG_STRAND_NOT_CONSISTENT ;
    else
      because = UNSEG_TOO_FEW_ZERO ;
642
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
643 644
  }

645
    } // endif Pseudo-germline
646
 
647
  computeSegmentation(strand, before, after, threshold, multiplier);
Mathieu Giraud's avatar
Mathieu Giraud committed
648
}
Mikaël Salson's avatar
Mikaël Salson committed
649

Mathieu Giraud's avatar
Mathieu Giraud committed
650
KmerSegmenter::~KmerSegmenter() {
651 652
  if (kaa)
    delete kaa;
653 654 655 656

  delete box_V;
  delete box_D;
  delete box_J;
657 658
}

659 660
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented,
                                       double threshold, int nb_reads_for_evalue)
661
{
662 663
  bool found_seg = false ; // Found a segmentation
  double best_evalue_seg = NO_LIMIT_VALUE ; // Best evalue, segmented sequences
664
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
665
  the_kseg = NULL;
666
  multi_germline = multigermline;
667
  threshold_nb_expected = threshold;
668 669

  // E-value multiplier
670
  double multiplier = multi_germline->germlines.size() * nb_reads_for_evalue;
671 672 673
#ifdef DEBUG_KMS_EVALUE
  cerr << "multiplier: germline_size=" << multi_germline->germlines.size() <<", nb_reads_for_evalue=" << nb_reads_for_evalue << endl;
#endif
674
  
675 676 677 678
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;
679
      double incomplete_multiplier = 1;
680

681 682 683 684 685
      if (germline->code.find("+") != string::npos) {
        incomplete_multiplier = 1e9;
      }

      KmerSegmenter *kseg = new KmerSegmenter(seq, germline, threshold, multiplier*incomplete_multiplier);
686
      bool keep_seg = false;
687

688 689 690
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
691
          *out_unsegmented << kseg->getInfoLineWithAffects() << endl ;
692 693
        }

694 695
      // Always remember the first kseg
      if (the_kseg == NULL)
696
        keep_seg = true;
697
      
698
      if (kseg->isSegmented())
699 700
        {
          // Yes, it is segmented
701
          // Should we keep the kseg ?
702
          if (!found_seg || (kseg->evalue < best_evalue_seg))
703
            {
704
              keep_seg = true;
705 706 707
              best_evalue_seg = kseg->evalue ;

              found_seg = true;
708
            }
709
        }
710 711 712 713 714 715 716
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
717
              if (!found_seg)
718 719 720 721
                keep_seg = true;
            }
        }
      
722
      if (keep_seg) {
723 724
        if (the_kseg)
          delete the_kseg;
725 726 727 728
        the_kseg = kseg;
      } else {
        delete kseg;
      }
729 730 731
    } // end for (Germlines)
}

732
KmerMultiSegmenter::~KmerMultiSegmenter() {
733 734
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
735 736
}

737
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after,
738
                                        double threshold, double multiplier) {
739
  // Try to segment, computing 'box_V->end' and 'box_J->start'
740 741
  // If not segmented, put the cause of unsegmentation in 'because'

742
  affect_infos max;
743
  max = kaa->getMaximum(before, after);
Mikaël Salson's avatar
Mikaël Salson committed
744

745
  // E-values
746 747 748
  pair <double, double> pvalues = kaa->getLeftRightProbabilityAtLeastOrAbove();
  evalue_left = pvalues.first * multiplier ;
  evalue_right = pvalues.second * multiplier ;
749
  evalue = evalue_left + evalue_right ;
750 751 752 753
#ifdef DEBUG_KMS_EVALUE
  cerr << "\tmultiplier=" << multiplier << ",\tevalues=[" << evalue_left << ", " << evalue_right << "],\t"
       << "evalue=" << evalue << endl;
#endif
754

755
  // This can lead to UNSEG_TOO_FEW_ZERO or UNSEG_ONLY_V/J
756
  checkLeftRightEvaluesThreshold(threshold, strand);
757

758
  if (because != NOT_PROCESSED)
759 760
    return ;

761 762 763 764 765 766 767
  if (!max.max_found) {
    // The 'ratioMin' checks at the end of kaa->getMaximum() failed
    because = UNSEG_AMBIGUOUS;
    return ;
  }
  

768 769
   // There was a good segmentation point

770 771
   box_V->end = max.first_pos_max;
   box_J->start = max.last_pos_max + 1;
772
   if (strand == -1) {
773 774 775
     int tmp = sequence.size() - box_V->end - 1;
     box_V->end = sequence.size() - box_J->start - 1;
     box_J->start = tmp;
776 777 778 779 780 781
   }

  // Yes, it is segmented
  segmented = true;
  because = reversed ? SEG_MINUS : SEG_PLUS ;

782 783 784 785 786 787 788
  // TODO: this should also use possFromBoxes()... but 'boxes' is not defined here
  info = "VJ \t"
   + string_of_int(FIRST_POS) + " "
   + string_of_int(box_V->end + FIRST_POS) + " "
   + string_of_int(box_J->start + FIRST_POS) + " "
   + string_of_int(sequence.size() - 1 + FIRST_POS) ;
  
789 790 791 792 793
  // removeChevauchement is called once info was already computed: it is only to output info_extra
  info_extra += removeChevauchement();
  finishSegmentation();

  return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
794
}
Mikaël Salson's avatar
Mikaël Salson committed
795

796
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
797 798
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
799

800
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
801
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
802 803
}

804 805 806 807 808
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
809 810
// FineSegmenter

811

812
string check_and_resolve_overlap(string seq, int seq_begin, int seq_end,
813
                                 AlignBox *box_left, AlignBox *box_right,
814
                                 Cost segment_cost, bool reverse_V, bool reverse_J)
Mikaël Salson's avatar
Mikaël Salson committed
815
{
816
  // Overlap size
817
  int overlap = box_left->end - box_right->start + 1;
818 819 820

  if (overlap > 0)
    {
821 822
      string seq_left = seq.substr(seq_begin, box_left->end - seq_begin + 1);
      string seq_right = seq.substr(box_right->start, seq_end - box_right->start + 1);
823

Mikaël Salson's avatar
Mikaël Salson committed
824 825 826 827
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
828
      DynProg dp_l = DynProg(seq_left, revcomp(box_left->ref, reverse_V),
Mikaël Salson's avatar
Mikaël Salson committed
829
			   DynProg::Local, segment_cost);
830
      score_l[0] = dp_l.compute();
831

Mikaël Salson's avatar
Mikaël Salson committed
832 833
      
      //RIGHT
834
      // reverse right sequence
835
      string ref_right=string(box_right->ref.rbegin(), box_right->ref.rend());
836
      ref_right = revcomp(ref_right, reverse_J);
837 838 839
      seq_right=string(seq_right.rbegin(), seq_right.rend());


840
      DynProg dp_r = DynProg(seq_right, ref_right,
Mikaël Salson's avatar
Mikaël Salson committed
841
			   DynProg::Local, segment_cost);
842
      score_r[0] = dp_r.compute();
843 844 845 846 847 848



      int trim_l[overlap+1];
      int trim_r[overlap+1];

849
      for(size_t i=0; i<=(size_t)overlap; i++) {
850 851
        score_l[i] = i < seq_left.size()  ? dp_l.best_score_on_i(seq_left.size()  - i, trim_l + i) : MINUS_INF ;
        score_r[i] = i < seq_right.size() ? dp_r.best_score_on_i(seq_right.size() - i, trim_r + i) : MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
852
      }
853

854 855 856

// #define DEBUG_OVERLAP
#ifdef DEBUG_OVERLAP
857 858 859 860 861 862
     cout << "=== check_and_resolve_overlap" << endl;
     cout << seq << endl;
     cout << "boxes: " << *box_left << "/" << *box_right << endl ; 

      // cout << dp_l ;
      // cout << dp_r ;
863

864
      cout << "seq:" << seq_left << "\t\t" << seq_right << endl;
865
      cout << "ref:" << box_left->ref << "\t\t" << ref_right << endl;
866 867 868 869 870 871 872 873
      for(int i=0; i<=overlap; i++)
        cout << i << "  left: " << score_l[i] << "/" << trim_l[i] << "     right: " << score_r[i] << "/" << trim_r[i] << endl;
#endif

      int score = MINUS_INF;
      int best_i = 0 ;
      int best_j = 0 ;

Mikaël Salson's avatar
Mikaël Salson committed
874

875
      // Find (i, j), with i+j >= overlap,
876
      // maximizing score_l[j] + score_r[i]
Mikaël Salson's avatar
Mikaël Salson committed
877 878
      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
879
          int score_ij = score_l[i] + score_r[j];
880

881 882 883
	  if (score_ij > score) {
            best_i = i ;
            best_j = j ;
884 885
            box_left->del_right = box_left->ref.size() - trim_l[i];
	    box_right->del_left = box_right->ref.size() - trim_r[j];
886
	    score = score_ij;
Mikaël Salson's avatar
Mikaël Salson committed
887 888 889
	  }
	}
      }
890

891 892
      box_left->end -= best_i ;
      box_right->start += best_j ;
893 894

#ifdef DEBUG_OVERLAP
895
      cout << "overlap: " << overlap << ", " << "best_overlap_split: " << score
896 897
           << "    left: " << best_i << "-" << box_left->del_right << " @" << box_left->end
           << "    right:" << best_j << "-" << box_right->del_left << " @" << box_right->start
898
           << endl;
899
      cout << "boxes: " << *box_left << " / " << *box_right << endl ;
900
#endif