segment.cpp 25.9 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24 25 26 27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
Marc Duez's avatar
Marc Duez committed
29
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
30

Mathieu Giraud's avatar
Mathieu Giraud committed
31
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
32

Mathieu Giraud's avatar
Mathieu Giraud committed
33
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
54
  return Vend;
Mikaël Salson's avatar
Mikaël Salson committed
55 56 57
}
  
int Segmenter::getRight() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
58
  return Jstart;
Mikaël Salson's avatar
Mikaël Salson committed
59 60 61
}

int Segmenter::getLeftD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
62
  return Dstart;
Mikaël Salson's avatar
Mikaël Salson committed
63 64 65
}
  
int Segmenter::getRightD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
66
  return Dend;
Mikaël Salson's avatar
Mikaël Salson committed
67 68 69 70 71 72 73 74 75 76 77 78 79 80
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

81 82 83 84
bool KmerSegmenter::isDetected() const {
  return detected;
}

Mikaël Salson's avatar
Mikaël Salson committed
85 86 87 88 89 90 91 92
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
93
  if (Vend >= Jstart)
Mikaël Salson's avatar
Mikaël Salson committed
94
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
95
      int middle = (Vend + Jstart) / 2 ;
96
      chevauchement = " !ov " + string_of_int (Vend - Jstart + 1);
Mathieu Giraud's avatar
Mathieu Giraud committed
97 98
      Vend = middle ;
      Jstart = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
99 100 101 102 103 104 105 106 107 108 109 110 111 112
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
Mathieu Giraud's avatar
Mathieu Giraud committed
113 114 115 116 117
  seg_V = seq.substr(0, Vend+1) ;
  seg_N = seq.substr(Vend+1, Jstart-Vend-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(Jstart) ;
  Dstart=0;
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
118 119 120 121 122 123 124 125 126 127 128 129 130

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

Mathieu Giraud's avatar
Mathieu Giraud committed
131 132
  seg_V = seq.substr(0, Vend+1) ; // From pos. 0 to Vend
  seg_J = seq.substr(Jstart) ;
Mikaël Salson's avatar
Mikaël Salson committed
133
  
Mathieu Giraud's avatar
Mathieu Giraud committed
134
  seg_D  = seq.substr(Dstart, Dend-Dstart+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
135
  
Mathieu Giraud's avatar
Mathieu Giraud committed
136 137 138 139
  info = "VDJ \t0 " + string_of_int(Vend) +
		" " + string_of_int(Dstart) + 
		" " + string_of_int(Dend) +
		" " + string_of_int(Jstart) +
Mikaël Salson's avatar
Mikaël Salson committed
140 141 142 143 144 145 146 147 148
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

149 150 151 152 153 154 155 156 157
string Segmenter::getInfoLine() const
{
  string s = ">" ;

  s += label + " " ;
  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
158 159 160 161

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

162 163 164 165 166
  if (evalue_left > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue_left);
  if (evalue_right > NO_LIMIT_VALUE)
    s += "/" + scientific_string_of_double(evalue_right);

167 168 169
  return s ;
}

Mikaël Salson's avatar
Mikaël Salson committed
170 171
ostream &operator<<(ostream &out, const Segmenter &s)
{
172
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
      out << s.sequence << endl ;
    }

  return out ;
}


// KmerSegmenter (Cheap)

191 192 193
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
Mikaël Salson's avatar
Mikaël Salson committed
194 195 196 197
{
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
198
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
199
  segmented = false;
200
  segmented_germline = germline ;
201
  reversed = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
202
  Dend=0;
203 204
  because = 0 ; // Cause of unsegmentation
  score = 0 ;
205
  evalue = NO_LIMIT_VALUE;
206

207
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
208
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
209

210
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
211
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
212 213
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
214
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
215 216
    }
 
217
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
218 219
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
220 221 222 223 224 225 226 227 228
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
229 230 231
    }
  }

232 233 234
  KmerAffect before, after;

  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
235
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
236
    because = UNSEG_TOO_FEW_ZERO ;
237
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
238 239
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
240 241
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
242 243
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
244 245
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
246 247 248
  } else {
    // Ambiguous information: we have positive and negative strands
    // and there is not enough difference to put them aparat.
249 250
    because = UNSEG_STRAND_NOT_CONSISTENT ;
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
251 252
  }

253
  detected = false ;
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
  computeSegmentation(strand, before, after);

  if (! because)
    {
      // Now we check the delta between Vend and right
   
      if (Jstart - Vend < germline->delta_min)
	{
	  because = UNSEG_BAD_DELTA_MIN ;
	}

      if (Jstart - Vend > germline->delta_max)
	{
	  because = UNSEG_BAD_DELTA_MAX ;
	}
    } 
Mathieu Giraud's avatar
Mathieu Giraud committed
270

271
  if (!because)
Mathieu Giraud's avatar
Mathieu Giraud committed
272 273
    {
      // Yes, it is segmented
274
      segmented = true;
Mathieu Giraud's avatar
Mathieu Giraud committed
275 276 277 278
      reversed = (strand == -1); 
      because = reversed ? SEG_MINUS : SEG_PLUS ;

      info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS)  ;
279 280
      // removeChevauchement is called once info was already computed: it is only to output info_extra
      info_extra += removeChevauchement();
Mathieu Giraud's avatar
Mathieu Giraud committed
281
      finishSegmentation();
282
      system = germline->code;
283
      return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
284
    } 
285

286
 
Mathieu Giraud's avatar
Mathieu Giraud committed
287
}
Mikaël Salson's avatar
Mikaël Salson committed
288

Mathieu Giraud's avatar
Mathieu Giraud committed
289
KmerSegmenter::~KmerSegmenter() {
290 291
  if (kaa)
    delete kaa;
292 293
}

294
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented, double threshold)
295
{
296
  int best_score_seg = 0 ; // Best score, segmented sequences
297
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
298
  the_kseg = NULL;
299
  multi_germline = multigermline;
300
  threshold_nb_expected = threshold;
301
  
302 303 304 305 306
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

307 308
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline);
      bool keep_seg = false;
309

310 311 312 313
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
          *out_unsegmented << "#"
314 315
                           << left << setw(4) << kseg->segmented_germline->code << " "
                           << left << setw(20) << segmented_mesg[kseg->getSegmentationStatus()] << " ";
316

317
          *out_unsegmented << right << setw(3) << kseg->score << " ";
318
          
319 320
          if (kseg->getSegmentationStatus() != UNSEG_TOO_SHORT) 
            *out_unsegmented << kseg->getKmerAffectAnalyser()->toString();
321 322 323 324

          *out_unsegmented << endl ;
        }

325 326
      // Always remember the first kseg
      if (the_kseg == NULL)
327
        keep_seg = true;
328
      
329
      if (kseg->isSegmented())
330 331
        {
          // Yes, it is segmented
332
          // Should we keep the kseg ?
333
          if (kseg->score > best_score_seg)
334
            {
335
              keep_seg = true;
336
              best_score_seg = kseg->score ;
337
            }
338
        }
339 340 341 342 343 344 345 346 347 348 349 350
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
              if (!best_score_seg)
                keep_seg = true;
            }
        }
      
351
      if (keep_seg) {
352 353
        if (the_kseg)
          delete the_kseg;
354 355 356 357
        the_kseg = kseg;
      } else {
        delete kseg;
      }
358
    } // end for (Germlines)
359

360
  // E-value threshold
361
  if (threshold_nb_expected > NO_LIMIT_VALUE)
362 363 364 365 366 367
    if (the_kseg->isSegmented()) {
        // the_kseg->evalue also depends on the number of germlines from the *Multi*KmerSegmenter
        the_kseg->evalue = getNbExpected();
        if (the_kseg->evalue > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY);
        }
368 369 370 371 372 373 374 375 376 377 378

        pair <double, double> p = the_kseg->getKmerAffectAnalyser()->getLeftRightProbabilityAtLeastOrAbove();
        the_kseg->evalue_left = p.first;
        the_kseg->evalue_right = p.second;

        if (the_kseg->evalue_left > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY); // TOO_FEW_V ?
        }
        if (the_kseg->evalue_right > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY); // TOO_FEW_J ?
        }
379
      }
380 381
}

382
double KmerMultiSegmenter::getNbExpected() const {
383
  double proba = the_kseg->getKmerAffectAnalyser()->getProbabilityAtLeastOrAbove(the_kseg->score);
384
  return multi_germline->germlines.size() * proba;
385 386 387
}

KmerMultiSegmenter::~KmerMultiSegmenter() {
388 389
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
390 391
}

392 393
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after) {
  // Try to segment, computing 'Vend' and 'Jstart'
394 395
  // If not segmented, put the cause of unsegmentation in 'because'

396
  affect_infos max;
Mikaël Salson's avatar
Mikaël Salson committed
397

398
  max = kaa->getMaximum(before, after); 
399 400 401 402 403

      // We labeled it detected if there were both enough affect_5 and enough affect_3
      detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
        && (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);
      
404
      if (! max.max_found) {
405 406
        if ((strand == 1 && max.nb_before_left == 0)
            || (strand == -1 && max.nb_after_right == 0)) 
407
          because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_V ;
408 409
        else if ((strand == 1 && max.nb_after_right == 0)
                 || (strand == -1 && max.nb_before_left == 0))
Mikaël Salson's avatar
Mikaël Salson committed
410
	{
411
	  because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_J ;
412
	} else 
413 414 415 416 417 418 419 420 421 422
          because = UNSEG_AMBIGUOUS; 
      } else {
        Vend = max.first_pos_max;
        Jstart = max.last_pos_max + 1;
        if (strand == -1) {
          int tmp = sequence.size() - Vend - 1;
          Vend = sequence.size() - Jstart - 1;
          Jstart = tmp;
        }
      }
Mikaël Salson's avatar
Mikaël Salson committed
423
  
424
    score = max.nb_before_left + max.nb_before_right + max.nb_after_left + max.nb_after_right;  
Mathieu Giraud's avatar
Mathieu Giraud committed
425
}
Mikaël Salson's avatar
Mikaël Salson committed
426

427
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
428 429
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
430

431
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
432
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
433 434
}

435 436 437 438 439
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464
// FineSegmenter

void best_align(int overlap, string seq_left, string seq_right,
		string ref_left ,string ref_right, int *b_r, int *b_l, Cost segment_cost)
{
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
      ref_left=string(ref_left.rbegin(), ref_left.rend());
      seq_left=string(seq_left.rbegin(), seq_left.rend()); 
      
      DynProg dp1 = DynProg(seq_left, ref_left,
			   DynProg::Local, segment_cost);
      score_l[0] = dp1.compute();
      dp1.backtrack();
      
      //RIGHT
      DynProg dp = DynProg(seq_right, ref_right,
			   DynProg::Local, segment_cost);
      score_r[0] = dp.compute();
      dp.backtrack();    
      
      for(int i=1; i<=overlap; i++){
	if(dp.best_i-i >0)
465
	score_r[i] = dp.B[dp.best_i-i][dp.best_j].score;
Mikaël Salson's avatar
Mikaël Salson committed
466 467 468
	else
	score_r[i] =0;
	if(dp1.best_i-i >0)
469
	score_l[i] = dp1.B[dp1.best_i-i][dp1.best_j].score;	
Mikaël Salson's avatar
Mikaël Salson committed
470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
	else
	score_l[i] =0;
      }
      int score=-1000000;
      int best_r=0;
      int best_l=0;

      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
	  if ( ((score_r[i]+score_l[j]) == score) && (i+j < best_r+best_l )){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	  if ( (score_r[i]+score_l[j]) > score){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	}
      }
      *b_r=best_r;
      *b_l=best_l;
}

Mikaël Salson's avatar
Mikaël Salson committed
495 496 497 498 499
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
500
int align_against_collection(string &read, Fasta &rep, bool reverse_both, bool local, int *tag, 
Mikaël Salson's avatar
Mikaël Salson committed
501
			     int *del, int *del2, int *begin, int *length, vector<pair<int, int> > *score
Mikaël Salson's avatar
Mikaël Salson committed
502 503 504 505 506 507 508 509 510 511
			    , Cost segment_cost)
{
  
  int best_score = MINUS_INF ;
  int best_r = MINUS_INF ;
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
  string best_label = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
512
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
513 514 515 516 517 518 519 520 521 522 523

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
      DynProg dp = DynProg(read, rep.sequence(r),
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
      int score = dp.compute();
Mikaël Salson's avatar
Mikaël Salson committed
524
      
Mikaël Salson's avatar
Mikaël Salson committed
525 526 527 528 529 530 531 532 533 534 535 536 537 538
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
	  best_r = r ;
	  best_label = rep.label(r) ;
	}
Mikaël Salson's avatar
Mikaël Salson committed
539 540
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
541 542 543 544 545 546

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
547 548

    }
Mikaël Salson's avatar
Mikaël Salson committed
549
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
550 551 552 553

  *del = reverse_both ? best_best_j : rep.sequence(best_r).size() - best_best_j - 1;
  *del2 = best_first_j;
  *begin = best_first_i;
Mathieu Giraud's avatar
Mathieu Giraud committed
554
  *tag = best_r ; 
Mikaël Salson's avatar
Mikaël Salson committed
555 556
  
  *length -= *del ;
Mikaël Salson's avatar
Mikaël Salson committed
557 558
  
  *score=score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
559 560 561 562 563 564

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
  cout << "del/del2/begin:" << (*del) << "/" << (*del2) << "/" << (*begin) << endl;
  cout << endl;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
565 566 567 568 569 570 571 572 573
  
  return best_best_i ;
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

574
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
Mikaël Salson's avatar
Mikaël Salson committed
575
{
576 577
  segmented = false;
  dSegmented = false;
578
  because = 0 ;
579
  segmented_germline = germline ;
580
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
581 582
  label = seq.label ;
  sequence = seq.sequence ;
Mathieu Giraud's avatar
Mathieu Giraud committed
583
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
584
  segment_cost=segment_c;
585
  evalue = NO_LIMIT_VALUE;
586 587
  evalue_left = NO_LIMIT_VALUE;
  evalue_right = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
588

589 590 591
  CDR3start = -1;
  CDR3end = -1;
  
Mikaël Salson's avatar
Mikaël Salson committed
592 593 594 595 596
  // TODO: factoriser tout cela, peut-etre en lancant deux segmenteurs, un +, un -, puis un qui chapote
  
  // Strand +
  
  int plus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
597
  int tag_plus_V, tag_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
598 599 600 601
  int plus_length = 0 ;
  int del_plus_V, del_plus_J ;
  int del2=0;
  int beg=0;
Mikaël Salson's avatar
Mikaël Salson committed
602 603 604 605
  
  vector<pair<int, int> > score_plus_V;
  vector<pair<int, int> > score_plus_J;
  
606
  int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg, 
Mikaël Salson's avatar
Mikaël Salson committed
607 608
					   &plus_length, &score_plus_V
					   , segment_cost);
609
  int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
610 611
					    &plus_length, &score_plus_J
					    , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
612 613
  plus_length += plus_right - plus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
614 615
  plus_score=score_plus_V[0].first + score_plus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
616 617 618
  // Strand -
  string rc = revcomp(sequence) ;
  int minus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
619
  int tag_minus_V, tag_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
620 621
  int minus_length = 0 ;
  int del_minus_V, del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
622 623 624 625
  
  vector<pair<int, int> > score_minus_V;
  vector<pair<int, int> > score_minus_J;
  
626
  int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
627 628
					    &minus_length, &score_minus_V
					    ,  segment_cost);
629
  int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
630 631
					     &minus_length, &score_minus_J
					     , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
632 633
  minus_length += minus_right - minus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
634 635
  minus_score=score_minus_V[0].first + score_minus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
636 637 638 639
  reversed = (minus_score > plus_score) ;

  if (!reversed)
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
640 641 642 643
      Vend = plus_left ;
      Jstart = plus_right ;
      best_V = tag_plus_V ;
      best_J = tag_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
644 645
      del_V = del_plus_V ;
      del_J = del_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
646 647
      score_V=score_plus_V;
      score_J=score_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
648 649 650
    }
  else
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
651 652 653 654
      Vend = minus_left ;
      Jstart = minus_right ;
      best_V = tag_minus_V ;
      best_J = tag_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
655 656
      del_V = del_minus_V ;
      del_J = del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
657 658
      score_V=score_minus_V;
      score_J=score_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
659 660
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
661
  segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) && 
662
    (Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
Mikaël Salson's avatar
Mikaël Salson committed
663 664 665
    
  if (!segmented)
    {
666
      because = DONT_KNOW;
Mathieu Giraud's avatar
Mathieu Giraud committed
667
      info = " @" + string_of_int (Vend + FIRST_POS) + "  @" + string_of_int(Jstart + FIRST_POS) ;
668
      
669
      if (Jstart - Vend < germline->delta_min)
670 671 672 673
        {
          because = UNSEG_BAD_DELTA_MIN  ;
        }

674
      if (Jstart - Vend > germline->delta_max)
675 676 677 678 679 680 681 682 683 684 685 686 687 688
        {
          because = UNSEG_BAD_DELTA_MAX  ;
        }
        
      if (Vend == (int) string::npos) 
        {
          because = UNSEG_TOO_FEW_V ;
        }
      
      if (Jstart == (int) string::npos)
        {
          because = UNSEG_TOO_FEW_J ;
        }
      
Mikaël Salson's avatar
Mikaël Salson committed
689 690 691
      return ;
    }
    
692 693
    because = reversed ? SEG_MINUS : SEG_PLUS ;
    
Mikaël Salson's avatar
Mikaël Salson committed
694
    //overlap VJ
Mathieu Giraud's avatar
Mathieu Giraud committed
695
    if(Jstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
696
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
697
      int overlap=Vend-Jstart+1;
Mikaël Salson's avatar
Mikaël Salson committed
698
      
Mathieu Giraud's avatar
Mathieu Giraud committed
699 700
      string seq_left = sequence.substr(0, Vend+1);
      string seq_right = sequence.substr(Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
701 702

      best_align(overlap, seq_left, seq_right, 
703
		 germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
704
      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
705
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
706 707 708
      del_V += b_l;

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
709
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
710
      del_J += b_r;
711
      if (Jstart>=(int) sequence.length())
Mathieu Giraud's avatar
Mathieu Giraud committed
712
	  Jstart=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
713 714 715
    }

    // string chevauchement = removeChevauchement();
Mikaël Salson's avatar
Mikaël Salson committed
716 717

    /// used only below, then recomputed in finishSegmentation() ;
Mathieu Giraud's avatar
Mathieu Giraud committed
718
    seg_N = revcomp(sequence, reversed).substr(Vend+1, Jstart-Vend-1); 
Mikaël Salson's avatar
Mikaël Salson committed
719

720
  code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
721 722 723 724
    " "+ string_of_int(del_V) + 
    "/" + seg_N + 
    // chevauchement +
    "/" + string_of_int(del_J) +
725
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
726

Mikaël Salson's avatar
Mikaël Salson committed
727
    stringstream code_s;
728
   code_s<< germline->rep_5.label(best_V) <<
Mikaël Salson's avatar
Mikaël Salson committed
729 730 731 732
    " -" << string_of_int(del_V) << "/" 
    << seg_N.size()
    // chevauchement +
    << "/-" << string_of_int(del_J)
733
    <<" " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
734 735
    code_short=code_s.str();
    
736 737
  code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
738 739

 
Mathieu Giraud's avatar
Mathieu Giraud committed
740
  info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
741 742 743 744
  finishSegmentation();
}


745
void FineSegmenter::FineSegmentD(Germline *germline){
Mikaël Salson's avatar
Mikaël Salson committed
746 747 748 749
  
  if (segmented){
    
    int end = (int) string::npos ;
Mathieu Giraud's avatar
Mathieu Giraud committed
750
    int tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
751 752 753 754
    int length = 0 ;
    int begin = 0;
    
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
Mathieu Giraud's avatar
Mathieu Giraud committed
755
    int l = Vend - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
756 757 758
    if (l<0) 
      l=0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
759
    int r = Jstart + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
760 761 762 763 764 765 766

    if (r > (int)getSequence().sequence.length()) 
      r = getSequence().sequence.length();
      
    string str = getSequence().sequence.substr(l, r-l);

    // Align
767
    end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
Mikaël Salson's avatar
Mikaël Salson committed
768 769
				&length, &score_D, segment_cost);
    
Mathieu Giraud's avatar
Mathieu Giraud committed
770
    best_D = tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
771
    
Mathieu Giraud's avatar
Mathieu Giraud committed
772 773
    Dstart = l + begin;
    Dend = l + end;
Mikaël Salson's avatar
Mikaël Salson committed
774 775
	
    string seq = getSequence().sequence;
776 777 778 779 780 781 782 783 784 785


    // recompute remaining length for D
    length = germline->rep_4.sequence(best_D).length() - del_D_right - del_D_left;

    if (length < MIN_D_LENGTH)
      return ;


    dSegmented=true;
Mikaël Salson's avatar
Mikaël Salson committed
786 787
    
    //overlap VD
Mathieu Giraud's avatar
Mathieu Giraud committed
788
    if(Dstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
789
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
790 791 792
      int overlap=Vend-Dstart+1;
      string seq_left = seq.substr(0, Vend+1);
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
Mikaël Salson's avatar
Mikaël Salson committed
793 794

      best_align(overlap, seq_left, seq_right, 
795
		 germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
796 797

      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
798
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
799 800 801
      del_V += b_l;

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
802
      Dstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
803
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
804
    seg_N1 = seq.substr(Vend+1, Dstart-Vend-1) ; // From Vend+1 to Dstart-1
Mikaël Salson's avatar
Mikaël Salson committed
805 806
    
    //overlap DJ
Mathieu Giraud's avatar
Mathieu Giraud committed
807
    if(Jstart-Dend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
808
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
809 810 811
      int overlap=Dend-Jstart+1;
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
      string seq_left = seq.substr(Jstart, seq.length()-Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
812 813

      best_align(overlap, seq_left, seq_right, 
814
		 germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
815 816

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
817
      Dend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
818 819

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
820
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
821 822 823
      del_J += b_r;

    }
Mathieu Giraud's avatar
Mathieu Giraud committed
824
    seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
825
    code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
826 827 828 829
    " "+ string_of_int(del_V) + 
    "/" + seg_N1 + 
    
    "/" + string_of_int(del_D_left) +
830
    " " + germline->rep_4.label(best_D) +
Mikaël Salson's avatar
Mikaël Salson committed
831 832 833 834
    " " + string_of_int(del_D_right) +
    
    "/" + seg_N2 +
    "/" + string_of_int(del_J) +
835
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
836

Mikaël Salson's avatar
Mikaël Salson committed
837
    stringstream code_s;
838
    code_s << germline->rep_5.label(best_V) 
Mikaël Salson's avatar
Mikaël Salson committed
839 840 841 842
    << " -" << string_of_int(del_V) << "/" 
    << seg_N1.size()
    
    << "/-" << string_of_int(del_D_left) 
843
    << " " << germline->rep_4.label(best_D) 
Mikaël Salson's avatar
Mikaël Salson committed
844 845 846 847
    << " -" << string_of_int(del_D_right) << "/"
    
    << seg_N2.size()
    << "/-" << string_of_int(del_J) 
848
    << " " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
849 850 851
    code_short=code_s.str();
    
    
852 853 854
    code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_4.label(best_D) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
855 856 857 858
    
    finishSegmentationD();
  }
}
Mikaël Salson's avatar
Mikaël Salson committed
859

Marc Duez's avatar
Marc Duez committed
860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878
void FineSegmenter::findCDR3(){
    string str = getSequence().sequence;
    
    list<string> codon_start;
    codon_start.push_back("TGT");
    codon_start.push_back("TGC");
    
    list<string> codon_end;
    codon_end.push_back("TTT");
    codon_end.push_back("TTC");
    codon_end.push_back("TGG");
    
    list<int> p_start;
    list<int> p_end;

    size_t loc;
    std::list<string>::const_iterator it;
    for (it = codon_start.begin(); it != codon_start.end(); ++it) {//filter 1 : start codon must be in V
        loc = 0;
879
        while ( loc != string::npos && loc < (size_t)Vend){
Marc Duez's avatar
Marc Duez committed
880
            loc = str.find(*it, loc+3);
881
            if (loc != string::npos && loc < (size_t)Vend) {
Marc Duez's avatar
Marc Duez committed
882 883 884 885 886 887 888 889 890 891 892 893 894 895 896
                p_start.push_front(loc);
            }
        }
    }

    for (it = codon_end.begin(); it != codon_end.end(); ++it) {//filter 2 : end codon must be in J
        loc = Jstart;
        while ( loc != string::npos){
            loc = str.find(*it, loc+3);
            if (loc != string::npos) {
                p_end.push_back(loc);
            }
        }
    }

897 898
    CDR3start = -1;
    CDR3end = -1;
Marc Duez's avatar
Marc Duez committed
899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917
    
    std::list<int>::const_iterator it1;
    for (it1 = p_start.begin(); it1 != p_start.end(); ++it1) {
        
        std::list<int>::const_iterator it2;
        for (it2 = p_end.begin(); it2 != p_end.end(); ++it2) {
            
            if ( (*it2-*it1)%3 == 0){       //filter 3 : start/stop codon must be seprated by a multiple of 3
                
                if ( fabs((*it2-*it1)-36 ) < fabs((CDR3end-CDR3start)-36) ){ //filter 4 : cdr3 length must be close to 12 AA
                    CDR3start = *it1;
                    CDR3end = *it2;
                }
            }
        }
    }
    
}

918
void FineSegmenter::toJsonList(JsonList *seg){
Tatiana Rocher's avatar
Tatiana Rocher committed
919

Mathieu Giraud's avatar
Mathieu Giraud committed
920
  if (isSegmented()) {
921 922 923
    seg->add("5", segmented_germline->rep_5.label(best_V));
    seg->add("5start", 0);
    seg->add("5end", Vend);
Mathieu Giraud's avatar
Mathieu Giraud committed
924 925
    
    if (score_D.size()>0){
926 927 928
      seg->add("4", segmented_germline->rep_4.label(best_D));
      seg->add("4start", Dstart);
      seg->add("4end", Dend);
Mikaël Salson's avatar
Mikaël Salson committed
929
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
930
    
931 932
    seg->add("3", segmented_germline->rep_3.label(best_J));
    seg->add("3start", Jstart);
933 934 935

    if (CDR3start >= 0)
      {
Marc Duez's avatar
Marc Duez committed
936 937 938 939
    JsonList *json_cdr;
    json_cdr=new JsonList();
    json_cdr->add("start", CDR3start);
    json_cdr->add("stop", CDR3end);
940
    seg->add("cdr3", *json_cdr);
941
      }
Tatiana Rocher's avatar
Tatiana Rocher committed
942

943 944
  }
}
945

946 947
void KmerSegmenter::toJsonList(JsonList *seg)
{
948 949
    int sequenceSize = sequence.size();

950
    if (evalue > NO_LIMIT_VALUE)