segment.cpp 25.2 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24 25 26 27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
Marc Duez's avatar
Marc Duez committed
29
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
30

Mathieu Giraud's avatar
Mathieu Giraud committed
31
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
32

Mathieu Giraud's avatar
Mathieu Giraud committed
33
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
54
  return Vend;
Mikaël Salson's avatar
Mikaël Salson committed
55 56 57
}
  
int Segmenter::getRight() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
58
  return Jstart;
Mikaël Salson's avatar
Mikaël Salson committed
59 60 61
}

int Segmenter::getLeftD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
62
  return Dstart;
Mikaël Salson's avatar
Mikaël Salson committed
63 64 65
}
  
int Segmenter::getRightD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
66
  return Dend;
Mikaël Salson's avatar
Mikaël Salson committed
67 68 69 70 71 72 73 74 75 76 77 78 79 80
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

81 82 83 84
bool KmerSegmenter::isDetected() const {
  return detected;
}

Mikaël Salson's avatar
Mikaël Salson committed
85 86 87 88 89 90 91 92
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
93
  if (Vend >= Jstart)
Mikaël Salson's avatar
Mikaël Salson committed
94
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
95
      int middle = (Vend + Jstart) / 2 ;
96
      chevauchement = " !ov " + string_of_int (Vend - Jstart + 1);
Mathieu Giraud's avatar
Mathieu Giraud committed
97 98
      Vend = middle ;
      Jstart = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
99 100 101 102 103 104 105 106 107 108 109 110 111 112
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
Mathieu Giraud's avatar
Mathieu Giraud committed
113 114 115 116 117
  seg_V = seq.substr(0, Vend+1) ;
  seg_N = seq.substr(Vend+1, Jstart-Vend-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(Jstart) ;
  Dstart=0;
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
118 119 120 121 122 123 124 125 126 127 128 129 130

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

Mathieu Giraud's avatar
Mathieu Giraud committed
131 132
  seg_V = seq.substr(0, Vend+1) ; // From pos. 0 to Vend
  seg_J = seq.substr(Jstart) ;
Mikaël Salson's avatar
Mikaël Salson committed
133
  
Mathieu Giraud's avatar
Mathieu Giraud committed
134
  seg_D  = seq.substr(Dstart, Dend-Dstart+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
135
  
Mathieu Giraud's avatar
Mathieu Giraud committed
136 137 138 139
  info = "VDJ \t0 " + string_of_int(Vend) +
		" " + string_of_int(Dstart) + 
		" " + string_of_int(Dend) +
		" " + string_of_int(Jstart) +
Mikaël Salson's avatar
Mikaël Salson committed
140 141 142 143 144 145 146 147 148
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

149 150 151 152 153 154 155 156 157
string Segmenter::getInfoLine() const
{
  string s = ">" ;

  s += label + " " ;
  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
158 159 160 161

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

162 163 164
  return s ;
}

Mikaël Salson's avatar
Mikaël Salson committed
165 166
ostream &operator<<(ostream &out, const Segmenter &s)
{
167
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
      out << s.sequence << endl ;
    }

  return out ;
}


// KmerSegmenter (Cheap)

186 187 188
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
Mikaël Salson's avatar
Mikaël Salson committed
189 190 191 192
{
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
193
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
194
  segmented = false;
195
  segmented_germline = germline ;
196
  reversed = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
197
  Dend=0;
198 199
  because = 0 ; // Cause of unsegmentation
  score = 0 ;
200
  evalue = NO_LIMIT_VALUE;
201

202
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
203
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
204

205
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
206
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
207 208
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
209
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
210 211
    }
 
212
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
213 214
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
215 216 217 218 219 220 221 222 223
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
224 225 226
    }
  }

227 228 229
  KmerAffect before, after;

  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
230
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
231
    because = UNSEG_TOO_FEW_ZERO ;
232
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
233 234
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
235 236
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
237 238
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
239 240
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
241 242 243
  } else {
    // Ambiguous information: we have positive and negative strands
    // and there is not enough difference to put them aparat.
244 245
    because = UNSEG_STRAND_NOT_CONSISTENT ;
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
246 247
  }

248
  detected = false ;
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
  computeSegmentation(strand, before, after);

  if (! because)
    {
      // Now we check the delta between Vend and right
   
      if (Jstart - Vend < germline->delta_min)
	{
	  because = UNSEG_BAD_DELTA_MIN ;
	}

      if (Jstart - Vend > germline->delta_max)
	{
	  because = UNSEG_BAD_DELTA_MAX ;
	}
    } 
Mathieu Giraud's avatar
Mathieu Giraud committed
265

266
  if (!because)
Mathieu Giraud's avatar
Mathieu Giraud committed
267 268
    {
      // Yes, it is segmented
269
      segmented = true;
Mathieu Giraud's avatar
Mathieu Giraud committed
270 271 272 273
      reversed = (strand == -1); 
      because = reversed ? SEG_MINUS : SEG_PLUS ;

      info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS)  ;
274 275
      // removeChevauchement is called once info was already computed: it is only to output info_extra
      info_extra += removeChevauchement();
Mathieu Giraud's avatar
Mathieu Giraud committed
276
      finishSegmentation();
277
      system = germline->code;
278
      return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
279
    } 
280

281
 
Mathieu Giraud's avatar
Mathieu Giraud committed
282
}
Mikaël Salson's avatar
Mikaël Salson committed
283

Mathieu Giraud's avatar
Mathieu Giraud committed
284
KmerSegmenter::~KmerSegmenter() {
285 286
  if (kaa)
    delete kaa;
287 288
}

289
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented, double threshold)
290
{
291
  int best_score_seg = 0 ; // Best score, segmented sequences
292
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
293
  the_kseg = NULL;
294
  multi_germline = multigermline;
295
  threshold_nb_expected = threshold;
296
  
297 298 299 300 301
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

302 303
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline);
      bool keep_seg = false;
304

305 306 307 308
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
          *out_unsegmented << "#"
309 310
                           << left << setw(4) << kseg->segmented_germline->code << " "
                           << left << setw(20) << segmented_mesg[kseg->getSegmentationStatus()] << " ";
311

312
          *out_unsegmented << right << setw(3) << kseg->score << " ";
313
          
314 315
          if (kseg->getSegmentationStatus() != UNSEG_TOO_SHORT) 
            *out_unsegmented << kseg->getKmerAffectAnalyser()->toString();
316 317 318 319

          *out_unsegmented << endl ;
        }

320 321
      // Always remember the first kseg
      if (the_kseg == NULL)
322
        keep_seg = true;
323
      
324
      if (kseg->isSegmented())
325 326
        {
          // Yes, it is segmented
327
          // Should we keep the kseg ?
328
          if (kseg->score > best_score_seg)
329
            {
330
              keep_seg = true;
331
              best_score_seg = kseg->score ;
332
            }
333
        }
334 335 336 337 338 339 340 341 342 343 344 345
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
              if (!best_score_seg)
                keep_seg = true;
            }
        }
      
346
      if (keep_seg) {
347 348
        if (the_kseg)
          delete the_kseg;
349 350 351 352
        the_kseg = kseg;
      } else {
        delete kseg;
      }
353
    } // end for (Germlines)
354

355
  // E-value threshold
356
  if (threshold_nb_expected > NO_LIMIT_VALUE)
357 358 359 360 361 362 363
    if (the_kseg->isSegmented()) {
        // the_kseg->evalue also depends on the number of germlines from the *Multi*KmerSegmenter
        the_kseg->evalue = getNbExpected();
        if (the_kseg->evalue > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY);
        }
      }
364 365
}

366 367
double KmerSegmenter::getProbabilityAtLeastOrAbove(int at_least) const {

368 369
  return getKmerAffectAnalyser()->getIndex().getProbabilityAtLeastOrAbove(at_least,
                                                                          getSequence().sequence.size());
370 371 372 373
}

double KmerMultiSegmenter::getNbExpected() const {
  double proba = the_kseg->getProbabilityAtLeastOrAbove(the_kseg->score);
374
  return multi_germline->germlines.size() * proba;
375 376 377
}

KmerMultiSegmenter::~KmerMultiSegmenter() {
378 379
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
380 381
}

382 383
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after) {
  // Try to segment, computing 'Vend' and 'Jstart'
384 385
  // If not segmented, put the cause of unsegmentation in 'because'

386
  affect_infos max;
Mikaël Salson's avatar
Mikaël Salson committed
387

388
  max = kaa->getMaximum(before, after); 
389 390 391 392 393

      // We labeled it detected if there were both enough affect_5 and enough affect_3
      detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
        && (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);
      
394
      if (! max.max_found) {
395 396
        if ((strand == 1 && max.nb_before_left == 0)
            || (strand == -1 && max.nb_after_right == 0)) 
397
          because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_V ;
398 399
        else if ((strand == 1 && max.nb_after_right == 0)
                 || (strand == -1 && max.nb_before_left == 0))
Mikaël Salson's avatar
Mikaël Salson committed
400
	{
401
	  because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_J ;
402
	} else 
403 404 405 406 407 408 409 410 411 412
          because = UNSEG_AMBIGUOUS; 
      } else {
        Vend = max.first_pos_max;
        Jstart = max.last_pos_max + 1;
        if (strand == -1) {
          int tmp = sequence.size() - Vend - 1;
          Vend = sequence.size() - Jstart - 1;
          Jstart = tmp;
        }
      }
Mikaël Salson's avatar
Mikaël Salson committed
413
  
414
    score = max.nb_before_left + max.nb_before_right + max.nb_after_left + max.nb_after_right;  
Mathieu Giraud's avatar
Mathieu Giraud committed
415
}
Mikaël Salson's avatar
Mikaël Salson committed
416

417
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
418 419
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
420

421
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
422
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
423 424
}

425 426 427 428 429
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454
// FineSegmenter

void best_align(int overlap, string seq_left, string seq_right,
		string ref_left ,string ref_right, int *b_r, int *b_l, Cost segment_cost)
{
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
      ref_left=string(ref_left.rbegin(), ref_left.rend());
      seq_left=string(seq_left.rbegin(), seq_left.rend()); 
      
      DynProg dp1 = DynProg(seq_left, ref_left,
			   DynProg::Local, segment_cost);
      score_l[0] = dp1.compute();
      dp1.backtrack();
      
      //RIGHT
      DynProg dp = DynProg(seq_right, ref_right,
			   DynProg::Local, segment_cost);
      score_r[0] = dp.compute();
      dp.backtrack();    
      
      for(int i=1; i<=overlap; i++){
	if(dp.best_i-i >0)
455
	score_r[i] = dp.B[dp.best_i-i][dp.best_j].score;
Mikaël Salson's avatar
Mikaël Salson committed
456 457 458
	else
	score_r[i] =0;
	if(dp1.best_i-i >0)
459
	score_l[i] = dp1.B[dp1.best_i-i][dp1.best_j].score;	
Mikaël Salson's avatar
Mikaël Salson committed
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484
	else
	score_l[i] =0;
      }
      int score=-1000000;
      int best_r=0;
      int best_l=0;

      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
	  if ( ((score_r[i]+score_l[j]) == score) && (i+j < best_r+best_l )){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	  if ( (score_r[i]+score_l[j]) > score){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	}
      }
      *b_r=best_r;
      *b_l=best_l;
}

Mikaël Salson's avatar
Mikaël Salson committed
485 486 487 488 489
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
490
int align_against_collection(string &read, Fasta &rep, bool reverse_both, bool local, int *tag, 
Mikaël Salson's avatar
Mikaël Salson committed
491
			     int *del, int *del2, int *begin, int *length, vector<pair<int, int> > *score
Mikaël Salson's avatar
Mikaël Salson committed
492 493 494 495 496 497 498 499 500 501
			    , Cost segment_cost)
{
  
  int best_score = MINUS_INF ;
  int best_r = MINUS_INF ;
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
  string best_label = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
502
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
503 504 505 506 507 508 509 510 511 512 513

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
      DynProg dp = DynProg(read, rep.sequence(r),
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
      int score = dp.compute();
Mikaël Salson's avatar
Mikaël Salson committed
514
      
Mikaël Salson's avatar
Mikaël Salson committed
515 516 517 518 519 520 521 522 523 524 525 526 527 528
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
	  best_r = r ;
	  best_label = rep.label(r) ;
	}
Mikaël Salson's avatar
Mikaël Salson committed
529 530
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
531 532 533 534 535 536

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
537 538

    }
Mikaël Salson's avatar
Mikaël Salson committed
539
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
540 541 542 543

  *del = reverse_both ? best_best_j : rep.sequence(best_r).size() - best_best_j - 1;
  *del2 = best_first_j;
  *begin = best_first_i;
Mathieu Giraud's avatar
Mathieu Giraud committed
544
  *tag = best_r ; 
Mikaël Salson's avatar
Mikaël Salson committed
545 546
  
  *length -= *del ;
Mikaël Salson's avatar
Mikaël Salson committed
547 548
  
  *score=score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
549 550 551 552 553 554

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
  cout << "del/del2/begin:" << (*del) << "/" << (*del2) << "/" << (*begin) << endl;
  cout << endl;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
555 556 557 558 559 560 561 562 563
  
  return best_best_i ;
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

564
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
Mikaël Salson's avatar
Mikaël Salson committed
565
{
566 567
  segmented = false;
  dSegmented = false;
568
  because = 0 ;
569
  segmented_germline = germline ;
570
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
571 572
  label = seq.label ;
  sequence = seq.sequence ;
Mathieu Giraud's avatar
Mathieu Giraud committed
573
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
574
  segment_cost=segment_c;
575
  evalue = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
576

577 578 579
  CDR3start = -1;
  CDR3end = -1;
  
Mikaël Salson's avatar
Mikaël Salson committed
580 581 582 583 584
  // TODO: factoriser tout cela, peut-etre en lancant deux segmenteurs, un +, un -, puis un qui chapote
  
  // Strand +
  
  int plus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
585
  int tag_plus_V, tag_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
586 587 588 589
  int plus_length = 0 ;
  int del_plus_V, del_plus_J ;
  int del2=0;
  int beg=0;
Mikaël Salson's avatar
Mikaël Salson committed
590 591 592 593
  
  vector<pair<int, int> > score_plus_V;
  vector<pair<int, int> > score_plus_J;
  
594
  int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg, 
Mikaël Salson's avatar
Mikaël Salson committed
595 596
					   &plus_length, &score_plus_V
					   , segment_cost);
597
  int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
598 599
					    &plus_length, &score_plus_J
					    , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
600 601
  plus_length += plus_right - plus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
602 603
  plus_score=score_plus_V[0].first + score_plus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
604 605 606
  // Strand -
  string rc = revcomp(sequence) ;
  int minus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
607
  int tag_minus_V, tag_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
608 609
  int minus_length = 0 ;
  int del_minus_V, del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
610 611 612 613
  
  vector<pair<int, int> > score_minus_V;
  vector<pair<int, int> > score_minus_J;
  
614
  int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
615 616
					    &minus_length, &score_minus_V
					    ,  segment_cost);
617
  int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
618 619
					     &minus_length, &score_minus_J
					     , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
620 621
  minus_length += minus_right - minus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
622 623
  minus_score=score_minus_V[0].first + score_minus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
624 625 626 627
  reversed = (minus_score > plus_score) ;

  if (!reversed)
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
628 629 630 631
      Vend = plus_left ;
      Jstart = plus_right ;
      best_V = tag_plus_V ;
      best_J = tag_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
632 633
      del_V = del_plus_V ;
      del_J = del_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
634 635
      score_V=score_plus_V;
      score_J=score_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
636 637 638
    }
  else
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
639 640 641 642
      Vend = minus_left ;
      Jstart = minus_right ;
      best_V = tag_minus_V ;
      best_J = tag_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
643 644
      del_V = del_minus_V ;
      del_J = del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
645 646
      score_V=score_minus_V;
      score_J=score_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
647 648
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
649
  segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) && 
650
    (Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
Mikaël Salson's avatar
Mikaël Salson committed
651 652 653
    
  if (!segmented)
    {
654
      because = DONT_KNOW;
Mathieu Giraud's avatar
Mathieu Giraud committed
655
      info = " @" + string_of_int (Vend + FIRST_POS) + "  @" + string_of_int(Jstart + FIRST_POS) ;
656
      
657
      if (Jstart - Vend < germline->delta_min)
658 659 660 661
        {
          because = UNSEG_BAD_DELTA_MIN  ;
        }

662
      if (Jstart - Vend > germline->delta_max)
663 664 665 666 667 668 669 670 671 672 673 674 675 676
        {
          because = UNSEG_BAD_DELTA_MAX  ;
        }
        
      if (Vend == (int) string::npos) 
        {
          because = UNSEG_TOO_FEW_V ;
        }
      
      if (Jstart == (int) string::npos)
        {
          because = UNSEG_TOO_FEW_J ;
        }
      
Mikaël Salson's avatar
Mikaël Salson committed
677 678 679
      return ;
    }
    
680 681
    because = reversed ? SEG_MINUS : SEG_PLUS ;
    
Mikaël Salson's avatar
Mikaël Salson committed
682
    //overlap VJ
Mathieu Giraud's avatar
Mathieu Giraud committed
683
    if(Jstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
684
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
685
      int overlap=Vend-Jstart+1;
Mikaël Salson's avatar
Mikaël Salson committed
686
      
Mathieu Giraud's avatar
Mathieu Giraud committed
687 688
      string seq_left = sequence.substr(0, Vend+1);
      string seq_right = sequence.substr(Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
689 690

      best_align(overlap, seq_left, seq_right, 
691
		 germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
692
      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
693
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
694 695 696
      del_V += b_l;

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
697
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
698
      del_J += b_r;
699
      if (Jstart>=(int) sequence.length())
Mathieu Giraud's avatar
Mathieu Giraud committed
700
	  Jstart=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
701 702 703
    }

    // string chevauchement = removeChevauchement();
Mikaël Salson's avatar
Mikaël Salson committed
704 705

    /// used only below, then recomputed in finishSegmentation() ;
Mathieu Giraud's avatar
Mathieu Giraud committed
706
    seg_N = revcomp(sequence, reversed).substr(Vend+1, Jstart-Vend-1); 
Mikaël Salson's avatar
Mikaël Salson committed
707

708
  code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
709 710 711 712
    " "+ string_of_int(del_V) + 
    "/" + seg_N + 
    // chevauchement +
    "/" + string_of_int(del_J) +
713
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
714

Mikaël Salson's avatar
Mikaël Salson committed
715
    stringstream code_s;
716
   code_s<< germline->rep_5.label(best_V) <<
Mikaël Salson's avatar
Mikaël Salson committed
717 718 719 720
    " -" << string_of_int(del_V) << "/" 
    << seg_N.size()
    // chevauchement +
    << "/-" << string_of_int(del_J)
721
    <<" " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
722 723
    code_short=code_s.str();
    
724 725
  code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
726 727

 
Mathieu Giraud's avatar
Mathieu Giraud committed
728
  info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
729 730 731 732
  finishSegmentation();
}


733
void FineSegmenter::FineSegmentD(Germline *germline){
Mikaël Salson's avatar
Mikaël Salson committed
734 735 736 737
  
  if (segmented){
    
    int end = (int) string::npos ;
Mathieu Giraud's avatar
Mathieu Giraud committed
738
    int tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
739 740 741 742
    int length = 0 ;
    int begin = 0;
    
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
Mathieu Giraud's avatar
Mathieu Giraud committed
743
    int l = Vend - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
744 745 746
    if (l<0) 
      l=0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
747
    int r = Jstart + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
748 749 750 751 752 753 754

    if (r > (int)getSequence().sequence.length()) 
      r = getSequence().sequence.length();
      
    string str = getSequence().sequence.substr(l, r-l);

    // Align
755
    end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
Mikaël Salson's avatar
Mikaël Salson committed
756 757
				&length, &score_D, segment_cost);
    
Mathieu Giraud's avatar
Mathieu Giraud committed
758
    best_D = tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
759
    
Mathieu Giraud's avatar
Mathieu Giraud committed
760 761
    Dstart = l + begin;
    Dend = l + end;
Mikaël Salson's avatar
Mikaël Salson committed
762 763
	
    string seq = getSequence().sequence;
764 765 766 767 768 769 770 771 772 773


    // recompute remaining length for D
    length = germline->rep_4.sequence(best_D).length() - del_D_right - del_D_left;

    if (length < MIN_D_LENGTH)
      return ;


    dSegmented=true;
Mikaël Salson's avatar
Mikaël Salson committed
774 775
    
    //overlap VD
Mathieu Giraud's avatar
Mathieu Giraud committed
776
    if(Dstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
777
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
778 779 780
      int overlap=Vend-Dstart+1;
      string seq_left = seq.substr(0, Vend+1);
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
Mikaël Salson's avatar
Mikaël Salson committed
781 782

      best_align(overlap, seq_left, seq_right, 
783
		 germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
784 785

      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
786
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
787 788 789
      del_V += b_l;

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
790
      Dstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
791
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
792
    seg_N1 = seq.substr(Vend+1, Dstart-Vend-1) ; // From Vend+1 to Dstart-1
Mikaël Salson's avatar
Mikaël Salson committed
793 794
    
    //overlap DJ
Mathieu Giraud's avatar
Mathieu Giraud committed
795
    if(Jstart-Dend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
796
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
797 798 799
      int overlap=Dend-Jstart+1;
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
      string seq_left = seq.substr(Jstart, seq.length()-Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
800 801

      best_align(overlap, seq_left, seq_right, 
802
		 germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
803 804

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
805
      Dend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
806 807

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
808
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
809 810 811
      del_J += b_r;

    }
Mathieu Giraud's avatar
Mathieu Giraud committed
812
    seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
813
    code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
814 815 816 817
    " "+ string_of_int(del_V) + 
    "/" + seg_N1 + 
    
    "/" + string_of_int(del_D_left) +
818
    " " + germline->rep_4.label(best_D) +
Mikaël Salson's avatar
Mikaël Salson committed
819 820 821 822
    " " + string_of_int(del_D_right) +
    
    "/" + seg_N2 +
    "/" + string_of_int(del_J) +
823
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
824

Mikaël Salson's avatar
Mikaël Salson committed
825
    stringstream code_s;
826
    code_s << germline->rep_5.label(best_V) 
Mikaël Salson's avatar
Mikaël Salson committed
827 828 829 830
    << " -" << string_of_int(del_V) << "/" 
    << seg_N1.size()
    
    << "/-" << string_of_int(del_D_left) 
831
    << " " << germline->rep_4.label(best_D) 
Mikaël Salson's avatar
Mikaël Salson committed
832 833 834 835
    << " -" << string_of_int(del_D_right) << "/"
    
    << seg_N2.size()
    << "/-" << string_of_int(del_J) 
836
    << " " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
837 838 839
    code_short=code_s.str();
    
    
840 841 842
    code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_4.label(best_D) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
843 844 845 846
    
    finishSegmentationD();
  }
}
Mikaël Salson's avatar
Mikaël Salson committed
847

Marc Duez's avatar
Marc Duez committed
848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866
void FineSegmenter::findCDR3(){
    string str = getSequence().sequence;
    
    list<string> codon_start;
    codon_start.push_back("TGT");
    codon_start.push_back("TGC");
    
    list<string> codon_end;
    codon_end.push_back("TTT");
    codon_end.push_back("TTC");
    codon_end.push_back("TGG");
    
    list<int> p_start;
    list<int> p_end;

    size_t loc;
    std::list<string>::const_iterator it;
    for (it = codon_start.begin(); it != codon_start.end(); ++it) {//filter 1 : start codon must be in V
        loc = 0;
867
        while ( loc != string::npos && loc < (size_t)Vend){
Marc Duez's avatar
Marc Duez committed
868
            loc = str.find(*it, loc+3);
869
            if (loc != string::npos && loc < (size_t)Vend) {
Marc Duez's avatar
Marc Duez committed
870 871 872 873 874 875 876 877 878 879 880 881 882 883 884
                p_start.push_front(loc);
            }
        }
    }

    for (it = codon_end.begin(); it != codon_end.end(); ++it) {//filter 2 : end codon must be in J
        loc = Jstart;
        while ( loc != string::npos){
            loc = str.find(*it, loc+3);
            if (loc != string::npos) {
                p_end.push_back(loc);
            }
        }
    }

885 886
    CDR3start = -1;
    CDR3end = -1;
Marc Duez's avatar
Marc Duez committed
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905
    
    std::list<int>::const_iterator it1;
    for (it1 = p_start.begin(); it1 != p_start.end(); ++it1) {
        
        std::list<int>::const_iterator it2;
        for (it2 = p_end.begin(); it2 != p_end.end(); ++it2) {
            
            if ( (*it2-*it1)%3 == 0){       //filter 3 : start/stop codon must be seprated by a multiple of 3
                
                if ( fabs((*it2-*it1)-36 ) < fabs((CDR3end-CDR3start)-36) ){ //filter 4 : cdr3 length must be close to 12 AA
                    CDR3start = *it1;
                    CDR3end = *it2;
                }
            }
        }
    }
    
}

906
void FineSegmenter::toJsonList(JsonList *seg){
Tatiana Rocher's avatar
Tatiana Rocher committed
907

Mathieu Giraud's avatar
Mathieu Giraud committed
908
  if (isSegmented()) {
909 910 911
    seg->add("5", segmented_germline->rep_5.label(best_V));
    seg->add("5start", 0);
    seg->add("5end", Vend);
Mathieu Giraud's avatar
Mathieu Giraud committed
912 913
    
    if (score_D.size()>0){
914 915 916
      seg->add("4", segmented_germline->rep_4.label(best_D));
      seg->add("4start", Dstart);
      seg->add("4end", Dend);
Mikaël Salson's avatar
Mikaël Salson committed
917
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
918
    
919 920
    seg->add("3", segmented_germline->rep_3.label(best_J));
    seg->add("3start", Jstart);
921 922 923

    if (CDR3start >= 0)
      {
Marc Duez's avatar
Marc Duez committed
924 925 926 927
    JsonList *json_cdr;
    json_cdr=new JsonList();
    json_cdr->add("start", CDR3start);
    json_cdr->add("stop", CDR3end);
928
    seg->add("cdr3", *json_cdr);
929
      }
Tatiana Rocher's avatar
Tatiana Rocher committed
930

931 932
  }
}
933

934 935
void KmerSegmenter::toJsonList(JsonList *seg)
{
936 937
    int sequenceSize = sequence.size();

938
    if (evalue > NO_LIMIT_VALUE)
939
      seg->add("_evalue", scientific_string_of_double(evalue));
940

941 942 943 944
    JsonList *json_affectValues;
    json_affectValues=new JsonList();
    json_affectValues->add("start", 0);
    json_affectValues->add("stop", sequenceSize); 
945 946
    json_affectValues->add("seq", getKmerAffectAnalyser()->toStringValues());
    seg->add("affectValues", *json_affectValues);