Mise à jour terminée. Pour connaître les apports de la version 13.8.4 par rapport à notre ancienne version vous pouvez lire les "Release Notes" suivantes :
https://about.gitlab.com/releases/2021/02/11/security-release-gitlab-13-8-4-released/
https://about.gitlab.com/releases/2021/02/05/gitlab-13-8-3-released/

segment.cpp 25 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24 25 26 27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
Marc Duez's avatar
Marc Duez committed
29
#include <string>
Mikaël Salson's avatar
Mikaël Salson committed
30

Mathieu Giraud's avatar
Mathieu Giraud committed
31
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
32

Mathieu Giraud's avatar
Mathieu Giraud committed
33
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
54
  return Vend;
Mikaël Salson's avatar
Mikaël Salson committed
55 56 57
}
  
int Segmenter::getRight() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
58
  return Jstart;
Mikaël Salson's avatar
Mikaël Salson committed
59 60 61
}

int Segmenter::getLeftD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
62
  return Dstart;
Mikaël Salson's avatar
Mikaël Salson committed
63 64 65
}
  
int Segmenter::getRightD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
66
  return Dend;
Mikaël Salson's avatar
Mikaël Salson committed
67 68 69 70 71 72 73 74 75 76 77 78 79 80
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

81 82 83 84
bool KmerSegmenter::isDetected() const {
  return detected;
}

Mikaël Salson's avatar
Mikaël Salson committed
85 86 87 88 89 90 91 92
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
93
  if (Vend >= Jstart)
Mikaël Salson's avatar
Mikaël Salson committed
94
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
95
      int middle = (Vend + Jstart) / 2 ;
96
      chevauchement = " !ov " + string_of_int (Vend - Jstart + 1);
Mathieu Giraud's avatar
Mathieu Giraud committed
97 98
      Vend = middle ;
      Jstart = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
99 100 101 102 103 104 105 106 107 108 109 110 111 112
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
Mathieu Giraud's avatar
Mathieu Giraud committed
113 114 115 116 117
  seg_V = seq.substr(0, Vend+1) ;
  seg_N = seq.substr(Vend+1, Jstart-Vend-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(Jstart) ;
  Dstart=0;
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
118 119 120 121 122 123 124 125 126 127 128 129 130

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

Mathieu Giraud's avatar
Mathieu Giraud committed
131 132
  seg_V = seq.substr(0, Vend+1) ; // From pos. 0 to Vend
  seg_J = seq.substr(Jstart) ;
Mikaël Salson's avatar
Mikaël Salson committed
133
  
Mathieu Giraud's avatar
Mathieu Giraud committed
134
  seg_D  = seq.substr(Dstart, Dend-Dstart+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
135
  
Mathieu Giraud's avatar
Mathieu Giraud committed
136 137 138 139
  info = "VDJ \t0 " + string_of_int(Vend) +
		" " + string_of_int(Dstart) + 
		" " + string_of_int(Dend) +
		" " + string_of_int(Jstart) +
Mikaël Salson's avatar
Mikaël Salson committed
140 141 142 143 144 145 146 147 148
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

149 150 151 152 153 154 155 156 157
string Segmenter::getInfoLine() const
{
  string s = ">" ;

  s += label + " " ;
  s += (segmented ? "" : "! ") + info ;
  s += " " + info_extra ;
  s += " " + segmented_germline->code ;
  s += " " + string(segmented_mesg[because]) ;
158 159 160 161

  if (evalue > NO_LIMIT_VALUE)
    s += " " + scientific_string_of_double(evalue);

162 163 164
  return s ;
}

Mikaël Salson's avatar
Mikaël Salson committed
165 166
ostream &operator<<(ostream &out, const Segmenter &s)
{
167
  out << s.getInfoLine() << endl;
Mikaël Salson's avatar
Mikaël Salson committed
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
      out << s.sequence << endl ;
    }

  return out ;
}


// KmerSegmenter (Cheap)

186 187 188
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
Mikaël Salson's avatar
Mikaël Salson committed
189 190 191 192
{
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
193
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
194
  segmented = false;
195
  segmented_germline = germline ;
196
  reversed = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
197
  Dend=0;
198 199
  because = 0 ; // Cause of unsegmentation
  score = 0 ;
200
  evalue = NO_LIMIT_VALUE;
201

202
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
203
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
204

205
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
206
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
207 208
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
209
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
210 211
    }
 
212
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
213 214
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
215 216 217 218 219 220 221 222 223
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
224 225 226
    }
  }

227 228 229
  KmerAffect before, after;

  // Test on which strand we are, select the before and after KmerAffects
Mikaël Salson's avatar
Mikaël Salson committed
230
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
231
    because = UNSEG_TOO_FEW_ZERO ;
232
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
233 234
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
235 236
    before = KmerAffect(germline->affect_3, -1); 
    after = KmerAffect(germline->affect_5, -1);
Mikaël Salson's avatar
Mikaël Salson committed
237 238
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
239 240
    before = KmerAffect(germline->affect_5, 1); 
    after = KmerAffect(germline->affect_3, 1);    
Mikaël Salson's avatar
Mikaël Salson committed
241 242 243
  } else {
    // Ambiguous information: we have positive and negative strands
    // and there is not enough difference to put them aparat.
244 245
    because = UNSEG_STRAND_NOT_CONSISTENT ;
    return ;
Mikaël Salson's avatar
Mikaël Salson committed
246 247
  }

248
  detected = false ;
249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
  computeSegmentation(strand, before, after);

  if (! because)
    {
      // Now we check the delta between Vend and right
   
      if (Jstart - Vend < germline->delta_min)
	{
	  because = UNSEG_BAD_DELTA_MIN ;
	}

      if (Jstart - Vend > germline->delta_max)
	{
	  because = UNSEG_BAD_DELTA_MAX ;
	}
    } 
Mathieu Giraud's avatar
Mathieu Giraud committed
265

266
  if (!because)
Mathieu Giraud's avatar
Mathieu Giraud committed
267 268
    {
      // Yes, it is segmented
269
      segmented = true;
Mathieu Giraud's avatar
Mathieu Giraud committed
270 271 272 273
      reversed = (strand == -1); 
      because = reversed ? SEG_MINUS : SEG_PLUS ;

      info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS)  ;
274 275
      // removeChevauchement is called once info was already computed: it is only to output info_extra
      info_extra += removeChevauchement();
Mathieu Giraud's avatar
Mathieu Giraud committed
276
      finishSegmentation();
277
      system = germline->code;
278
      return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
279
    } 
280

281
 
Mathieu Giraud's avatar
Mathieu Giraud committed
282
}
Mikaël Salson's avatar
Mikaël Salson committed
283

Mathieu Giraud's avatar
Mathieu Giraud committed
284
KmerSegmenter::~KmerSegmenter() {
285 286
  if (kaa)
    delete kaa;
287 288
}

289
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented, double threshold)
290
{
291
  int best_score_seg = 0 ; // Best score, segmented sequences
292
  int best_score_unseg = 0 ; // Best score, unsegmented sequences
293
  the_kseg = NULL;
294
  multi_germline = multigermline;
295
  threshold_nb_expected = threshold;
296
  
297 298 299 300 301
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

302 303
      KmerSegmenter *kseg = new KmerSegmenter(seq, germline);
      bool keep_seg = false;
304

305 306 307 308
      if (out_unsegmented)
        {
          // Debug, display k-mer affectation and segmentation result for this germline
          *out_unsegmented << "#"
309 310
                           << left << setw(4) << kseg->segmented_germline->code << " "
                           << left << setw(20) << segmented_mesg[kseg->getSegmentationStatus()] << " ";
311

312
          *out_unsegmented << right << setw(3) << kseg->score << " ";
313
          
314 315
          if (kseg->getSegmentationStatus() != UNSEG_TOO_SHORT) 
            *out_unsegmented << kseg->getKmerAffectAnalyser()->toString();
316 317 318 319

          *out_unsegmented << endl ;
        }

320 321
      // Always remember the first kseg
      if (the_kseg == NULL)
322
        keep_seg = true;
323
      
324
      if (kseg->isSegmented())
325 326
        {
          // Yes, it is segmented
327
          // Should we keep the kseg ?
328
          if (kseg->score > best_score_seg)
329
            {
330
              keep_seg = true;
331
              best_score_seg = kseg->score ;
332
            }
333
        }
334 335 336 337 338 339 340 341 342 343 344 345
      else
        {
          // It is not segmented
          // Should we keep the kseg (with the unsegmentation cause) ?
            if (kseg->score > best_score_unseg)
            {              
              best_score_unseg = kseg->score ;
              if (!best_score_seg)
                keep_seg = true;
            }
        }
      
346
      if (keep_seg) {
347 348
        if (the_kseg)
          delete the_kseg;
349 350 351 352
        the_kseg = kseg;
      } else {
        delete kseg;
      }
353
    } // end for (Germlines)
354

355
  // E-value threshold
356
  if (threshold_nb_expected > NO_LIMIT_VALUE)
357 358 359 360 361 362 363
    if (the_kseg->isSegmented()) {
        // the_kseg->evalue also depends on the number of germlines from the *Multi*KmerSegmenter
        the_kseg->evalue = getNbExpected();
        if (the_kseg->evalue > threshold_nb_expected) {
          the_kseg->setSegmentationStatus(UNSEG_NOISY);
        }
      }
364 365
}

366
double KmerMultiSegmenter::getNbExpected() const {
367
  double proba = the_kseg->getKmerAffectAnalyser()->getProbabilityAtLeastOrAbove(the_kseg->score);
368
  return multi_germline->germlines.size() * proba;
369 370 371
}

KmerMultiSegmenter::~KmerMultiSegmenter() {
372 373
  if (the_kseg)
    delete the_kseg;
Mathieu Giraud's avatar
Mathieu Giraud committed
374 375
}

376 377
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after) {
  // Try to segment, computing 'Vend' and 'Jstart'
378 379
  // If not segmented, put the cause of unsegmentation in 'because'

380
  affect_infos max;
Mikaël Salson's avatar
Mikaël Salson committed
381

382
  max = kaa->getMaximum(before, after); 
383 384 385 386 387

      // We labeled it detected if there were both enough affect_5 and enough affect_3
      detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
        && (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);
      
388
      if (! max.max_found) {
389 390
        if ((strand == 1 && max.nb_before_left == 0)
            || (strand == -1 && max.nb_after_right == 0)) 
391
          because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_V ;
392 393
        else if ((strand == 1 && max.nb_after_right == 0)
                 || (strand == -1 && max.nb_before_left == 0))
Mikaël Salson's avatar
Mikaël Salson committed
394
	{
395
	  because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_J ;
396
	} else 
397 398 399 400 401 402 403 404 405 406
          because = UNSEG_AMBIGUOUS; 
      } else {
        Vend = max.first_pos_max;
        Jstart = max.last_pos_max + 1;
        if (strand == -1) {
          int tmp = sequence.size() - Vend - 1;
          Vend = sequence.size() - Jstart - 1;
          Jstart = tmp;
        }
      }
Mikaël Salson's avatar
Mikaël Salson committed
407
  
408
    score = max.nb_before_left + max.nb_before_right + max.nb_after_left + max.nb_after_right;  
Mathieu Giraud's avatar
Mathieu Giraud committed
409
}
Mikaël Salson's avatar
Mikaël Salson committed
410

411
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
412 413
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
414

415
int Segmenter::getSegmentationStatus() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
416
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
417 418
}

419 420 421 422 423
void Segmenter::setSegmentationStatus(int status) {
  because = status;
  segmented = (status == SEG_PLUS || status == SEG_MINUS);
}

Mikaël Salson's avatar
Mikaël Salson committed
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448
// FineSegmenter

void best_align(int overlap, string seq_left, string seq_right,
		string ref_left ,string ref_right, int *b_r, int *b_l, Cost segment_cost)
{
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
      ref_left=string(ref_left.rbegin(), ref_left.rend());
      seq_left=string(seq_left.rbegin(), seq_left.rend()); 
      
      DynProg dp1 = DynProg(seq_left, ref_left,
			   DynProg::Local, segment_cost);
      score_l[0] = dp1.compute();
      dp1.backtrack();
      
      //RIGHT
      DynProg dp = DynProg(seq_right, ref_right,
			   DynProg::Local, segment_cost);
      score_r[0] = dp.compute();
      dp.backtrack();    
      
      for(int i=1; i<=overlap; i++){
	if(dp.best_i-i >0)
449
	score_r[i] = dp.B[dp.best_i-i][dp.best_j].score;
Mikaël Salson's avatar
Mikaël Salson committed
450 451 452
	else
	score_r[i] =0;
	if(dp1.best_i-i >0)
453
	score_l[i] = dp1.B[dp1.best_i-i][dp1.best_j].score;	
Mikaël Salson's avatar
Mikaël Salson committed
454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478
	else
	score_l[i] =0;
      }
      int score=-1000000;
      int best_r=0;
      int best_l=0;

      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
	  if ( ((score_r[i]+score_l[j]) == score) && (i+j < best_r+best_l )){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	  if ( (score_r[i]+score_l[j]) > score){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	}
      }
      *b_r=best_r;
      *b_l=best_l;
}

Mikaël Salson's avatar
Mikaël Salson committed
479 480 481 482 483
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
484
int align_against_collection(string &read, Fasta &rep, bool reverse_both, bool local, int *tag, 
Mikaël Salson's avatar
Mikaël Salson committed
485
			     int *del, int *del2, int *begin, int *length, vector<pair<int, int> > *score
Mikaël Salson's avatar
Mikaël Salson committed
486 487 488 489 490 491 492 493 494 495
			    , Cost segment_cost)
{
  
  int best_score = MINUS_INF ;
  int best_r = MINUS_INF ;
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
  string best_label = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
496
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
497 498 499 500 501 502 503 504 505 506 507

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
      DynProg dp = DynProg(read, rep.sequence(r),
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
      int score = dp.compute();
Mikaël Salson's avatar
Mikaël Salson committed
508
      
Mikaël Salson's avatar
Mikaël Salson committed
509 510 511 512 513 514 515 516 517 518 519 520 521 522
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
	  best_r = r ;
	  best_label = rep.label(r) ;
	}
Mikaël Salson's avatar
Mikaël Salson committed
523 524
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
525 526 527 528 529 530

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
531 532

    }
Mikaël Salson's avatar
Mikaël Salson committed
533
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
534 535 536 537

  *del = reverse_both ? best_best_j : rep.sequence(best_r).size() - best_best_j - 1;
  *del2 = best_first_j;
  *begin = best_first_i;
Mathieu Giraud's avatar
Mathieu Giraud committed
538
  *tag = best_r ; 
Mikaël Salson's avatar
Mikaël Salson committed
539 540
  
  *length -= *del ;
Mikaël Salson's avatar
Mikaël Salson committed
541 542
  
  *score=score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
543 544 545 546 547 548

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
  cout << "del/del2/begin:" << (*del) << "/" << (*del2) << "/" << (*begin) << endl;
  cout << endl;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
549 550 551 552 553 554 555 556 557
  
  return best_best_i ;
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

558
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
Mikaël Salson's avatar
Mikaël Salson committed
559
{
560 561
  segmented = false;
  dSegmented = false;
562
  because = 0 ;
563
  segmented_germline = germline ;
564
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
565 566
  label = seq.label ;
  sequence = seq.sequence ;
Mathieu Giraud's avatar
Mathieu Giraud committed
567
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
568
  segment_cost=segment_c;
569
  evalue = NO_LIMIT_VALUE;
Mikaël Salson's avatar
Mikaël Salson committed
570

571 572 573
  CDR3start = -1;
  CDR3end = -1;
  
Mikaël Salson's avatar
Mikaël Salson committed
574 575 576 577 578
  // TODO: factoriser tout cela, peut-etre en lancant deux segmenteurs, un +, un -, puis un qui chapote
  
  // Strand +
  
  int plus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
579
  int tag_plus_V, tag_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
580 581 582 583
  int plus_length = 0 ;
  int del_plus_V, del_plus_J ;
  int del2=0;
  int beg=0;
Mikaël Salson's avatar
Mikaël Salson committed
584 585 586 587
  
  vector<pair<int, int> > score_plus_V;
  vector<pair<int, int> > score_plus_J;
  
588
  int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg, 
Mikaël Salson's avatar
Mikaël Salson committed
589 590
					   &plus_length, &score_plus_V
					   , segment_cost);
591
  int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
592 593
					    &plus_length, &score_plus_J
					    , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
594 595
  plus_length += plus_right - plus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
596 597
  plus_score=score_plus_V[0].first + score_plus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
598 599 600
  // Strand -
  string rc = revcomp(sequence) ;
  int minus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
601
  int tag_minus_V, tag_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
602 603
  int minus_length = 0 ;
  int del_minus_V, del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
604 605 606 607
  
  vector<pair<int, int> > score_minus_V;
  vector<pair<int, int> > score_minus_J;
  
608
  int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
609 610
					    &minus_length, &score_minus_V
					    ,  segment_cost);
611
  int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
612 613
					     &minus_length, &score_minus_J
					     , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
614 615
  minus_length += minus_right - minus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
616 617
  minus_score=score_minus_V[0].first + score_minus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
618 619 620 621
  reversed = (minus_score > plus_score) ;

  if (!reversed)
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
622 623 624 625
      Vend = plus_left ;
      Jstart = plus_right ;
      best_V = tag_plus_V ;
      best_J = tag_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
626 627
      del_V = del_plus_V ;
      del_J = del_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
628 629
      score_V=score_plus_V;
      score_J=score_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
630 631 632
    }
  else
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
633 634 635 636
      Vend = minus_left ;
      Jstart = minus_right ;
      best_V = tag_minus_V ;
      best_J = tag_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
637 638
      del_V = del_minus_V ;
      del_J = del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
639 640
      score_V=score_minus_V;
      score_J=score_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
641 642
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
643
  segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) && 
644
    (Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
Mikaël Salson's avatar
Mikaël Salson committed
645 646 647
    
  if (!segmented)
    {
648
      because = DONT_KNOW;
Mathieu Giraud's avatar
Mathieu Giraud committed
649
      info = " @" + string_of_int (Vend + FIRST_POS) + "  @" + string_of_int(Jstart + FIRST_POS) ;
650
      
651
      if (Jstart - Vend < germline->delta_min)
652 653 654 655
        {
          because = UNSEG_BAD_DELTA_MIN  ;
        }

656
      if (Jstart - Vend > germline->delta_max)
657 658 659 660 661 662 663 664 665 666 667 668 669 670
        {
          because = UNSEG_BAD_DELTA_MAX  ;
        }
        
      if (Vend == (int) string::npos) 
        {
          because = UNSEG_TOO_FEW_V ;
        }
      
      if (Jstart == (int) string::npos)
        {
          because = UNSEG_TOO_FEW_J ;
        }
      
Mikaël Salson's avatar
Mikaël Salson committed
671 672 673
      return ;
    }
    
674 675
    because = reversed ? SEG_MINUS : SEG_PLUS ;
    
Mikaël Salson's avatar
Mikaël Salson committed
676
    //overlap VJ
Mathieu Giraud's avatar
Mathieu Giraud committed
677
    if(Jstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
678
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
679
      int overlap=Vend-Jstart+1;
Mikaël Salson's avatar
Mikaël Salson committed
680
      
Mathieu Giraud's avatar
Mathieu Giraud committed
681 682
      string seq_left = sequence.substr(0, Vend+1);
      string seq_right = sequence.substr(Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
683 684

      best_align(overlap, seq_left, seq_right, 
685
		 germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
686
      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
687
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
688 689 690
      del_V += b_l;

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
691
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
692
      del_J += b_r;
693
      if (Jstart>=(int) sequence.length())
Mathieu Giraud's avatar
Mathieu Giraud committed
694
	  Jstart=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
695 696 697
    }

    // string chevauchement = removeChevauchement();
Mikaël Salson's avatar
Mikaël Salson committed
698 699

    /// used only below, then recomputed in finishSegmentation() ;
Mathieu Giraud's avatar
Mathieu Giraud committed
700
    seg_N = revcomp(sequence, reversed).substr(Vend+1, Jstart-Vend-1); 
Mikaël Salson's avatar
Mikaël Salson committed
701

702
  code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
703 704 705 706
    " "+ string_of_int(del_V) + 
    "/" + seg_N + 
    // chevauchement +
    "/" + string_of_int(del_J) +
707
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
708

Mikaël Salson's avatar
Mikaël Salson committed
709
    stringstream code_s;
710
   code_s<< germline->rep_5.label(best_V) <<
Mikaël Salson's avatar
Mikaël Salson committed
711 712 713 714
    " -" << string_of_int(del_V) << "/" 
    << seg_N.size()
    // chevauchement +
    << "/-" << string_of_int(del_J)
715
    <<" " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
716 717
    code_short=code_s.str();
    
718 719
  code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
720 721

 
Mathieu Giraud's avatar
Mathieu Giraud committed
722
  info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
723 724 725 726
  finishSegmentation();
}


727
void FineSegmenter::FineSegmentD(Germline *germline){
Mikaël Salson's avatar
Mikaël Salson committed
728 729 730 731
  
  if (segmented){
    
    int end = (int) string::npos ;
Mathieu Giraud's avatar
Mathieu Giraud committed
732
    int tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
733 734 735 736
    int length = 0 ;
    int begin = 0;
    
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
Mathieu Giraud's avatar
Mathieu Giraud committed
737
    int l = Vend - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
738 739 740
    if (l<0) 
      l=0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
741
    int r = Jstart + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
742 743 744 745 746 747 748

    if (r > (int)getSequence().sequence.length()) 
      r = getSequence().sequence.length();
      
    string str = getSequence().sequence.substr(l, r-l);

    // Align
749
    end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
Mikaël Salson's avatar
Mikaël Salson committed
750 751
				&length, &score_D, segment_cost);
    
Mathieu Giraud's avatar
Mathieu Giraud committed
752
    best_D = tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
753
    
Mathieu Giraud's avatar
Mathieu Giraud committed
754 755
    Dstart = l + begin;
    Dend = l + end;
Mikaël Salson's avatar
Mikaël Salson committed
756 757
	
    string seq = getSequence().sequence;
758 759 760 761 762 763 764 765 766 767


    // recompute remaining length for D
    length = germline->rep_4.sequence(best_D).length() - del_D_right - del_D_left;

    if (length < MIN_D_LENGTH)
      return ;


    dSegmented=true;
Mikaël Salson's avatar
Mikaël Salson committed
768 769
    
    //overlap VD
Mathieu Giraud's avatar
Mathieu Giraud committed
770
    if(Dstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
771
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
772 773 774
      int overlap=Vend-Dstart+1;
      string seq_left = seq.substr(0, Vend+1);
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
Mikaël Salson's avatar
Mikaël Salson committed
775 776

      best_align(overlap, seq_left, seq_right, 
777
		 germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
778 779

      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
780
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
781 782 783
      del_V += b_l;

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
784
      Dstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
785
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
786
    seg_N1 = seq.substr(Vend+1, Dstart-Vend-1) ; // From Vend+1 to Dstart-1
Mikaël Salson's avatar
Mikaël Salson committed
787 788
    
    //overlap DJ
Mathieu Giraud's avatar
Mathieu Giraud committed
789
    if(Jstart-Dend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
790
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
791 792 793
      int overlap=Dend-Jstart+1;
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
      string seq_left = seq.substr(Jstart, seq.length()-Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
794 795

      best_align(overlap, seq_left, seq_right, 
796
		 germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
797 798

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
799
      Dend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
800 801

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
802
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
803 804 805
      del_J += b_r;

    }
Mathieu Giraud's avatar
Mathieu Giraud committed
806
    seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
807
    code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
808 809 810 811
    " "+ string_of_int(del_V) + 
    "/" + seg_N1 + 
    
    "/" + string_of_int(del_D_left) +
812
    " " + germline->rep_4.label(best_D) +
Mikaël Salson's avatar
Mikaël Salson committed
813 814 815 816
    " " + string_of_int(del_D_right) +
    
    "/" + seg_N2 +
    "/" + string_of_int(del_J) +
817
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
818

Mikaël Salson's avatar
Mikaël Salson committed
819
    stringstream code_s;
820
    code_s << germline->rep_5.label(best_V) 
Mikaël Salson's avatar
Mikaël Salson committed
821 822 823 824
    << " -" << string_of_int(del_V) << "/" 
    << seg_N1.size()
    
    << "/-" << string_of_int(del_D_left) 
825
    << " " << germline->rep_4.label(best_D) 
Mikaël Salson's avatar
Mikaël Salson committed
826 827 828 829
    << " -" << string_of_int(del_D_right) << "/"
    
    << seg_N2.size()
    << "/-" << string_of_int(del_J) 
830
    << " " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
831 832 833
    code_short=code_s.str();
    
    
834 835 836
    code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_4.label(best_D) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
837 838 839 840
    
    finishSegmentationD();
  }
}
Mikaël Salson's avatar
Mikaël Salson committed
841

Marc Duez's avatar
Marc Duez committed
842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860
void FineSegmenter::findCDR3(){
    string str = getSequence().sequence;
    
    list<string> codon_start;
    codon_start.push_back("TGT");
    codon_start.push_back("TGC");
    
    list<string> codon_end;
    codon_end.push_back("TTT");
    codon_end.push_back("TTC");
    codon_end.push_back("TGG");
    
    list<int> p_start;
    list<int> p_end;

    size_t loc;
    std::list<string>::const_iterator it;
    for (it = codon_start.begin(); it != codon_start.end(); ++it) {//filter 1 : start codon must be in V
        loc = 0;
861
        while ( loc != string::npos && loc < (size_t)Vend){
Marc Duez's avatar
Marc Duez committed
862
            loc = str.find(*it, loc+3);
863
            if (loc != string::npos && loc < (size_t)Vend) {
Marc Duez's avatar
Marc Duez committed
864 865 866 867 868 869 870 871 872 873 874 875 876 877 878
                p_start.push_front(loc);
            }
        }
    }

    for (it = codon_end.begin(); it != codon_end.end(); ++it) {//filter 2 : end codon must be in J
        loc = Jstart;
        while ( loc != string::npos){
            loc = str.find(*it, loc+3);
            if (loc != string::npos) {
                p_end.push_back(loc);
            }
        }
    }

879 880
    CDR3start = -1;
    CDR3end = -1;
Marc Duez's avatar
Marc Duez committed
881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899
    
    std::list<int>::const_iterator it1;
    for (it1 = p_start.begin(); it1 != p_start.end(); ++it1) {
        
        std::list<int>::const_iterator it2;
        for (it2 = p_end.begin(); it2 != p_end.end(); ++it2) {
            
            if ( (*it2-*it1)%3 == 0){       //filter 3 : start/stop codon must be seprated by a multiple of 3
                
                if ( fabs((*it2-*it1)-36 ) < fabs((CDR3end-CDR3start)-36) ){ //filter 4 : cdr3 length must be close to 12 AA
                    CDR3start = *it1;
                    CDR3end = *it2;
                }
            }
        }
    }
    
}

900
void FineSegmenter::toJsonList(JsonList *seg){
Tatiana Rocher's avatar
Tatiana Rocher committed
901

Mathieu Giraud's avatar
Mathieu Giraud committed
902
  if (isSegmented()) {
903 904 905
    seg->add("5", segmented_germline->rep_5.label(best_V));
    seg->add("5start", 0);
    seg->add("5end", Vend);
Mathieu Giraud's avatar
Mathieu Giraud committed
906 907
    
    if (score_D.size()>0){
908 909 910
      seg->add("4", segmented_germline->rep_4.label(best_D));
      seg->add("4start", Dstart);
      seg->add("4end", Dend);
Mikaël Salson's avatar
Mikaël Salson committed
911
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
912
    
913 914
    seg->add("3", segmented_germline->rep_3.label(best_J));
    seg->add("3start", Jstart);
915 916 917

    if (CDR3start >= 0)
      {
Marc Duez's avatar
Marc Duez committed
918 919 920 921
    JsonList *json_cdr;
    json_cdr=new JsonList();
    json_cdr->add("start", CDR3start);
    json_cdr->add("stop", CDR3end);
922
    seg->add("cdr3", *json_cdr);
923
      }
Tatiana Rocher's avatar
Tatiana Rocher committed
924

925 926
  }
}
927

928 929
void KmerSegmenter::toJsonList(JsonList *seg)
{
930 931
    int sequenceSize = sequence.size();

932
    if (evalue > NO_LIMIT_VALUE)
933
      seg->add("_evalue", scientific_string_of_double(evalue));
934

935 936 937 938
    JsonList *json_affectValues;
    json_affectValues=new JsonList();
    json_affectValues->add("start", 0);
    json_affectValues->add("stop", sequenceSize); 
939 940
    json_affectValues->add("seq", getKmerAffectAnalyser()->toStringValues());
    seg->add("affectValues", *json_affectValues);
Tatiana Rocher's avatar
Tatiana Rocher committed
941 942
      

943
    JsonList *json_affectSigns;
944 945 946
    json_affectSigns=new JsonList();
    json_affectSigns->add("start", 0);
    json_affectSigns->add("stop", sequenceSize); 
947 948
    json_affectSigns->add("seq", getKmerAffectAnalyser()->toStringSigns());
    seg->add("affectSigns", *json_affectSigns);
949 950 951

    delete json_affectValues;
    delete json_affectSigns;
Mikaël Salson's avatar
Mikaël Salson committed
952 953 954
}