segment.cpp 19.8 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2
3
4
5
6
7
8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
Mikaël Salson's avatar
Mikaël Salson committed
23
#include <algorithm>    // std::sort
Mikaël Salson's avatar
Mikaël Salson committed
24
25
26
27
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "affectanalyser.h"
Mikaël Salson's avatar
Mikaël Salson committed
28
#include <sstream>
Mikaël Salson's avatar
Mikaël Salson committed
29

Mathieu Giraud's avatar
Mathieu Giraud committed
30
Segmenter::~Segmenter() {}
Mikaël Salson's avatar
Mikaël Salson committed
31

Mathieu Giraud's avatar
Mathieu Giraud committed
32
Sequence Segmenter::getSequence() const {
Mikaël Salson's avatar
Mikaël Salson committed
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
  Sequence s ;
  s.label_full = info ;
  s.label = label + " " + (reversed ? "-" : "+");
  s.sequence = revcomp(sequence, reversed);

  return s ;
}

string Segmenter::getJunction(int l) const {
  assert(isSegmented());

  int start = (getLeft() + getRight())/2 - l/2;
  
  if (start < 0 or start + l > (int)sequence.size())  // TODO: +l ou +l-1 ?
    return "" ;

  return getSequence().sequence.substr(start, l);
}

int Segmenter::getLeft() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
53
  return Vend;
Mikaël Salson's avatar
Mikaël Salson committed
54
55
56
}
  
int Segmenter::getRight() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
57
  return Jstart;
Mikaël Salson's avatar
Mikaël Salson committed
58
59
60
}

int Segmenter::getLeftD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
61
  return Dstart;
Mikaël Salson's avatar
Mikaël Salson committed
62
63
64
}
  
int Segmenter::getRightD() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
65
  return Dend;
Mikaël Salson's avatar
Mikaël Salson committed
66
67
68
69
70
71
72
73
74
75
76
77
78
79
}

bool Segmenter::isReverse() const {
  return reversed;
}

bool Segmenter::isSegmented() const {
  return segmented;
}

bool Segmenter::isDSegmented() const {
  return dSegmented;
}

80
81
82
83
bool KmerSegmenter::isDetected() const {
  return detected;
}

Mikaël Salson's avatar
Mikaël Salson committed
84
85
86
87
88
89
90
91
// Chevauchement

string Segmenter::removeChevauchement()
{
  assert(isSegmented());
  
  string chevauchement = "" ;

Mathieu Giraud's avatar
Mathieu Giraud committed
92
  if (Vend >= Jstart)
Mikaël Salson's avatar
Mikaël Salson committed
93
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
94
      int middle = (Vend + Jstart) / 2 ;
95
      chevauchement = " !ov " + string_of_int (Vend - Jstart + 1);
Mathieu Giraud's avatar
Mathieu Giraud committed
96
97
      Vend = middle ;
      Jstart = middle+1 ;
Mikaël Salson's avatar
Mikaël Salson committed
98
99
100
101
102
103
104
105
106
107
108
109
110
111
    }

  return chevauchement ;
}

// Prettyprint


bool Segmenter::finishSegmentation() 
{
  assert(isSegmented());
  
  string seq = getSequence().sequence;
    
Mathieu Giraud's avatar
Mathieu Giraud committed
112
113
114
115
116
  seg_V = seq.substr(0, Vend+1) ;
  seg_N = seq.substr(Vend+1, Jstart-Vend-1) ;  // Twice computed for FineSegmenter, but only once in KmerSegmenter !
  seg_J = seq.substr(Jstart) ;
  Dstart=0;
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
117
118
119
120
121
122
123
124
125
126
127
128
129

  info = "VJ \t" + string_of_int(FIRST_POS) + " " + info + " " + string_of_int(seq.size() - 1 + FIRST_POS) ;
  info += "\t" + code ;

  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

bool Segmenter::finishSegmentationD() 
{
  string seq = getSequence().sequence;

Mathieu Giraud's avatar
Mathieu Giraud committed
130
131
  seg_V = seq.substr(0, Vend+1) ; // From pos. 0 to Vend
  seg_J = seq.substr(Jstart) ;
Mikaël Salson's avatar
Mikaël Salson committed
132
  
Mathieu Giraud's avatar
Mathieu Giraud committed
133
  seg_D  = seq.substr(Dstart, Dend-Dstart+1) ; // From Dstart to Dend
Mikaël Salson's avatar
Mikaël Salson committed
134
  
Mathieu Giraud's avatar
Mathieu Giraud committed
135
136
137
138
  info = "VDJ \t0 " + string_of_int(Vend) +
		" " + string_of_int(Dstart) + 
		" " + string_of_int(Dend) +
		" " + string_of_int(Jstart) +
Mikaël Salson's avatar
Mikaël Salson committed
139
140
141
142
143
144
145
146
147
148
149
150
151
		" " + string_of_int(seq.size()-1+FIRST_POS) ;
		
  info += "\t" + code ;
  
  info = (reversed ? "- " : "+ ") + info ;

  return true ;
}

ostream &operator<<(ostream &out, const Segmenter &s)
{
  out << ">" << s.label << " " ;
  out << (s.segmented ? "" : "! ") << s.info ;
152
  out << " " << s.info_extra ;
153
  out << " " << s.segmented_germline->code ;
Mikaël Salson's avatar
Mikaël Salson committed
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
  out << endl ;

  if (s.segmented)
    {
      out << s.seg_V << endl ;
      out << s.seg_N << endl ;
      out << s.seg_J << endl ;
    }
  else
    {
      out << s.sequence << endl ;
    }

  return out ;
}


// KmerSegmenter (Cheap)

173
174
175
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }

KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
Mikaël Salson's avatar
Mikaël Salson committed
176
177
178
179
{
  label = seq.label ;
  sequence = seq.sequence ;
  info = "" ;
180
  info_extra = "seed";
Mikaël Salson's avatar
Mikaël Salson committed
181
  segmented = false;
182
  segmented_germline = germline ;
183
  reversed = false;
Mathieu Giraud's avatar
Mathieu Giraud committed
184
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
185
  
186
  int s = (size_t)germline->index->getS() ;
Mikaël Salson's avatar
Mikaël Salson committed
187
  int length = sequence.length() ;
Mikaël Salson's avatar
Mikaël Salson committed
188

189
  if (length < s) 
Mikaël Salson's avatar
Mikaël Salson committed
190
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
191
192
      because = UNSEG_TOO_SHORT;
      kaa = NULL;
193
      return ;
Mikaël Salson's avatar
Mikaël Salson committed
194
195
    }
 
196
  kaa = new KmerAffectAnalyser(*(germline->index), sequence);
Mikaël Salson's avatar
Mikaël Salson committed
197
198
  
  // Check strand consistency among the affectations.
Mikaël Salson's avatar
Mikaël Salson committed
199
200
201
202
203
204
205
206
207
  int strand;
  int nb_strand[2] = {0,0};     // In cell 0 we'll put the number of negative
                                // strand, while in cell 1 we'll put the
                                // positives
  for (int i = 0; i < kaa->count(); i++) { 
    KmerAffect it = kaa->getAffectation(i);
    if (! it.isAmbiguous() && ! it.isUnknown()) {
      strand = affect_strand(it.affect);
      nb_strand[(strand + 1) / 2] ++; // (strand+1) / 2 → 0 if strand == -1; 1 if strand == 1
Mikaël Salson's avatar
Mikaël Salson committed
208
209
210
    }
  }

Mikaël Salson's avatar
Mikaël Salson committed
211
212
213
214
215
216
217
218
219
220
221
222
223
  // Test on which strand we are.
  if (nb_strand[0] == 0 && nb_strand[1] == 0) {
    strand = 0;                 // No info
  } else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
    strand = -1;
  } else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
    strand = 1;
  } else {
    // Ambiguous information: we have positive and negative strands
    // and there is not enough difference to put them aparat.
    strand = 2;
  }

224
  detected = false ;
225
  computeSegmentation(strand, germline);
Mathieu Giraud's avatar
Mathieu Giraud committed
226
227
228
229
230
231
232
233

  if (segmented)
    {
      // Yes, it is segmented
      reversed = (strand == -1); 
      because = reversed ? SEG_MINUS : SEG_PLUS ;

      info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS)  ;
234
235
      // removeChevauchement is called once info was already computed: it is only to output info_extra
      info_extra += removeChevauchement();
Mathieu Giraud's avatar
Mathieu Giraud committed
236
      finishSegmentation();
237
      system = germline->code;
238
      return ;
Mathieu Giraud's avatar
Mathieu Giraud committed
239
    } 
240

241
 
Mathieu Giraud's avatar
Mathieu Giraud committed
242
}
Mikaël Salson's avatar
Mikaël Salson committed
243

Mathieu Giraud's avatar
Mathieu Giraud committed
244
KmerSegmenter::~KmerSegmenter() {
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
  // if (kaa)
  //   delete kaa;
}

KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline)
{
  // Iterate over the germlines
  for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
    {
      Germline *germline = *it ;

      KmerSegmenter kseg(seq, germline);
      the_kseg = kseg;
      
      if (kseg.isSegmented())
        {
          // Yes, it is segmented
          germline->stats.insert(seq.sequence.length());
          return ;
        }
        
      // If the germline was detected, do not test other germlines
      if (kseg.isDetected())
        return ;
      
    } // end for (Germlines)  
}

KmerMultiSegmenter::~KmerMultiSegmenter() {
Mathieu Giraud's avatar
Mathieu Giraud committed
274
275
}

276
void KmerSegmenter::computeSegmentation(int strand, Germline* germline) {
277
278
279
  // Try to segment, computing 'Vend' and 'Jstart', and 'segmented'
  // If not segmented, put the cause of unsegmentation in 'because'

Mikaël Salson's avatar
Mikaël Salson committed
280
  segmented = true ;
Mathieu Giraud's avatar
Mathieu Giraud committed
281
  because = 0 ; // Cause of unsegmentation
Mikaël Salson's avatar
Mikaël Salson committed
282
283
284
285

  // Zero information
  if (strand == 0)
    {
Mikaël Salson's avatar
Mikaël Salson committed
286
      because = UNSEG_TOO_FEW_ZERO ;
Mathieu Giraud's avatar
Mathieu Giraud committed
287
288
    } 
  else if (strand == 2) // Ambiguous
Mikaël Salson's avatar
Mikaël Salson committed
289
    {
Mikaël Salson's avatar
Mikaël Salson committed
290
      because = UNSEG_STRAND_NOT_CONSISTENT ;
Mathieu Giraud's avatar
Mathieu Giraud committed
291
    } 
292
  else
Mikaël Salson's avatar
Mikaël Salson committed
293
    {
294
295
      affect_infos max;
      if (strand == 1)
296
297
        max = kaa->getMaximum(KmerAffect(germline->affect_5, 1), 
			      KmerAffect(germline->affect_3, 1));
298
      else
299
300
        max = kaa->getMaximum(KmerAffect(germline->affect_3, -1), 
			      KmerAffect(germline->affect_5, -1));
301

302
303
304
305
306

      // We labeled it detected if there were both enough affect_5 and enough affect_3
      detected = (max.nb_before_left + max.nb_before_right >= DETECT_THRESHOLD)
        && (max.nb_after_left + max.nb_after_right >= DETECT_THRESHOLD);
      
307
      if (! max.max_found) {
308
309
        if ((strand == 1 && max.nb_before_left == 0)
            || (strand == -1 && max.nb_after_right == 0)) 
310
          because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_V ;
311
312
        else if ((strand == 1 && max.nb_after_right == 0)
                 || (strand == -1 && max.nb_before_left == 0))
Mikaël Salson's avatar
Mikaël Salson committed
313
	{
314
	  because = detected ? UNSEG_AMBIGUOUS : UNSEG_TOO_FEW_J ;
315
	} else 
316
317
318
319
320
321
322
323
324
325
          because = UNSEG_AMBIGUOUS; 
      } else {
        Vend = max.first_pos_max;
        Jstart = max.last_pos_max + 1;
        if (strand == -1) {
          int tmp = sequence.size() - Vend - 1;
          Vend = sequence.size() - Jstart - 1;
          Jstart = tmp;
        }
      }
Mikaël Salson's avatar
Mikaël Salson committed
326
327
    } 
  
Mathieu Giraud's avatar
Mathieu Giraud committed
328
  if (! because)
Mikaël Salson's avatar
Mikaël Salson committed
329
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
330
      // Now we check the delta between Vend and right
Mikaël Salson's avatar
Mikaël Salson committed
331
   
332
      if (Jstart - Vend < germline->delta_min)
Mikaël Salson's avatar
Mikaël Salson committed
333
334
335
	{
	  because = UNSEG_BAD_DELTA_MIN ;
	}
Mikaël Salson's avatar
Mikaël Salson committed
336

337
      if (Jstart - Vend > germline->delta_max)
Mikaël Salson's avatar
Mikaël Salson committed
338
339
340
	{
	  because = UNSEG_BAD_DELTA_MAX ;
	}
Mathieu Giraud's avatar
Mathieu Giraud committed
341
342
343
344
    } 
  if (because)
    segmented = false;
}
Mikaël Salson's avatar
Mikaël Salson committed
345

346
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
Mathieu Giraud's avatar
Mathieu Giraud committed
347
348
  return kaa;
}
Mikaël Salson's avatar
Mikaël Salson committed
349

Mathieu Giraud's avatar
Mathieu Giraud committed
350
351
int KmerSegmenter::getSegmentationStatus() const {
  return because;
Mikaël Salson's avatar
Mikaël Salson committed
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
}

// FineSegmenter

void best_align(int overlap, string seq_left, string seq_right,
		string ref_left ,string ref_right, int *b_r, int *b_l, Cost segment_cost)
{
      int score_r[overlap+1];
      int score_l[overlap+1];
      
      //LEFT
      ref_left=string(ref_left.rbegin(), ref_left.rend());
      seq_left=string(seq_left.rbegin(), seq_left.rend()); 
      
      DynProg dp1 = DynProg(seq_left, ref_left,
			   DynProg::Local, segment_cost);
      score_l[0] = dp1.compute();
      dp1.backtrack();
      
      //RIGHT
      DynProg dp = DynProg(seq_right, ref_right,
			   DynProg::Local, segment_cost);
      score_r[0] = dp.compute();
      dp.backtrack();    
      
      for(int i=1; i<=overlap; i++){
	if(dp.best_i-i >0)
	score_r[i] = dp.S[dp.best_i-i][dp.best_j];
	else
	score_r[i] =0;
	if(dp1.best_i-i >0)
	score_l[i] = dp1.S[dp1.best_i-i][dp1.best_j];	
	else
	score_l[i] =0;
      }
      int score=-1000000;
      int best_r=0;
      int best_l=0;

      for (int i=0; i<=overlap; i++){
	for (int j=overlap-i; j<=overlap; j++){
	  if ( ((score_r[i]+score_l[j]) == score) && (i+j < best_r+best_l )){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	  if ( (score_r[i]+score_l[j]) > score){
	    best_r=i;
	    best_l=j;
	    score=(score_r[i]+score_l[j]) ;
	  }
	}
      }
      *b_r=best_r;
      *b_l=best_l;
}

Mikaël Salson's avatar
Mikaël Salson committed
409
410
411
412
413
bool comp_pair (pair<int,int> i,pair<int,int> j)
{
  return ( i.first > j.first);
}

Mathieu Giraud's avatar
Mathieu Giraud committed
414
int align_against_collection(string &read, Fasta &rep, bool reverse_both, bool local, int *tag, 
Mikaël Salson's avatar
Mikaël Salson committed
415
			     int *del, int *del2, int *begin, int *length, vector<pair<int, int> > *score
Mikaël Salson's avatar
Mikaël Salson committed
416
417
418
419
420
421
422
423
424
425
			    , Cost segment_cost)
{
  
  int best_score = MINUS_INF ;
  int best_r = MINUS_INF ;
  int best_best_i = (int) string::npos ;
  int best_best_j = (int) string::npos ;
  int best_first_i = (int) string::npos ;
  int best_first_j = (int) string::npos ;
  string best_label = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
426
  vector<pair<int, int> > score_r;
Mikaël Salson's avatar
Mikaël Salson committed
427
428
429
430
431
432
433
434
435
436
437

  DynProg::DynProgMode dpMode = DynProg::LocalEndWithSomeDeletions;
  if (local==true) dpMode = DynProg::Local;
  
  for (int r = 0 ; r < rep.size() ; r++)
    {
      DynProg dp = DynProg(read, rep.sequence(r),
			   dpMode, // DynProg::SemiGlobalTrans, 
			   segment_cost, // DNA
			   reverse_both, reverse_both);
      int score = dp.compute();
Mikaël Salson's avatar
Mikaël Salson committed
438
      
Mikaël Salson's avatar
Mikaël Salson committed
439
440
441
442
443
444
445
446
447
448
449
450
451
452
      if (local==true){ 
	dp.backtrack();
      }
      
      if (score > best_score)
	{
	  best_score = score ;
	  best_best_i = dp.best_i ;
	  best_best_j = dp.best_j ;
	  best_first_i = dp.first_i ;
	  best_first_j = dp.first_j ;
	  best_r = r ;
	  best_label = rep.label(r) ;
	}
Mikaël Salson's avatar
Mikaël Salson committed
453
454
	
	score_r.push_back(make_pair(score, r));
Mathieu Giraud's avatar
Mathieu Giraud committed
455
456
457
458
459
460

	// #define DEBUG_SEGMENT      

#ifdef DEBUG_SEGMENT	
	cout << rep.label(r) << " " << score << " " << dp.best_i << endl ;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
461
462

    }
Mikaël Salson's avatar
Mikaël Salson committed
463
    sort(score_r.begin(),score_r.end(),comp_pair);
Mikaël Salson's avatar
Mikaël Salson committed
464
465
466
467

  *del = reverse_both ? best_best_j : rep.sequence(best_r).size() - best_best_j - 1;
  *del2 = best_first_j;
  *begin = best_first_i;
Mathieu Giraud's avatar
Mathieu Giraud committed
468
  *tag = best_r ; 
Mikaël Salson's avatar
Mikaël Salson committed
469
470
  
  *length -= *del ;
Mikaël Salson's avatar
Mikaël Salson committed
471
472
  
  *score=score_r;
Mathieu Giraud's avatar
Mathieu Giraud committed
473
474
475
476
477
478

#ifdef DEBUG_SEGMENT	
  cout << "best: " << best_labels << " " << best_score ;
  cout << "del/del2/begin:" << (*del) << "/" << (*del2) << "/" << (*begin) << endl;
  cout << endl;
#endif
Mikaël Salson's avatar
Mikaël Salson committed
479
480
481
482
483
484
485
486
487
  
  return best_best_i ;
}

string format_del(int deletions)
{
  return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}

488
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
Mikaël Salson's avatar
Mikaël Salson committed
489
{
490
  segmented_germline = germline ;
491
  info_extra = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
492
493
  label = seq.label ;
  sequence = seq.sequence ;
Mathieu Giraud's avatar
Mathieu Giraud committed
494
  Dend=0;
Mikaël Salson's avatar
Mikaël Salson committed
495
496
497
498
499
500
501
  segment_cost=segment_c;

  // TODO: factoriser tout cela, peut-etre en lancant deux segmenteurs, un +, un -, puis un qui chapote
  
  // Strand +
  
  int plus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
502
  int tag_plus_V, tag_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
503
504
505
506
  int plus_length = 0 ;
  int del_plus_V, del_plus_J ;
  int del2=0;
  int beg=0;
Mikaël Salson's avatar
Mikaël Salson committed
507
508
509
510
  
  vector<pair<int, int> > score_plus_V;
  vector<pair<int, int> > score_plus_J;
  
511
  int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg, 
Mikaël Salson's avatar
Mikaël Salson committed
512
513
					   &plus_length, &score_plus_V
					   , segment_cost);
514
  int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
515
516
					    &plus_length, &score_plus_J
					    , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
517
518
  plus_length += plus_right - plus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
519
520
  plus_score=score_plus_V[0].first + score_plus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
521
522
523
  // Strand -
  string rc = revcomp(sequence) ;
  int minus_score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
524
  int tag_minus_V, tag_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
525
526
  int minus_length = 0 ;
  int del_minus_V, del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
527
528
529
530
  
  vector<pair<int, int> > score_minus_V;
  vector<pair<int, int> > score_minus_J;
  
531
  int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
532
533
					    &minus_length, &score_minus_V
					    ,  segment_cost);
534
  int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
Mikaël Salson's avatar
Mikaël Salson committed
535
536
					     &minus_length, &score_minus_J
					     , segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
537
538
  minus_length += minus_right - minus_left ;

Mikaël Salson's avatar
Mikaël Salson committed
539
540
  minus_score=score_minus_V[0].first + score_minus_J[0].first ;
  
Mikaël Salson's avatar
Mikaël Salson committed
541
542
543
544
  reversed = (minus_score > plus_score) ;

  if (!reversed)
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
545
546
547
548
      Vend = plus_left ;
      Jstart = plus_right ;
      best_V = tag_plus_V ;
      best_J = tag_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
549
550
      del_V = del_plus_V ;
      del_J = del_plus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
551
552
      score_V=score_plus_V;
      score_J=score_plus_J;
Mikaël Salson's avatar
Mikaël Salson committed
553
554
555
    }
  else
    {
Mathieu Giraud's avatar
Mathieu Giraud committed
556
557
558
559
      Vend = minus_left ;
      Jstart = minus_right ;
      best_V = tag_minus_V ;
      best_J = tag_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
560
561
      del_V = del_minus_V ;
      del_J = del_minus_J ;
Mikaël Salson's avatar
Mikaël Salson committed
562
563
      score_V=score_minus_V;
      score_J=score_minus_J;
Mikaël Salson's avatar
Mikaël Salson committed
564
565
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
566
  segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) && 
567
    (Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
Mikaël Salson's avatar
Mikaël Salson committed
568
569
570
571
572
    
  dSegmented=false;

  if (!segmented)
    {
573
      because = DONT_KNOW;
Mathieu Giraud's avatar
Mathieu Giraud committed
574
      info = " @" + string_of_int (Vend + FIRST_POS) + "  @" + string_of_int(Jstart + FIRST_POS) ;
575
      
576
      if (Jstart - Vend < germline->delta_min)
577
578
579
580
        {
          because = UNSEG_BAD_DELTA_MIN  ;
        }

581
      if (Jstart - Vend > germline->delta_max)
582
583
584
585
586
587
588
589
590
591
592
593
594
595
        {
          because = UNSEG_BAD_DELTA_MAX  ;
        }
        
      if (Vend == (int) string::npos) 
        {
          because = UNSEG_TOO_FEW_V ;
        }
      
      if (Jstart == (int) string::npos)
        {
          because = UNSEG_TOO_FEW_J ;
        }
      
Mikaël Salson's avatar
Mikaël Salson committed
596
597
598
      return ;
    }
    
599
600
    because = reversed ? SEG_MINUS : SEG_PLUS ;
    
Mikaël Salson's avatar
Mikaël Salson committed
601
    //overlap VJ
Mathieu Giraud's avatar
Mathieu Giraud committed
602
    if(Jstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
603
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
604
      int overlap=Vend-Jstart+1;
Mikaël Salson's avatar
Mikaël Salson committed
605
      
Mathieu Giraud's avatar
Mathieu Giraud committed
606
607
      string seq_left = sequence.substr(0, Vend+1);
      string seq_right = sequence.substr(Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
608
609

      best_align(overlap, seq_left, seq_right, 
610
		 germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
611
      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
612
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
613
614
615
      del_V += b_l;

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
616
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
617
      del_J += b_r;
618
      if (Jstart>=(int) sequence.length())
Mathieu Giraud's avatar
Mathieu Giraud committed
619
	  Jstart=sequence.length()-1;
Mikaël Salson's avatar
Mikaël Salson committed
620
621
622
    }

    // string chevauchement = removeChevauchement();
Mikaël Salson's avatar
Mikaël Salson committed
623
624

    /// used only below, then recomputed in finishSegmentation() ;
Mathieu Giraud's avatar
Mathieu Giraud committed
625
    seg_N = revcomp(sequence, reversed).substr(Vend+1, Jstart-Vend-1); 
Mikaël Salson's avatar
Mikaël Salson committed
626

627
  code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
628
629
630
631
    " "+ string_of_int(del_V) + 
    "/" + seg_N + 
    // chevauchement +
    "/" + string_of_int(del_J) +
632
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
633

Mikaël Salson's avatar
Mikaël Salson committed
634
    stringstream code_s;
635
   code_s<< germline->rep_5.label(best_V) <<
Mikaël Salson's avatar
Mikaël Salson committed
636
637
638
639
    " -" << string_of_int(del_V) << "/" 
    << seg_N.size()
    // chevauchement +
    << "/-" << string_of_int(del_J)
640
    <<" " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
641
642
    code_short=code_s.str();
    
643
644
  code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
645
646

 
Mathieu Giraud's avatar
Mathieu Giraud committed
647
  info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
Mikaël Salson's avatar
Mikaël Salson committed
648
649
650
651
652

  finishSegmentation();
}


653
void FineSegmenter::FineSegmentD(Germline *germline){
Mikaël Salson's avatar
Mikaël Salson committed
654
655
656
657
  
  if (segmented){
    
    int end = (int) string::npos ;
Mathieu Giraud's avatar
Mathieu Giraud committed
658
    int tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
659
660
661
662
    int length = 0 ;
    int begin = 0;
    
    // Create a zone where to look for D, adding at most EXTEND_D_ZONE nucleotides at each side
Mathieu Giraud's avatar
Mathieu Giraud committed
663
    int l = Vend - EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
664
665
666
    if (l<0) 
      l=0 ;

Mathieu Giraud's avatar
Mathieu Giraud committed
667
    int r = Jstart + EXTEND_D_ZONE;
Mikaël Salson's avatar
Mikaël Salson committed
668
669
670
671
672
673
674

    if (r > (int)getSequence().sequence.length()) 
      r = getSequence().sequence.length();
      
    string str = getSequence().sequence.substr(l, r-l);

    // Align
675
    end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
Mikaël Salson's avatar
Mikaël Salson committed
676
677
				&length, &score_D, segment_cost);
    
Mathieu Giraud's avatar
Mathieu Giraud committed
678
    best_D = tag_D;
Mikaël Salson's avatar
Mikaël Salson committed
679
    
Mathieu Giraud's avatar
Mathieu Giraud committed
680
681
    Dstart = l + begin;
    Dend = l + end;
Mikaël Salson's avatar
Mikaël Salson committed
682
683
684
685
686
687
	
    string seq = getSequence().sequence;
    
    if (length>0) dSegmented=true;
    
    //overlap VD
Mathieu Giraud's avatar
Mathieu Giraud committed
688
    if(Dstart-Vend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
689
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
690
691
692
      int overlap=Vend-Dstart+1;
      string seq_left = seq.substr(0, Vend+1);
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
Mikaël Salson's avatar
Mikaël Salson committed
693
694

      best_align(overlap, seq_left, seq_right, 
695
		 germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
696
697

      // Trim V
Mathieu Giraud's avatar
Mathieu Giraud committed
698
      Vend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
699
700
701
      del_V += b_l;

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
702
      Dstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
703
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
704
    seg_N1 = seq.substr(Vend+1, Dstart-Vend-1) ; // From Vend+1 to Dstart-1
Mikaël Salson's avatar
Mikaël Salson committed
705
706
    
    //overlap DJ
Mathieu Giraud's avatar
Mathieu Giraud committed
707
    if(Jstart-Dend <=0){
Mikaël Salson's avatar
Mikaël Salson committed
708
      int b_r, b_l;
Mathieu Giraud's avatar
Mathieu Giraud committed
709
710
711
      int overlap=Dend-Jstart+1;
      string seq_right = seq.substr(Dstart, Dend-Dstart+1);
      string seq_left = seq.substr(Jstart, seq.length()-Jstart);
Mikaël Salson's avatar
Mikaël Salson committed
712
713

      best_align(overlap, seq_left, seq_right, 
714
		 germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
Mikaël Salson's avatar
Mikaël Salson committed
715
716

      // Trim D
Mathieu Giraud's avatar
Mathieu Giraud committed
717
      Dend -= b_l;
Mikaël Salson's avatar
Mikaël Salson committed
718
719

      // Trim J
Mathieu Giraud's avatar
Mathieu Giraud committed
720
      Jstart += b_r;
Mikaël Salson's avatar
Mikaël Salson committed
721
722
723
      del_J += b_r;

    }
Mathieu Giraud's avatar
Mathieu Giraud committed
724
    seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
725
    code = germline->rep_5.label(best_V) +
Mikaël Salson's avatar
Mikaël Salson committed
726
727
728
729
    " "+ string_of_int(del_V) + 
    "/" + seg_N1 + 
    
    "/" + string_of_int(del_D_left) +
730
    " " + germline->rep_4.label(best_D) +
Mikaël Salson's avatar
Mikaël Salson committed
731
732
733
734
    " " + string_of_int(del_D_right) +
    
    "/" + seg_N2 +
    "/" + string_of_int(del_J) +
735
    " " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
736

Mikaël Salson's avatar
Mikaël Salson committed
737
    stringstream code_s;
738
    code_s << germline->rep_5.label(best_V) 
Mikaël Salson's avatar
Mikaël Salson committed
739
740
741
742
    << " -" << string_of_int(del_V) << "/" 
    << seg_N1.size()
    
    << "/-" << string_of_int(del_D_left) 
743
    << " " << germline->rep_4.label(best_D) 
Mikaël Salson's avatar
Mikaël Salson committed
744
745
746
747
    << " -" << string_of_int(del_D_right) << "/"
    
    << seg_N2.size()
    << "/-" << string_of_int(del_J) 
748
    << " " << germline->rep_3.label(best_J);
Mikaël Salson's avatar
Mikaël Salson committed
749
750
751
    code_short=code_s.str();
    
    
752
753
754
    code_light = germline->rep_5.label(best_V) +
    "/ " + germline->rep_4.label(best_D) +
    "/ " + germline->rep_3.label(best_J); 
Mikaël Salson's avatar
Mikaël Salson committed
755
756
757
758
    
    finishSegmentationD();
  }
}
Mikaël Salson's avatar
Mikaël Salson committed
759

760
JsonList FineSegmenter::toJsonList(Germline *germline){
Mathieu Giraud's avatar
Mathieu Giraud committed
761
  JsonList result;
Mikaël Salson's avatar
Mikaël Salson committed
762
  
Mathieu Giraud's avatar
Mathieu Giraud committed
763
764
765
766
  result.add("sequence", revcomp(sequence, reversed) );
  if (isSegmented()) {
    result.add("name", code_short);
    
767
    JsonList seg;
Marc Duez's avatar
Marc Duez committed
768
    seg.add("5", germline->rep_5.label(best_V));
769
770
    seg.add("5start", 0);
    seg.add("5end", Vend);
Mathieu Giraud's avatar
Mathieu Giraud committed
771
772
    
    if (score_D.size()>0){
Marc Duez's avatar
Marc Duez committed
773
      seg.add("4", germline->rep_4.label(best_D));
Marc Duez's avatar
Marc Duez committed
774
775
      seg.add("4start", Dstart);
      seg.add("4end", Dend);      
Mikaël Salson's avatar
Mikaël Salson committed
776
    }
Mathieu Giraud's avatar
Mathieu Giraud committed
777
    
Marc Duez's avatar
Marc Duez committed
778
    seg.add("3", germline->rep_3.label(best_J));
779
780
781
    seg.add("3start", Jstart);
    
    result.add("seg", seg);
Mikaël Salson's avatar
Mikaël Salson committed
782
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
783
  return result;
Mikaël Salson's avatar
Mikaël Salson committed
784
785
786
}