dynprog.cpp 14.5 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include "dynprog.h"
#include "tools.h"
26
#include "segment.h"
Mikaël Salson's avatar
Mikaël Salson committed
27 28 29 30
#include <cassert>
#include <list>
#include <cstdlib>
#include <string>
31
#include <cmath>
Mikaël Salson's avatar
Mikaël Salson committed
32

33 34 35 36 37 38 39 40
#define SUBST '|'
#define MISMATCH '.'
#define INSER 'i'
#define DELET 'd'
#define HOMO2X 'h'
#define HOMO2Y 'h'
#define FIN ' '
#define BEGIN 'B'
Mikaël Salson's avatar
Mikaël Salson committed
41
#define BACKSIZE 120
Mikaël Salson's avatar
Mikaël Salson committed
42 43 44 45 46 47 48 49 50 51 52 53

using namespace std;


Cost::Cost(int match, int mismatch, int indel, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = indel ;
  this -> deletion = indel ;
  this -> deletion_end = del_end ;
  this -> homopolymer = (homopolymer == MINUS_INF ? indel: homopolymer);
54

55 56 57
  this -> open_insertion = this -> open_deletion = MINUS_INF ;
  this -> extend_insertion = this -> extend_deletion = MINUS_INF ;
  this -> affine_gap = false ;
58 59

  estimate_K_lambda();
60 61 62 63 64 65 66 67 68 69 70 71 72
}

Cost::Cost(int match, int mismatch, int open_gap, int extend_gap, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = MINUS_INF ;
  this -> deletion = MINUS_INF ;
  this -> deletion_end = del_end ;
  this -> homopolymer = homopolymer ;

  this -> open_insertion = this -> open_deletion = open_gap ;
  this -> extend_insertion = this -> extend_deletion = extend_gap ;
73
  this -> affine_gap = true ;
74 75 76 77 78 79 80 81 82

  estimate_K_lambda();
}

void Cost::estimate_K_lambda()
{
  // Rough estimate of K and lambda parameters
  lambda = log(4) / match ;
  K = 0.05 ;
Mikaël Salson's avatar
Mikaël Salson committed
83 84 85 86 87 88 89 90 91
}


ostream& operator<<(ostream& out, const Cost& cost)
{
  out << "(" << cost.match 
      << ", " << cost.mismatch
      << "/" << cost.insertion
      << "/" << cost.deletion
92 93
      << ", " << cost.open_insertion << cost.extend_insertion
      << "/" << cost.open_deletion << cost.extend_deletion
Mikaël Salson's avatar
Mikaël Salson committed
94 95
      << ", " << cost.deletion_end
      << ", " << cost.homopolymer
96 97 98
      << ") "
      << cost.K << "/" << cost.lambda ;

Mikaël Salson's avatar
Mikaël Salson committed
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
  return out;
}

Cost::Cost()
{
}

int Cost::substitution(char a, char b)
{
  return (a == b) ? match : mismatch ;
}

int Cost::homo2(char xa, char xb, char y)
{
  return ((xa == xb) && (xb == y)) ? homopolymer : MINUS_INF ;
}

116
double Cost::toPValue(const int score)
117
{
118
  return K * exp(-lambda * score);
119 120
}

Mikaël Salson's avatar
Mikaël Salson committed
121

122 123 124
DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost,
                 const bool reverse_x, const bool reverse_y,
                 const int marked_pos_j)
Mikaël Salson's avatar
Mikaël Salson committed
125 126 127 128 129 130 131
{
  this -> x = reverse_x ? reverse(x) : x ;
  this -> y = reverse_y ? reverse(y) : y ;

  this -> reverse_x = reverse_x ;
  this -> reverse_y = reverse_y ;

132 133 134
  this -> marked_pos_j = marked_pos_j;
  this -> marked_pos_i = 0;

Mikaël Salson's avatar
Mikaël Salson committed
135 136 137 138
  m = x.size();
  n = y.size();
  // cout << "Dynprog " << x << "(" << m << ") / " << y << "(" << n << ")" << endl ;

139
  B = new operation*[m+1];
Mikaël Salson's avatar
Mikaël Salson committed
140
  for (int i = 0; i <= m; i++) {
141
    B[i] = new operation[n+1];
Mikaël Salson's avatar
Mikaël Salson committed
142
  }
143 144 145 146 147 148 149 150
  Bins = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bins[i] = new operation[n+1];
  }
  Bdel = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bdel[i] = new operation[n+1];
  }  
Mikaël Salson's avatar
Mikaël Salson committed
151 152 153 154 155 156 157
  this -> mode = mode;
  this -> cost = cost;

  this -> best_i = -1 ;
  this -> best_j = -1 ;
  this -> first_i = -1 ;
  this -> first_j = -1 ;
Mikaël Salson's avatar
Mikaël Salson committed
158 159 160 161
  
  gap1=NULL;
  gap2=NULL;
  linkgap=NULL;
Mikaël Salson's avatar
Mikaël Salson committed
162 163 164 165 166

  init();
}

DynProg::~DynProg() {
167

Mikaël Salson's avatar
Mikaël Salson committed
168 169
  for (int i = 0; i <= m; i++) {
    delete [] B[i];
170 171
    delete [] Bins[i];
    delete [] Bdel[i];
Mikaël Salson's avatar
Mikaël Salson committed
172
  }
173
  delete [] B;  
174 175
  delete [] Bins;
  delete [] Bdel;  
Mikaël Salson's avatar
Mikaël Salson committed
176 177 178
  delete [] gap1;
  delete [] gap2;
  delete [] linkgap;
Mikaël Salson's avatar
Mikaël Salson committed
179 180 181
}

void DynProg::init()
182 183 184 185 186 187
{
  for (int i=1; i<=m; i++)
    {
      B[i][0].type = INSER ;
      B[i][0].i = i-1 ;
      B[i][0].j = 0 ;
188
      Bdel[i][0].score = Bins[i][0].score = MINUS_INF ;
189 190 191 192 193 194 195
    }

  for (int j=1; j<=n; j++)
    {
      B[0][j].type = DELET ;    
      B[0][j].i = 0 ;
      B[0][j].j = j-1 ;
196
      Bdel[0][j].score = Bins[0][j].score = MINUS_INF ;
197 198
    }

Mikaël Salson's avatar
Mikaël Salson committed
199 200
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    for (int i=0; i<=m; i++)
201
      B[i][0].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
202 203
  else if (mode == GlobalButMostlyLocal)
    for (int i=0; i<=m; i++)
204
      B[i][0].score = i * cost.insertion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
205 206
  else // Global, SemiGlobal
    for (int i=0; i<=m; i++)
207
      B[i][0].score = i * cost.insertion ;
Mikaël Salson's avatar
Mikaël Salson committed
208 209 210
   
  if (mode == SemiGlobal || mode == SemiGlobalTrans || mode == Local || mode == LocalEndWithSomeDeletions)
    for (int j=0; j<=n; j++)
211
      B[0][j].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
212 213
  else if (mode == GlobalButMostlyLocal)
    for (int j=0; j<=n; j++)
214
      B[0][j].score = j * cost.deletion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
215
  else // Global
Mikaël Salson's avatar
Mikaël Salson committed
216
    for (int j=0; j<=n; j++)
217
      B[0][j].score = j * cost.deletion ;
Mikaël Salson's avatar
Mikaël Salson committed
218 219
}

220 221 222 223 224 225 226 227 228 229 230
inline void try_operation(operation &best, int type, int i, int j, int score)
{
  if (score > best.score)
    {
      best.type = type ;
      best.i = i ;
      best.j = j ;
      best.score = score ;
    }
}

231
int DynProg::compute(bool onlyBottomTriangle, int onlyBottomTriangleShift)
Mikaël Salson's avatar
Mikaël Salson committed
232 233
{
  best_score = MINUS_INF ;
234 235 236 237
  best_i = 0 ;
  best_j = 0 ;
  
  operation best ;
Mikaël Salson's avatar
Mikaël Salson committed
238 239 240 241

  for (int i=1; i<=m; i++)
    for (int j=1; j<=n; j++)
    {
242
      best.score = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
243

244 245 246 247 248 249 250 251 252 253 254 255 256
      // If onlyBottomTriangle, stops when we are not in the bottom triangle
      if (onlyBottomTriangle && ((n-j) >= (m-i) + onlyBottomTriangleShift))
        {
          best.type = 'k';
          B[i][j] = best;
          if (cost.affine_gap)
            {
              Bins[i][j] = best;
              Bdel[i][j] = best;
            }
          continue ;
        }

257
      // The edit operations, with their backtracking information and their score
258 259

      // Match, mismatch
260
      try_operation(best, SUBST, i-1, j-1, B[i-1][j-1].score + cost.substitution(x[i-1], y[j-1]));
261

262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
      if (!cost.affine_gap)
        {
          // Regular indels
          try_operation(best, INSER, i-1, j  , B[i-1][j  ].score + cost.insertion);
          try_operation(best, DELET, i  , j-1, B[i  ][j-1].score + cost.deletion);
        }
      else
        {
          // Gotoh affine gaps - insertion
          Bins[i][j].score = MINUS_INF ;
          try_operation(Bins[i][j], 'o', i-1, j, B[i-1][j].score + cost.open_insertion);
          try_operation(Bins[i][j], 'x', Bins[i-1][j].i, j, Bins[i-1][j].score + cost.extend_insertion);
          try_operation(best, INSER, Bins[i][j].i, j ,Bins[i][j].score);

          // Gotoh affine gaps - deletions
          Bdel[i][j].score = MINUS_INF ;
          try_operation(Bdel[i][j], 'o', i, j-1, B[i][j-1].score + cost.open_deletion);
          try_operation(Bdel[i][j], 'x', i, Bdel[i][j-1].j, Bdel[i-1][j].score + cost.extend_deletion);
          try_operation(best, DELET, i, Bdel[i][j].j,  Bdel[i][j].score);
        }

      // Homopolymers
284 285
      try_operation(best, HOMO2X, i-2, j-1, i > 1 ? B[i-2][j-1].score + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF);
      try_operation(best, HOMO2Y, i-1, j-2, j > 1 ? B[i-1][j-2].score + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF);
286
      
287
      // Local alignment
288 289
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	try_operation(best, BEGIN, 0, 0, 0);
Mikaël Salson's avatar
Mikaël Salson committed
290

291 292
      if ((best.type == SUBST) && (x[i-1] != y[j-1]))
        best.type = MISMATCH ;
Mikaël Salson's avatar
Mikaël Salson committed
293

294 295
      // Fill the score and the backtrack information
      B[i][j] = best ;
Mikaël Salson's avatar
Mikaël Salson committed
296
      
297 298
      // cout << " " << i << "," << j << " " << best_op.type << " " <<  best_op.i << "," << best_op.j << " "  << best_op.score << " " << best << endl ;

Mikaël Salson's avatar
Mikaël Salson committed
299 300
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
301
	  int tbest = best.score ;
Mikaël Salson's avatar
Mikaël Salson committed
302 303 304 305 306 307 308 309 310 311 312

	  if (mode == LocalEndWithSomeDeletions)
	    tbest += cost.deletion_end*(n-j);   
     
	  if (tbest > best_score)
	    {
	      best_score = tbest ;
	      best_i = i ;
	      best_j = j ;
	    }
	    
313
	  if (best.score == 0){
314
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
315 316
	  }
	}
Mikaël Salson's avatar
Mikaël Salson committed
317

Mikaël Salson's avatar
Mikaël Salson committed
318 319
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
320
      	  if (best.score == 0){
321
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
322 323 324 325 326
	  }
	}
      
    }

327 328
  // End. Find best_i and best_j, put FIN keywords where the backtrack should stop
  
Mikaël Salson's avatar
Mikaël Salson committed
329 330
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    {
331 332
      for (int j=0; j<=n; j++){
	B[0][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
333 334
      }
      for (int i=0; i<=m; i++){
335
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
336 337 338 339 340 341 342 343
      }
      
    }
  if (mode == SemiGlobal)
    {
      best_i = m ;
      
      for (int j=1; j<=n; j++)
344
	if (B[m][j].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
345
	  {
346
	    best_score = B[m][j].score ;
Mikaël Salson's avatar
Mikaël Salson committed
347 348 349
	    best_j = j ;
	  }
      for (int i=0; i<=m; i++){
350
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
351 352 353 354 355 356 357 358
      }
    }

  if (mode == SemiGlobalTrans)
    {
      best_j = n ;
      
      for (int i=1; i<=m; i++)
359
	if (B[i][n].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
360
	  {
361
	    best_score = B[i][n].score ;
Mikaël Salson's avatar
Mikaël Salson committed
362 363 364 365
	    best_i = i ;
	  }
	  
      for (int i=0; i<=n; i++){
366
	B[0][i].type = FIN;
367
      }     
Mikaël Salson's avatar
Mikaël Salson committed
368 369
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
370
  if ((mode == Global) || (mode == GlobalButMostlyLocal))
Mikaël Salson's avatar
Mikaël Salson committed
371 372 373
    {
      best_i = m ;
      best_j = n ;
374
      best_score = B[m][n].score;
Mikaël Salson's avatar
Mikaël Salson committed
375 376 377 378 379 380 381 382
    }

  if (reverse_x)
    best_i = m - best_i + 1 ;

  if (reverse_y)
    best_j = n - best_j + 1;

383 384
  B[0][0].type = FIN;
  
Mikaël Salson's avatar
Mikaël Salson committed
385 386 387 388 389 390 391
  // In the matrix positions start at 1, but start positions may start at 0
  best_i -= 1;   
  best_j -= 1;

  return best_score ;
}

392 393 394 395 396 397 398 399 400 401 402 403 404
int DynProg::best_score_on_i(int i, int *best_j)
{
  int best_score = MINUS_INF ;

  for (int j=0; j<=n; j++)
    if (B[i][j].score > best_score) {
      *best_j = j ;
      best_score = B[i][j].score;
    }

  return best_score ;
}

Mikaël Salson's avatar
Mikaël Salson committed
405 406
void DynProg::backtrack()
{
407
  // Tables for displaying the alignment
Mikaël Salson's avatar
Mikaël Salson committed
408 409 410 411
  gap1 = new int[x.size()+1];
  linkgap = new int[x.size()+1];
  gap2 = new int[y.size()+1];
      
412
  for (unsigned int i = 0; i <=x.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
413 414 415
    gap1[i] = 0;
    linkgap[i] = 0;
  }
416
  for (unsigned int i = 0; i <= y.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
417 418 419 420 421 422 423
    gap2[i] = 0;
  }
  
  
  int g1=x.size();
  int g2=y.size();
  
424 425
  int i=best_i+1;
  int j=best_j+1;
426

427
  // Compute backtrack strings
Mikaël Salson's avatar
Mikaël Salson committed
428
  
429 430
  ostringstream back_x;
  ostringstream back_y;
Mikaël Salson's avatar
Mikaël Salson committed
431 432
  ostringstream back_tr;
  
433 434
  while (1) {

435 436 437 438 439
    if (j == marked_pos_j)
      {
        marked_pos_i = i ;
      }

440 441 442 443
    if ((i<0) || (j<0))
      { 
        cout << "Invalid backtrack: " << i << "," << j << endl ;
        exit(1);
Mikaël Salson's avatar
Mikaël Salson committed
444
      }
445 446 447

    if  (B[i][j].type == FIN)
      break ;
Mikaël Salson's avatar
Mikaël Salson committed
448
      
449
    // cout << "bt " << i << "/" << j << " " << B[i][j].type << " " << B[i][j].i << "," << B[i][j].j << endl ;
450 451 452
    
    int next_i = B[i][j].i;
    int next_j = B[i][j].j;
Mikaël Salson's avatar
Mikaël Salson committed
453

454 455
    // The maximum number of characters implied by this operation
    int max_k = max(i - next_i, j - next_j);
Mikaël Salson's avatar
Mikaël Salson committed
456
      
457 458 459
    for (int k=0; k<max_k; k++)
      {
        linkgap[g1]=g2;
460 461 462

        // Some character(s) from x, then fill with spaces
        if (i-k > next_i)
463
          {
464
            back_x << x[i-1-k];
465 466 467 468
            g1--;
          }
        else
          {
469
            back_x << " " ;
470 471
            gap1[g1]++ ;
          }
472 473 474

        // Some character(s) from y, then fill with spaces
        if (j-k > next_j)
475
          {
476
            back_y << y[j-1-k];
477 478 479 480
            g2--;
          }
        else
          {
481
            back_y << " " ;
482 483
            gap2[g2]++ ;
          }
484 485 486 487 488 489

        // The operation character, then fill with spaces
        if (k == max_k-1)          
          back_tr << B[i][j].type ;
        else
          back_tr << " " ;
490
      }
Mikaël Salson's avatar
Mikaël Salson committed
491
    
492 493
    i = next_i;
    j = next_j;
Mikaël Salson's avatar
Mikaël Salson committed
494 495
  }

496 497 498
  // Format backtrack strings
  
  string str1, str2, str3;
499
  str1 = back_x.str();
Mikaël Salson's avatar
Mikaël Salson committed
500 501 502 503
  str1 =string (str1.rbegin(), str1.rend());
  str1 = str1;
  str2=back_tr.str();
  str2 = string (str2.rbegin(), str2.rend());
504
  str3 = back_y.str();
Mikaël Salson's avatar
Mikaël Salson committed
505
  str3 = string (str3.rbegin(), str3.rend());
506 507

  ostringstream back;    
508 509 510
  back << setw(3) << i   << " " << str1.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << " " << " " << str2.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << j   << " " << str3.substr(0, BACKSIZE-8) << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
511 512 513 514 515
  for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){
    back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl;
  }
516
  back << "score: " << best_score << endl;
Mikaël Salson's avatar
Mikaël Salson committed
517
  
518 519
  first_i=i;
  first_j=j;
Mikaël Salson's avatar
Mikaël Salson committed
520 521
  
  str_back=back.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
522 523

  // cout << str_back << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557
}


float identity_percent(int score)
{
  int match = (score + IdentityDirty.match/2) / IdentityDirty.match ;

  int mismatch = (score - match * IdentityDirty.match) / IdentityDirty.mismatch ;

  float pcent = (100. * (float) match) / (match + mismatch) ;

  // cout << score << "  " << match << "  " << mismatch << "  " << pcent << endl ; 

  return pcent ;
}


ostream& operator<<(ostream& out, const DynProg& dp)
{
  out << "       " ;

  for (int j=0; j<dp.n; j++)
    out << setw(4) << dp.y[j] << " ";

  out << endl ;

  for (int i=0; i<=dp.m; i++)
    {
      if (i)
	out << dp.x[i-1] << " " ;
      else
	out << "  " ;

      for (int j=0; j<=dp.n; j++)
558
        {
559 560
          if (dp.B[i][j].score)
            out << setw(4) << dp.B[i][j].score << dp.B[i][j].type ;
561 562 563
          else
            out << "    " << dp.B[i][j].type ;
        }
Mikaël Salson's avatar
Mikaël Salson committed
564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611
      out << endl ;      
    }
  
  out << "best: " << dp.best_i << "," << dp.best_j << endl;

  return out ;
}


Cost strToCost(string str, Cost default_cost){

  if (str.length()!=0){
    string::size_type stTemp = str.find(',');
	  
    std::list<int> val;
    
    while(stTemp != string::npos)
    {
	  val.push_back(atoi(str.substr(0, stTemp).c_str() ) );
	  str = str.substr(stTemp + 1);
	  stTemp = str.find(',');
    }
    val.push_back(atoi(str.c_str() ) );

    if (val.size()==5){ 
      int val1=val.front();
      val.pop_front();
      int val2=val.front();
      val.pop_front();
      int val3=val.front();
      val.pop_front();
      int val4=val.front();
      val.pop_front();
      int val5=val.front();
      val.pop_front();
      Cost result = Cost(val1, val2, val3, val4, val5);
      cout << "use custom Cost "<< result << endl;
      return result;
    }
    
    cout << "incorrect Cost format, use default "<< default_cost<<endl;
  }
  
  return default_cost;
}