dynprog.cpp 14.3 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include "dynprog.h"
#include "tools.h"
26
#include "segment.h"
Mikaël Salson's avatar
Mikaël Salson committed
27 28 29 30
#include <cassert>
#include <list>
#include <cstdlib>
#include <string>
31
#include <cmath>
Mikaël Salson's avatar
Mikaël Salson committed
32

33 34 35 36 37 38 39 40
#define SUBST '|'
#define MISMATCH '.'
#define INSER 'i'
#define DELET 'd'
#define HOMO2X 'h'
#define HOMO2Y 'h'
#define FIN ' '
#define BEGIN 'B'
Mikaël Salson's avatar
Mikaël Salson committed
41
#define BACKSIZE 120
Mikaël Salson's avatar
Mikaël Salson committed
42 43 44 45 46 47 48 49 50 51 52 53

using namespace std;


Cost::Cost(int match, int mismatch, int indel, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = indel ;
  this -> deletion = indel ;
  this -> deletion_end = del_end ;
  this -> homopolymer = (homopolymer == MINUS_INF ? indel: homopolymer);
54

55 56 57
  this -> open_insertion = this -> open_deletion = MINUS_INF ;
  this -> extend_insertion = this -> extend_deletion = MINUS_INF ;
  this -> affine_gap = false ;
58 59

  estimate_K_lambda();
60 61 62 63 64 65 66 67 68 69 70 71 72
}

Cost::Cost(int match, int mismatch, int open_gap, int extend_gap, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = MINUS_INF ;
  this -> deletion = MINUS_INF ;
  this -> deletion_end = del_end ;
  this -> homopolymer = homopolymer ;

  this -> open_insertion = this -> open_deletion = open_gap ;
  this -> extend_insertion = this -> extend_deletion = extend_gap ;
73
  this -> affine_gap = true ;
74 75 76 77 78 79 80 81 82

  estimate_K_lambda();
}

void Cost::estimate_K_lambda()
{
  // Rough estimate of K and lambda parameters
  lambda = log(4) / match ;
  K = 0.05 ;
Mikaël Salson's avatar
Mikaël Salson committed
83 84 85 86 87 88 89 90 91
}


ostream& operator<<(ostream& out, const Cost& cost)
{
  out << "(" << cost.match 
      << ", " << cost.mismatch
      << "/" << cost.insertion
      << "/" << cost.deletion
92 93
      << ", " << cost.open_insertion << cost.extend_insertion
      << "/" << cost.open_deletion << cost.extend_deletion
Mikaël Salson's avatar
Mikaël Salson committed
94 95
      << ", " << cost.deletion_end
      << ", " << cost.homopolymer
96 97 98
      << ") "
      << cost.K << "/" << cost.lambda ;

Mikaël Salson's avatar
Mikaël Salson committed
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
  return out;
}

Cost::Cost()
{
}

int Cost::substitution(char a, char b)
{
  return (a == b) ? match : mismatch ;
}

int Cost::homo2(char xa, char xb, char y)
{
  return ((xa == xb) && (xb == y)) ? homopolymer : MINUS_INF ;
}

116
double Cost::toPValue(const int score)
117
{
118
  return K * exp(-lambda * score);
119 120
}

Mikaël Salson's avatar
Mikaël Salson committed
121 122 123 124 125 126 127 128 129 130 131 132 133

DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost, const bool reverse_x, const bool reverse_y)
{
  this -> x = reverse_x ? reverse(x) : x ;
  this -> y = reverse_y ? reverse(y) : y ;

  this -> reverse_x = reverse_x ;
  this -> reverse_y = reverse_y ;

  m = x.size();
  n = y.size();
  // cout << "Dynprog " << x << "(" << m << ") / " << y << "(" << n << ")" << endl ;

134
  B = new operation*[m+1];
Mikaël Salson's avatar
Mikaël Salson committed
135
  for (int i = 0; i <= m; i++) {
136
    B[i] = new operation[n+1];
Mikaël Salson's avatar
Mikaël Salson committed
137
  }
138 139 140 141 142 143 144 145
  Bins = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bins[i] = new operation[n+1];
  }
  Bdel = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bdel[i] = new operation[n+1];
  }  
Mikaël Salson's avatar
Mikaël Salson committed
146 147 148 149 150 151 152
  this -> mode = mode;
  this -> cost = cost;

  this -> best_i = -1 ;
  this -> best_j = -1 ;
  this -> first_i = -1 ;
  this -> first_j = -1 ;
Mikaël Salson's avatar
Mikaël Salson committed
153 154 155 156
  
  gap1=NULL;
  gap2=NULL;
  linkgap=NULL;
Mikaël Salson's avatar
Mikaël Salson committed
157 158 159 160 161

  init();
}

DynProg::~DynProg() {
162

Mikaël Salson's avatar
Mikaël Salson committed
163 164
  for (int i = 0; i <= m; i++) {
    delete [] B[i];
165 166
    delete [] Bins[i];
    delete [] Bdel[i];
Mikaël Salson's avatar
Mikaël Salson committed
167
  }
168
  delete [] B;  
169 170
  delete [] Bins;
  delete [] Bdel;  
Mikaël Salson's avatar
Mikaël Salson committed
171 172 173
  delete [] gap1;
  delete [] gap2;
  delete [] linkgap;
Mikaël Salson's avatar
Mikaël Salson committed
174 175 176
}

void DynProg::init()
177 178 179 180 181 182
{
  for (int i=1; i<=m; i++)
    {
      B[i][0].type = INSER ;
      B[i][0].i = i-1 ;
      B[i][0].j = 0 ;
183
      Bdel[i][0].score = Bins[i][0].score = MINUS_INF ;
184 185 186 187 188 189 190
    }

  for (int j=1; j<=n; j++)
    {
      B[0][j].type = DELET ;    
      B[0][j].i = 0 ;
      B[0][j].j = j-1 ;
191
      Bdel[0][j].score = Bins[0][j].score = MINUS_INF ;
192 193
    }

Mikaël Salson's avatar
Mikaël Salson committed
194 195
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    for (int i=0; i<=m; i++)
196
      B[i][0].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
197 198
  else if (mode == GlobalButMostlyLocal)
    for (int i=0; i<=m; i++)
199
      B[i][0].score = i * cost.insertion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
200 201
  else // Global, SemiGlobal
    for (int i=0; i<=m; i++)
202
      B[i][0].score = i * cost.insertion ;
Mikaël Salson's avatar
Mikaël Salson committed
203 204 205
   
  if (mode == SemiGlobal || mode == SemiGlobalTrans || mode == Local || mode == LocalEndWithSomeDeletions)
    for (int j=0; j<=n; j++)
206
      B[0][j].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
207 208
  else if (mode == GlobalButMostlyLocal)
    for (int j=0; j<=n; j++)
209
      B[0][j].score = j * cost.deletion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
210
  else // Global
Mikaël Salson's avatar
Mikaël Salson committed
211
    for (int j=0; j<=n; j++)
212
      B[0][j].score = j * cost.deletion ;
Mikaël Salson's avatar
Mikaël Salson committed
213 214
}

215 216 217 218 219 220 221 222 223 224 225
inline void try_operation(operation &best, int type, int i, int j, int score)
{
  if (score > best.score)
    {
      best.type = type ;
      best.i = i ;
      best.j = j ;
      best.score = score ;
    }
}

226
int DynProg::compute(bool onlyBottomTriangle, int onlyBottomTriangleShift)
Mikaël Salson's avatar
Mikaël Salson committed
227 228
{
  best_score = MINUS_INF ;
229 230 231 232
  best_i = 0 ;
  best_j = 0 ;
  
  operation best ;
Mikaël Salson's avatar
Mikaël Salson committed
233 234 235 236

  for (int i=1; i<=m; i++)
    for (int j=1; j<=n; j++)
    {
237
      best.score = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
238

239 240 241 242 243 244 245 246 247 248 249 250 251
      // If onlyBottomTriangle, stops when we are not in the bottom triangle
      if (onlyBottomTriangle && ((n-j) >= (m-i) + onlyBottomTriangleShift))
        {
          best.type = 'k';
          B[i][j] = best;
          if (cost.affine_gap)
            {
              Bins[i][j] = best;
              Bdel[i][j] = best;
            }
          continue ;
        }

252
      // The edit operations, with their backtracking information and their score
253 254

      // Match, mismatch
255
      try_operation(best, SUBST, i-1, j-1, B[i-1][j-1].score + cost.substitution(x[i-1], y[j-1]));
256

257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
      if (!cost.affine_gap)
        {
          // Regular indels
          try_operation(best, INSER, i-1, j  , B[i-1][j  ].score + cost.insertion);
          try_operation(best, DELET, i  , j-1, B[i  ][j-1].score + cost.deletion);
        }
      else
        {
          // Gotoh affine gaps - insertion
          Bins[i][j].score = MINUS_INF ;
          try_operation(Bins[i][j], 'o', i-1, j, B[i-1][j].score + cost.open_insertion);
          try_operation(Bins[i][j], 'x', Bins[i-1][j].i, j, Bins[i-1][j].score + cost.extend_insertion);
          try_operation(best, INSER, Bins[i][j].i, j ,Bins[i][j].score);

          // Gotoh affine gaps - deletions
          Bdel[i][j].score = MINUS_INF ;
          try_operation(Bdel[i][j], 'o', i, j-1, B[i][j-1].score + cost.open_deletion);
          try_operation(Bdel[i][j], 'x', i, Bdel[i][j-1].j, Bdel[i-1][j].score + cost.extend_deletion);
          try_operation(best, DELET, i, Bdel[i][j].j,  Bdel[i][j].score);
        }

      // Homopolymers
279 280
      try_operation(best, HOMO2X, i-2, j-1, i > 1 ? B[i-2][j-1].score + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF);
      try_operation(best, HOMO2Y, i-1, j-2, j > 1 ? B[i-1][j-2].score + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF);
281
      
282
      // Local alignment
283 284
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	try_operation(best, BEGIN, 0, 0, 0);
Mikaël Salson's avatar
Mikaël Salson committed
285

286 287
      if ((best.type == SUBST) && (x[i-1] != y[j-1]))
        best.type = MISMATCH ;
Mikaël Salson's avatar
Mikaël Salson committed
288

289 290
      // Fill the score and the backtrack information
      B[i][j] = best ;
Mikaël Salson's avatar
Mikaël Salson committed
291
      
292 293
      // cout << " " << i << "," << j << " " << best_op.type << " " <<  best_op.i << "," << best_op.j << " "  << best_op.score << " " << best << endl ;

Mikaël Salson's avatar
Mikaël Salson committed
294 295
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
296
	  int tbest = best.score ;
Mikaël Salson's avatar
Mikaël Salson committed
297 298 299 300 301 302 303 304 305 306 307

	  if (mode == LocalEndWithSomeDeletions)
	    tbest += cost.deletion_end*(n-j);   
     
	  if (tbest > best_score)
	    {
	      best_score = tbest ;
	      best_i = i ;
	      best_j = j ;
	    }
	    
308
	  if (best.score == 0){
309
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
310 311
	  }
	}
Mikaël Salson's avatar
Mikaël Salson committed
312

Mikaël Salson's avatar
Mikaël Salson committed
313 314
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
315
      	  if (best.score == 0){
316
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
317 318 319 320 321
	  }
	}
      
    }

322 323
  // End. Find best_i and best_j, put FIN keywords where the backtrack should stop
  
Mikaël Salson's avatar
Mikaël Salson committed
324 325
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    {
326 327
      for (int j=0; j<=n; j++){
	B[0][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
328 329
      }
      for (int i=0; i<=m; i++){
330
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
331 332 333 334 335 336 337 338
      }
      
    }
  if (mode == SemiGlobal)
    {
      best_i = m ;
      
      for (int j=1; j<=n; j++)
339
	if (B[m][j].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
340
	  {
341
	    best_score = B[m][j].score ;
Mikaël Salson's avatar
Mikaël Salson committed
342 343 344
	    best_j = j ;
	  }
      for (int i=0; i<=m; i++){
345
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
346 347 348 349 350 351 352 353
      }
    }

  if (mode == SemiGlobalTrans)
    {
      best_j = n ;
      
      for (int i=1; i<=m; i++)
354
	if (B[i][n].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
355
	  {
356
	    best_score = B[i][n].score ;
Mikaël Salson's avatar
Mikaël Salson committed
357 358 359 360
	    best_i = i ;
	  }
	  
      for (int i=0; i<=n; i++){
361
	B[0][i].type = FIN;
362
      }     
Mikaël Salson's avatar
Mikaël Salson committed
363 364
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
365
  if ((mode == Global) || (mode == GlobalButMostlyLocal))
Mikaël Salson's avatar
Mikaël Salson committed
366 367 368
    {
      best_i = m ;
      best_j = n ;
369
      best_score = B[m][n].score;
Mikaël Salson's avatar
Mikaël Salson committed
370 371 372 373 374 375 376 377
    }

  if (reverse_x)
    best_i = m - best_i + 1 ;

  if (reverse_y)
    best_j = n - best_j + 1;

378 379
  B[0][0].type = FIN;
  
Mikaël Salson's avatar
Mikaël Salson committed
380 381 382 383 384 385 386
  // In the matrix positions start at 1, but start positions may start at 0
  best_i -= 1;   
  best_j -= 1;

  return best_score ;
}

387 388 389 390 391 392 393 394 395 396 397 398 399
int DynProg::best_score_on_i(int i, int *best_j)
{
  int best_score = MINUS_INF ;

  for (int j=0; j<=n; j++)
    if (B[i][j].score > best_score) {
      *best_j = j ;
      best_score = B[i][j].score;
    }

  return best_score ;
}

Mikaël Salson's avatar
Mikaël Salson committed
400 401
void DynProg::backtrack()
{
402
  // Tables for displaying the alignment
Mikaël Salson's avatar
Mikaël Salson committed
403 404 405 406
  gap1 = new int[x.size()+1];
  linkgap = new int[x.size()+1];
  gap2 = new int[y.size()+1];
      
407
  for (unsigned int i = 0; i <=x.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
408 409 410
    gap1[i] = 0;
    linkgap[i] = 0;
  }
411
  for (unsigned int i = 0; i <= y.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
412 413 414 415 416 417 418
    gap2[i] = 0;
  }
  
  
  int g1=x.size();
  int g2=y.size();
  
419 420
  int i=best_i+1;
  int j=best_j+1;
421

422
  // Compute backtrack strings
Mikaël Salson's avatar
Mikaël Salson committed
423
  
424 425
  ostringstream back_x;
  ostringstream back_y;
Mikaël Salson's avatar
Mikaël Salson committed
426 427
  ostringstream back_tr;
  
428 429 430 431 432 433
  while (1) {

    if ((i<0) || (j<0))
      { 
        cout << "Invalid backtrack: " << i << "," << j << endl ;
        exit(1);
Mikaël Salson's avatar
Mikaël Salson committed
434
      }
435 436 437

    if  (B[i][j].type == FIN)
      break ;
Mikaël Salson's avatar
Mikaël Salson committed
438
      
439
    // cout << "bt " << i << "/" << j << " " << B[i][j].type << " " << B[i][j].i << "," << B[i][j].j << endl ;
440 441 442
    
    int next_i = B[i][j].i;
    int next_j = B[i][j].j;
Mikaël Salson's avatar
Mikaël Salson committed
443

444 445
    // The maximum number of characters implied by this operation
    int max_k = max(i - next_i, j - next_j);
Mikaël Salson's avatar
Mikaël Salson committed
446
      
447 448 449
    for (int k=0; k<max_k; k++)
      {
        linkgap[g1]=g2;
450 451 452

        // Some character(s) from x, then fill with spaces
        if (i-k > next_i)
453
          {
454
            back_x << x[i-1-k];
455 456 457 458
            g1--;
          }
        else
          {
459
            back_x << " " ;
460 461
            gap1[g1]++ ;
          }
462 463 464

        // Some character(s) from y, then fill with spaces
        if (j-k > next_j)
465
          {
466
            back_y << y[j-1-k];
467 468 469 470
            g2--;
          }
        else
          {
471
            back_y << " " ;
472 473
            gap2[g2]++ ;
          }
474 475 476 477 478 479

        // The operation character, then fill with spaces
        if (k == max_k-1)          
          back_tr << B[i][j].type ;
        else
          back_tr << " " ;
480
      }
Mikaël Salson's avatar
Mikaël Salson committed
481
    
482 483
    i = next_i;
    j = next_j;
Mikaël Salson's avatar
Mikaël Salson committed
484 485
  }

486 487 488
  // Format backtrack strings
  
  string str1, str2, str3;
489
  str1 = back_x.str();
Mikaël Salson's avatar
Mikaël Salson committed
490 491 492 493
  str1 =string (str1.rbegin(), str1.rend());
  str1 = str1;
  str2=back_tr.str();
  str2 = string (str2.rbegin(), str2.rend());
494
  str3 = back_y.str();
Mikaël Salson's avatar
Mikaël Salson committed
495
  str3 = string (str3.rbegin(), str3.rend());
496 497

  ostringstream back;    
498 499 500
  back << setw(3) << i   << " " << str1.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << " " << " " << str2.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << j   << " " << str3.substr(0, BACKSIZE-8) << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
501 502 503 504 505
  for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){
    back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl;
  }
506
  back << "score: " << best_score << endl;
Mikaël Salson's avatar
Mikaël Salson committed
507
  
508 509
  first_i=i;
  first_j=j;
Mikaël Salson's avatar
Mikaël Salson committed
510 511
  
  str_back=back.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
512 513

  // cout << str_back << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547
}


float identity_percent(int score)
{
  int match = (score + IdentityDirty.match/2) / IdentityDirty.match ;

  int mismatch = (score - match * IdentityDirty.match) / IdentityDirty.mismatch ;

  float pcent = (100. * (float) match) / (match + mismatch) ;

  // cout << score << "  " << match << "  " << mismatch << "  " << pcent << endl ; 

  return pcent ;
}


ostream& operator<<(ostream& out, const DynProg& dp)
{
  out << "       " ;

  for (int j=0; j<dp.n; j++)
    out << setw(4) << dp.y[j] << " ";

  out << endl ;

  for (int i=0; i<=dp.m; i++)
    {
      if (i)
	out << dp.x[i-1] << " " ;
      else
	out << "  " ;

      for (int j=0; j<=dp.n; j++)
548
        {
549 550
          if (dp.B[i][j].score)
            out << setw(4) << dp.B[i][j].score << dp.B[i][j].type ;
551 552 553
          else
            out << "    " << dp.B[i][j].type ;
        }
Mikaël Salson's avatar
Mikaël Salson committed
554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601
      out << endl ;      
    }
  
  out << "best: " << dp.best_i << "," << dp.best_j << endl;

  return out ;
}


Cost strToCost(string str, Cost default_cost){

  if (str.length()!=0){
    string::size_type stTemp = str.find(',');
	  
    std::list<int> val;
    
    while(stTemp != string::npos)
    {
	  val.push_back(atoi(str.substr(0, stTemp).c_str() ) );
	  str = str.substr(stTemp + 1);
	  stTemp = str.find(',');
    }
    val.push_back(atoi(str.c_str() ) );

    if (val.size()==5){ 
      int val1=val.front();
      val.pop_front();
      int val2=val.front();
      val.pop_front();
      int val3=val.front();
      val.pop_front();
      int val4=val.front();
      val.pop_front();
      int val5=val.front();
      val.pop_front();
      Cost result = Cost(val1, val2, val3, val4, val5);
      cout << "use custom Cost "<< result << endl;
      return result;
    }
    
    cout << "incorrect Cost format, use default "<< default_cost<<endl;
  }
  
  return default_cost;
}