dynprog.cpp 13.8 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include "dynprog.h"
#include "tools.h"
26
#include "segment.h"
Mikaël Salson's avatar
Mikaël Salson committed
27 28 29 30
#include <cassert>
#include <list>
#include <cstdlib>
#include <string>
31
#include <cmath>
Mikaël Salson's avatar
Mikaël Salson committed
32

33 34 35 36 37 38 39 40
#define SUBST '|'
#define MISMATCH '.'
#define INSER 'i'
#define DELET 'd'
#define HOMO2X 'h'
#define HOMO2Y 'h'
#define FIN ' '
#define BEGIN 'B'
Mikaël Salson's avatar
Mikaël Salson committed
41
#define BACKSIZE 120
Mikaël Salson's avatar
Mikaël Salson committed
42 43 44 45 46 47 48 49 50 51 52 53

using namespace std;


Cost::Cost(int match, int mismatch, int indel, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = indel ;
  this -> deletion = indel ;
  this -> deletion_end = del_end ;
  this -> homopolymer = (homopolymer == MINUS_INF ? indel: homopolymer);
54

55 56 57
  this -> open_insertion = this -> open_deletion = MINUS_INF ;
  this -> extend_insertion = this -> extend_deletion = MINUS_INF ;
  this -> affine_gap = false ;
58 59

  estimate_K_lambda();
60 61 62 63 64 65 66 67 68 69 70 71 72
}

Cost::Cost(int match, int mismatch, int open_gap, int extend_gap, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = MINUS_INF ;
  this -> deletion = MINUS_INF ;
  this -> deletion_end = del_end ;
  this -> homopolymer = homopolymer ;

  this -> open_insertion = this -> open_deletion = open_gap ;
  this -> extend_insertion = this -> extend_deletion = extend_gap ;
73
  this -> affine_gap = true ;
74 75 76 77 78 79 80 81 82

  estimate_K_lambda();
}

void Cost::estimate_K_lambda()
{
  // Rough estimate of K and lambda parameters
  lambda = log(4) / match ;
  K = 0.05 ;
Mikaël Salson's avatar
Mikaël Salson committed
83 84 85 86 87 88 89 90 91
}


ostream& operator<<(ostream& out, const Cost& cost)
{
  out << "(" << cost.match 
      << ", " << cost.mismatch
      << "/" << cost.insertion
      << "/" << cost.deletion
92 93
      << ", " << cost.open_insertion << cost.extend_insertion
      << "/" << cost.open_deletion << cost.extend_deletion
Mikaël Salson's avatar
Mikaël Salson committed
94 95
      << ", " << cost.deletion_end
      << ", " << cost.homopolymer
96 97 98
      << ") "
      << cost.K << "/" << cost.lambda ;

Mikaël Salson's avatar
Mikaël Salson committed
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
  return out;
}

Cost::Cost()
{
}

int Cost::substitution(char a, char b)
{
  return (a == b) ? match : mismatch ;
}

int Cost::homo2(char xa, char xb, char y)
{
  return ((xa == xb) && (xb == y)) ? homopolymer : MINUS_INF ;
}

116
double Cost::toPValue(const int score)
117
{
118
  return K * exp(-lambda * score);
119 120
}

Mikaël Salson's avatar
Mikaël Salson committed
121 122 123 124 125 126 127 128 129 130 131 132 133

DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost, const bool reverse_x, const bool reverse_y)
{
  this -> x = reverse_x ? reverse(x) : x ;
  this -> y = reverse_y ? reverse(y) : y ;

  this -> reverse_x = reverse_x ;
  this -> reverse_y = reverse_y ;

  m = x.size();
  n = y.size();
  // cout << "Dynprog " << x << "(" << m << ") / " << y << "(" << n << ")" << endl ;

134
  B = new operation*[m+1];
Mikaël Salson's avatar
Mikaël Salson committed
135
  for (int i = 0; i <= m; i++) {
136
    B[i] = new operation[n+1];
Mikaël Salson's avatar
Mikaël Salson committed
137
  }
138 139 140 141 142 143 144 145
  Bins = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bins[i] = new operation[n+1];
  }
  Bdel = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bdel[i] = new operation[n+1];
  }  
Mikaël Salson's avatar
Mikaël Salson committed
146 147 148 149 150 151 152
  this -> mode = mode;
  this -> cost = cost;

  this -> best_i = -1 ;
  this -> best_j = -1 ;
  this -> first_i = -1 ;
  this -> first_j = -1 ;
Mikaël Salson's avatar
Mikaël Salson committed
153 154 155 156
  
  gap1=NULL;
  gap2=NULL;
  linkgap=NULL;
Mikaël Salson's avatar
Mikaël Salson committed
157 158 159 160 161

  init();
}

DynProg::~DynProg() {
162

Mikaël Salson's avatar
Mikaël Salson committed
163 164
  for (int i = 0; i <= m; i++) {
    delete [] B[i];
165 166
    delete [] Bins[i];
    delete [] Bdel[i];
Mikaël Salson's avatar
Mikaël Salson committed
167
  }
168
  delete [] B;  
169 170
  delete [] Bins;
  delete [] Bdel;  
Mikaël Salson's avatar
Mikaël Salson committed
171 172 173
  delete [] gap1;
  delete [] gap2;
  delete [] linkgap;
Mikaël Salson's avatar
Mikaël Salson committed
174 175 176
}

void DynProg::init()
177 178 179 180 181 182
{
  for (int i=1; i<=m; i++)
    {
      B[i][0].type = INSER ;
      B[i][0].i = i-1 ;
      B[i][0].j = 0 ;
183
      Bdel[i][0].score = Bins[i][0].score = MINUS_INF ;
184 185 186 187 188 189 190
    }

  for (int j=1; j<=n; j++)
    {
      B[0][j].type = DELET ;    
      B[0][j].i = 0 ;
      B[0][j].j = j-1 ;
191
      Bdel[0][j].score = Bins[0][j].score = MINUS_INF ;
192 193
    }

Mikaël Salson's avatar
Mikaël Salson committed
194 195
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    for (int i=0; i<=m; i++)
196
      B[i][0].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
197 198
  else if (mode == GlobalButMostlyLocal)
    for (int i=0; i<=m; i++)
199
      B[i][0].score = i * cost.insertion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
200 201
  else // Global, SemiGlobal
    for (int i=0; i<=m; i++)
202
      B[i][0].score = i * cost.insertion ;
Mikaël Salson's avatar
Mikaël Salson committed
203 204 205
   
  if (mode == SemiGlobal || mode == SemiGlobalTrans || mode == Local || mode == LocalEndWithSomeDeletions)
    for (int j=0; j<=n; j++)
206
      B[0][j].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
207 208
  else if (mode == GlobalButMostlyLocal)
    for (int j=0; j<=n; j++)
209
      B[0][j].score = j * cost.deletion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
210
  else // Global
Mikaël Salson's avatar
Mikaël Salson committed
211
    for (int j=0; j<=n; j++)
212
      B[0][j].score = j * cost.deletion ;
Mikaël Salson's avatar
Mikaël Salson committed
213 214
}

215 216 217 218 219 220 221 222 223 224 225
inline void try_operation(operation &best, int type, int i, int j, int score)
{
  if (score > best.score)
    {
      best.type = type ;
      best.i = i ;
      best.j = j ;
      best.score = score ;
    }
}

Mikaël Salson's avatar
Mikaël Salson committed
226 227 228
int DynProg::compute()
{
  best_score = MINUS_INF ;
229 230 231 232
  best_i = 0 ;
  best_j = 0 ;
  
  operation best ;
Mikaël Salson's avatar
Mikaël Salson committed
233 234 235 236

  for (int i=1; i<=m; i++)
    for (int j=1; j<=n; j++)
    {
237
      best.score = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
238

239
      // The edit operations, with their backtracking information and their score
240 241

      // Match, mismatch
242
      try_operation(best, SUBST, i-1, j-1, B[i-1][j-1].score + cost.substitution(x[i-1], y[j-1]));
243

244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
      if (!cost.affine_gap)
        {
          // Regular indels
          try_operation(best, INSER, i-1, j  , B[i-1][j  ].score + cost.insertion);
          try_operation(best, DELET, i  , j-1, B[i  ][j-1].score + cost.deletion);
        }
      else
        {
          // Gotoh affine gaps - insertion
          Bins[i][j].score = MINUS_INF ;
          try_operation(Bins[i][j], 'o', i-1, j, B[i-1][j].score + cost.open_insertion);
          try_operation(Bins[i][j], 'x', Bins[i-1][j].i, j, Bins[i-1][j].score + cost.extend_insertion);
          try_operation(best, INSER, Bins[i][j].i, j ,Bins[i][j].score);

          // Gotoh affine gaps - deletions
          Bdel[i][j].score = MINUS_INF ;
          try_operation(Bdel[i][j], 'o', i, j-1, B[i][j-1].score + cost.open_deletion);
          try_operation(Bdel[i][j], 'x', i, Bdel[i][j-1].j, Bdel[i-1][j].score + cost.extend_deletion);
          try_operation(best, DELET, i, Bdel[i][j].j,  Bdel[i][j].score);
        }

      // Homopolymers
266 267
      try_operation(best, HOMO2X, i-2, j-1, i > 1 ? B[i-2][j-1].score + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF);
      try_operation(best, HOMO2Y, i-1, j-2, j > 1 ? B[i-1][j-2].score + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF);
268
      
269
      // Local alignment
270 271
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	try_operation(best, BEGIN, 0, 0, 0);
Mikaël Salson's avatar
Mikaël Salson committed
272

273 274
      if ((best.type == SUBST) && (x[i-1] != y[j-1]))
        best.type = MISMATCH ;
Mikaël Salson's avatar
Mikaël Salson committed
275

276 277
      // Fill the score and the backtrack information
      B[i][j] = best ;
Mikaël Salson's avatar
Mikaël Salson committed
278
      
279 280
      // cout << " " << i << "," << j << " " << best_op.type << " " <<  best_op.i << "," << best_op.j << " "  << best_op.score << " " << best << endl ;

Mikaël Salson's avatar
Mikaël Salson committed
281 282
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
283
	  int tbest = best.score ;
Mikaël Salson's avatar
Mikaël Salson committed
284 285 286 287 288 289 290 291 292 293 294

	  if (mode == LocalEndWithSomeDeletions)
	    tbest += cost.deletion_end*(n-j);   
     
	  if (tbest > best_score)
	    {
	      best_score = tbest ;
	      best_i = i ;
	      best_j = j ;
	    }
	    
295
	  if (best.score == 0){
296
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
297 298
	  }
	}
Mikaël Salson's avatar
Mikaël Salson committed
299

Mikaël Salson's avatar
Mikaël Salson committed
300 301
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
302
      	  if (best.score == 0){
303
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
304 305 306 307 308
	  }
	}
      
    }

309 310
  // End. Find best_i and best_j, put FIN keywords where the backtrack should stop
  
Mikaël Salson's avatar
Mikaël Salson committed
311 312
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    {
313 314
      for (int j=0; j<=n; j++){
	B[0][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
315 316
      }
      for (int i=0; i<=m; i++){
317
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
318 319 320 321 322 323 324 325
      }
      
    }
  if (mode == SemiGlobal)
    {
      best_i = m ;
      
      for (int j=1; j<=n; j++)
326
	if (B[m][j].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
327
	  {
328
	    best_score = B[m][j].score ;
Mikaël Salson's avatar
Mikaël Salson committed
329 330 331
	    best_j = j ;
	  }
      for (int i=0; i<=m; i++){
332
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
333 334 335 336 337 338 339 340
      }
    }

  if (mode == SemiGlobalTrans)
    {
      best_j = n ;
      
      for (int i=1; i<=m; i++)
341
	if (B[i][n].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
342
	  {
343
	    best_score = B[i][n].score ;
Mikaël Salson's avatar
Mikaël Salson committed
344 345 346 347
	    best_i = i ;
	  }
	  
      for (int i=0; i<=n; i++){
348
	B[0][i].type = FIN;
349
      }     
Mikaël Salson's avatar
Mikaël Salson committed
350 351
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
352
  if ((mode == Global) || (mode == GlobalButMostlyLocal))
Mikaël Salson's avatar
Mikaël Salson committed
353 354 355
    {
      best_i = m ;
      best_j = n ;
356
      best_score = B[m][n].score;
Mikaël Salson's avatar
Mikaël Salson committed
357 358 359 360 361 362 363 364
    }

  if (reverse_x)
    best_i = m - best_i + 1 ;

  if (reverse_y)
    best_j = n - best_j + 1;

365 366
  B[0][0].type = FIN;
  
Mikaël Salson's avatar
Mikaël Salson committed
367 368 369 370 371 372 373
  // In the matrix positions start at 1, but start positions may start at 0
  best_i -= 1;   
  best_j -= 1;

  return best_score ;
}

374 375 376 377 378 379 380 381 382 383 384 385 386
int DynProg::best_score_on_i(int i, int *best_j)
{
  int best_score = MINUS_INF ;

  for (int j=0; j<=n; j++)
    if (B[i][j].score > best_score) {
      *best_j = j ;
      best_score = B[i][j].score;
    }

  return best_score ;
}

Mikaël Salson's avatar
Mikaël Salson committed
387 388
void DynProg::backtrack()
{
389
  // Tables for displaying the alignment
Mikaël Salson's avatar
Mikaël Salson committed
390 391 392 393
  gap1 = new int[x.size()+1];
  linkgap = new int[x.size()+1];
  gap2 = new int[y.size()+1];
      
394
  for (unsigned int i = 0; i <=x.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
395 396 397
    gap1[i] = 0;
    linkgap[i] = 0;
  }
398
  for (unsigned int i = 0; i <= y.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
399 400 401 402 403 404 405
    gap2[i] = 0;
  }
  
  
  int g1=x.size();
  int g2=y.size();
  
406 407
  int i=best_i+1;
  int j=best_j+1;
408

409
  // Compute backtrack strings
Mikaël Salson's avatar
Mikaël Salson committed
410
  
411 412
  ostringstream back_x;
  ostringstream back_y;
Mikaël Salson's avatar
Mikaël Salson committed
413 414
  ostringstream back_tr;
  
415 416 417 418 419 420
  while (1) {

    if ((i<0) || (j<0))
      { 
        cout << "Invalid backtrack: " << i << "," << j << endl ;
        exit(1);
Mikaël Salson's avatar
Mikaël Salson committed
421
      }
422 423 424

    if  (B[i][j].type == FIN)
      break ;
Mikaël Salson's avatar
Mikaël Salson committed
425
      
426
    // cout << "bt " << i << "/" << j << " " << B[i][j].type << " " << B[i][j].i << "," << B[i][j].j << endl ;
427 428 429
    
    int next_i = B[i][j].i;
    int next_j = B[i][j].j;
Mikaël Salson's avatar
Mikaël Salson committed
430

431 432
    // The maximum number of characters implied by this operation
    int max_k = max(i - next_i, j - next_j);
Mikaël Salson's avatar
Mikaël Salson committed
433
      
434 435 436
    for (int k=0; k<max_k; k++)
      {
        linkgap[g1]=g2;
437 438 439

        // Some character(s) from x, then fill with spaces
        if (i-k > next_i)
440
          {
441
            back_x << x[i-1-k];
442 443 444 445
            g1--;
          }
        else
          {
446
            back_x << " " ;
447 448
            gap1[g1]++ ;
          }
449 450 451

        // Some character(s) from y, then fill with spaces
        if (j-k > next_j)
452
          {
453
            back_y << y[j-1-k];
454 455 456 457
            g2--;
          }
        else
          {
458
            back_y << " " ;
459 460
            gap2[g2]++ ;
          }
461 462 463 464 465 466

        // The operation character, then fill with spaces
        if (k == max_k-1)          
          back_tr << B[i][j].type ;
        else
          back_tr << " " ;
467
      }
Mikaël Salson's avatar
Mikaël Salson committed
468
    
469 470
    i = next_i;
    j = next_j;
Mikaël Salson's avatar
Mikaël Salson committed
471 472
  }

473 474 475
  // Format backtrack strings
  
  string str1, str2, str3;
476
  str1 = back_x.str();
Mikaël Salson's avatar
Mikaël Salson committed
477 478 479 480
  str1 =string (str1.rbegin(), str1.rend());
  str1 = str1;
  str2=back_tr.str();
  str2 = string (str2.rbegin(), str2.rend());
481
  str3 = back_y.str();
Mikaël Salson's avatar
Mikaël Salson committed
482
  str3 = string (str3.rbegin(), str3.rend());
483 484

  ostringstream back;    
485 486 487
  back << setw(3) << i   << " " << str1.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << " " << " " << str2.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << j   << " " << str3.substr(0, BACKSIZE-8) << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
488 489 490 491 492
  for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){
    back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl;
  }
493
  back << "score: " << best_score << endl;
Mikaël Salson's avatar
Mikaël Salson committed
494
  
495 496
  first_i=i;
  first_j=j;
Mikaël Salson's avatar
Mikaël Salson committed
497 498
  
  str_back=back.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
499 500

  // cout << str_back << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
}


float identity_percent(int score)
{
  int match = (score + IdentityDirty.match/2) / IdentityDirty.match ;

  int mismatch = (score - match * IdentityDirty.match) / IdentityDirty.mismatch ;

  float pcent = (100. * (float) match) / (match + mismatch) ;

  // cout << score << "  " << match << "  " << mismatch << "  " << pcent << endl ; 

  return pcent ;
}


ostream& operator<<(ostream& out, const DynProg& dp)
{
  out << "       " ;

  for (int j=0; j<dp.n; j++)
    out << setw(4) << dp.y[j] << " ";

  out << endl ;

  for (int i=0; i<=dp.m; i++)
    {
      if (i)
	out << dp.x[i-1] << " " ;
      else
	out << "  " ;

      for (int j=0; j<=dp.n; j++)
535
        {
536 537
          if (dp.B[i][j].score)
            out << setw(4) << dp.B[i][j].score << dp.B[i][j].type ;
538 539 540
          else
            out << "    " << dp.B[i][j].type ;
        }
Mikaël Salson's avatar
Mikaël Salson committed
541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588
      out << endl ;      
    }
  
  out << "best: " << dp.best_i << "," << dp.best_j << endl;

  return out ;
}


Cost strToCost(string str, Cost default_cost){

  if (str.length()!=0){
    string::size_type stTemp = str.find(',');
	  
    std::list<int> val;
    
    while(stTemp != string::npos)
    {
	  val.push_back(atoi(str.substr(0, stTemp).c_str() ) );
	  str = str.substr(stTemp + 1);
	  stTemp = str.find(',');
    }
    val.push_back(atoi(str.c_str() ) );

    if (val.size()==5){ 
      int val1=val.front();
      val.pop_front();
      int val2=val.front();
      val.pop_front();
      int val3=val.front();
      val.pop_front();
      int val4=val.front();
      val.pop_front();
      int val5=val.front();
      val.pop_front();
      Cost result = Cost(val1, val2, val3, val4, val5);
      cout << "use custom Cost "<< result << endl;
      return result;
    }
    
    cout << "incorrect Cost format, use default "<< default_cost<<endl;
  }
  
  return default_cost;
}