dynprog.cpp 13.3 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include "dynprog.h"
#include "tools.h"
#include <cassert>
#include <list>
#include <cstdlib>
#include <string>

31 32 33 34 35 36 37 38
#define SUBST '|'
#define MISMATCH '.'
#define INSER 'i'
#define DELET 'd'
#define HOMO2X 'h'
#define HOMO2Y 'h'
#define FIN ' '
#define BEGIN 'B'
Mikaël Salson's avatar
Mikaël Salson committed
39
#define BACKSIZE 120
Mikaël Salson's avatar
Mikaël Salson committed
40 41 42 43 44 45 46 47 48 49 50 51

using namespace std;


Cost::Cost(int match, int mismatch, int indel, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = indel ;
  this -> deletion = indel ;
  this -> deletion_end = del_end ;
  this -> homopolymer = (homopolymer == MINUS_INF ? indel: homopolymer);
52

53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
  this -> open_insertion = this -> open_deletion = MINUS_INF ;
  this -> extend_insertion = this -> extend_deletion = MINUS_INF ;
  this -> affine_gap = false ;
}

Cost::Cost(int match, int mismatch, int open_gap, int extend_gap, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = MINUS_INF ;
  this -> deletion = MINUS_INF ;
  this -> deletion_end = del_end ;
  this -> homopolymer = homopolymer ;

  this -> open_insertion = this -> open_deletion = open_gap ;
  this -> extend_insertion = this -> extend_deletion = extend_gap ;
69
  this -> affine_gap = true ;
Mikaël Salson's avatar
Mikaël Salson committed
70 71 72 73 74 75 76 77 78
}


ostream& operator<<(ostream& out, const Cost& cost)
{
  out << "(" << cost.match 
      << ", " << cost.mismatch
      << "/" << cost.insertion
      << "/" << cost.deletion
79 80
      << ", " << cost.open_insertion << cost.extend_insertion
      << "/" << cost.open_deletion << cost.extend_deletion
Mikaël Salson's avatar
Mikaël Salson committed
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
      << ", " << cost.deletion_end
      << ", " << cost.homopolymer
      << ")" ;
  return out;
}

Cost::Cost()
{
}

int Cost::substitution(char a, char b)
{
  return (a == b) ? match : mismatch ;
}

int Cost::homo2(char xa, char xb, char y)
{
  return ((xa == xb) && (xb == y)) ? homopolymer : MINUS_INF ;
}


DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost, const bool reverse_x, const bool reverse_y)
{
  this -> x = reverse_x ? reverse(x) : x ;
  this -> y = reverse_y ? reverse(y) : y ;

  this -> reverse_x = reverse_x ;
  this -> reverse_y = reverse_y ;

  m = x.size();
  n = y.size();
  // cout << "Dynprog " << x << "(" << m << ") / " << y << "(" << n << ")" << endl ;

114
  B = new operation*[m+1];
Mikaël Salson's avatar
Mikaël Salson committed
115
  for (int i = 0; i <= m; i++) {
116
    B[i] = new operation[n+1];
Mikaël Salson's avatar
Mikaël Salson committed
117
  }
118 119 120 121 122 123 124 125
  Bins = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bins[i] = new operation[n+1];
  }
  Bdel = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bdel[i] = new operation[n+1];
  }  
Mikaël Salson's avatar
Mikaël Salson committed
126 127 128 129 130 131 132
  this -> mode = mode;
  this -> cost = cost;

  this -> best_i = -1 ;
  this -> best_j = -1 ;
  this -> first_i = -1 ;
  this -> first_j = -1 ;
Mikaël Salson's avatar
Mikaël Salson committed
133 134 135 136
  
  gap1=NULL;
  gap2=NULL;
  linkgap=NULL;
Mikaël Salson's avatar
Mikaël Salson committed
137 138 139 140 141

  init();
}

DynProg::~DynProg() {
142

Mikaël Salson's avatar
Mikaël Salson committed
143 144
  for (int i = 0; i <= m; i++) {
    delete [] B[i];
145 146
    delete [] Bins[i];
    delete [] Bdel[i];
Mikaël Salson's avatar
Mikaël Salson committed
147
  }
148
  delete [] B;  
149 150
  delete [] Bins;
  delete [] Bdel;  
Mikaël Salson's avatar
Mikaël Salson committed
151 152 153
  delete [] gap1;
  delete [] gap2;
  delete [] linkgap;
Mikaël Salson's avatar
Mikaël Salson committed
154 155 156
}

void DynProg::init()
157 158 159 160 161 162
{
  for (int i=1; i<=m; i++)
    {
      B[i][0].type = INSER ;
      B[i][0].i = i-1 ;
      B[i][0].j = 0 ;
163
      Bdel[i][0].score = Bins[i][0].score = MINUS_INF ;
164 165 166 167 168 169 170
    }

  for (int j=1; j<=n; j++)
    {
      B[0][j].type = DELET ;    
      B[0][j].i = 0 ;
      B[0][j].j = j-1 ;
171
      Bdel[0][j].score = Bins[0][j].score = MINUS_INF ;
172 173
    }

Mikaël Salson's avatar
Mikaël Salson committed
174 175
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    for (int i=0; i<=m; i++)
176
      B[i][0].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
177 178
  else if (mode == GlobalButMostlyLocal)
    for (int i=0; i<=m; i++)
179
      B[i][0].score = i * cost.insertion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
180 181
  else // Global, SemiGlobal
    for (int i=0; i<=m; i++)
182
      B[i][0].score = i * cost.insertion ;
Mikaël Salson's avatar
Mikaël Salson committed
183 184 185
   
  if (mode == SemiGlobal || mode == SemiGlobalTrans || mode == Local || mode == LocalEndWithSomeDeletions)
    for (int j=0; j<=n; j++)
186
      B[0][j].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
187 188
  else if (mode == GlobalButMostlyLocal)
    for (int j=0; j<=n; j++)
189
      B[0][j].score = j * cost.deletion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
190
  else // Global
Mikaël Salson's avatar
Mikaël Salson committed
191
    for (int j=0; j<=n; j++)
192
      B[0][j].score = j * cost.deletion ;
Mikaël Salson's avatar
Mikaël Salson committed
193 194
}

195 196 197 198 199 200 201 202 203 204 205
inline void try_operation(operation &best, int type, int i, int j, int score)
{
  if (score > best.score)
    {
      best.type = type ;
      best.i = i ;
      best.j = j ;
      best.score = score ;
    }
}

Mikaël Salson's avatar
Mikaël Salson committed
206 207 208
int DynProg::compute()
{
  best_score = MINUS_INF ;
209 210 211 212
  best_i = 0 ;
  best_j = 0 ;
  
  operation best ;
Mikaël Salson's avatar
Mikaël Salson committed
213 214 215 216

  for (int i=1; i<=m; i++)
    for (int j=1; j<=n; j++)
    {
217
      best.score = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
218

219
      // The edit operations, with their backtracking information and their score
220 221

      // Match, mismatch
222
      try_operation(best, SUBST, i-1, j-1, B[i-1][j-1].score + cost.substitution(x[i-1], y[j-1]));
223

224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
      if (!cost.affine_gap)
        {
          // Regular indels
          try_operation(best, INSER, i-1, j  , B[i-1][j  ].score + cost.insertion);
          try_operation(best, DELET, i  , j-1, B[i  ][j-1].score + cost.deletion);
        }
      else
        {
          // Gotoh affine gaps - insertion
          Bins[i][j].score = MINUS_INF ;
          try_operation(Bins[i][j], 'o', i-1, j, B[i-1][j].score + cost.open_insertion);
          try_operation(Bins[i][j], 'x', Bins[i-1][j].i, j, Bins[i-1][j].score + cost.extend_insertion);
          try_operation(best, INSER, Bins[i][j].i, j ,Bins[i][j].score);

          // Gotoh affine gaps - deletions
          Bdel[i][j].score = MINUS_INF ;
          try_operation(Bdel[i][j], 'o', i, j-1, B[i][j-1].score + cost.open_deletion);
          try_operation(Bdel[i][j], 'x', i, Bdel[i][j-1].j, Bdel[i-1][j].score + cost.extend_deletion);
          try_operation(best, DELET, i, Bdel[i][j].j,  Bdel[i][j].score);
        }

      // Homopolymers
246 247
      try_operation(best, HOMO2X, i-2, j-1, i > 1 ? B[i-2][j-1].score + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF);
      try_operation(best, HOMO2Y, i-1, j-2, j > 1 ? B[i-1][j-2].score + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF);
248
      
249
      // Local alignment
250 251
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	try_operation(best, BEGIN, 0, 0, 0);
Mikaël Salson's avatar
Mikaël Salson committed
252

253 254
      if ((best.type == SUBST) && (x[i-1] != y[j-1]))
        best.type = MISMATCH ;
Mikaël Salson's avatar
Mikaël Salson committed
255

256 257
      // Fill the score and the backtrack information
      B[i][j] = best ;
Mikaël Salson's avatar
Mikaël Salson committed
258
      
259 260
      // cout << " " << i << "," << j << " " << best_op.type << " " <<  best_op.i << "," << best_op.j << " "  << best_op.score << " " << best << endl ;

Mikaël Salson's avatar
Mikaël Salson committed
261 262
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
263
	  int tbest = best.score ;
Mikaël Salson's avatar
Mikaël Salson committed
264 265 266 267 268 269 270 271 272 273 274

	  if (mode == LocalEndWithSomeDeletions)
	    tbest += cost.deletion_end*(n-j);   
     
	  if (tbest > best_score)
	    {
	      best_score = tbest ;
	      best_i = i ;
	      best_j = j ;
	    }
	    
275
	  if (best.score == 0){
276
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
277 278
	  }
	}
Mikaël Salson's avatar
Mikaël Salson committed
279

Mikaël Salson's avatar
Mikaël Salson committed
280 281
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
282
      	  if (best.score == 0){
283
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
284 285 286 287 288
	  }
	}
      
    }

289 290
  // End. Find best_i and best_j, put FIN keywords where the backtrack should stop
  
Mikaël Salson's avatar
Mikaël Salson committed
291 292
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    {
293 294
      for (int j=0; j<=n; j++){
	B[0][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
295 296
      }
      for (int i=0; i<=m; i++){
297
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
298 299 300 301 302 303 304 305
      }
      
    }
  if (mode == SemiGlobal)
    {
      best_i = m ;
      
      for (int j=1; j<=n; j++)
306
	if (B[m][j].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
307
	  {
308
	    best_score = B[m][j].score ;
Mikaël Salson's avatar
Mikaël Salson committed
309 310 311
	    best_j = j ;
	  }
      for (int i=0; i<=m; i++){
312
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
313 314 315 316 317 318 319 320
      }
    }

  if (mode == SemiGlobalTrans)
    {
      best_j = n ;
      
      for (int i=1; i<=m; i++)
321
	if (B[i][n].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
322
	  {
323
	    best_score = B[i][n].score ;
Mikaël Salson's avatar
Mikaël Salson committed
324 325 326 327
	    best_i = i ;
	  }
	  
      for (int i=0; i<=n; i++){
328
	B[0][i].type = FIN;
329
      }     
Mikaël Salson's avatar
Mikaël Salson committed
330 331
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
332
  if ((mode == Global) || (mode == GlobalButMostlyLocal))
Mikaël Salson's avatar
Mikaël Salson committed
333 334 335
    {
      best_i = m ;
      best_j = n ;
336
      best_score = B[m][n].score;
Mikaël Salson's avatar
Mikaël Salson committed
337 338 339 340 341 342 343 344
    }

  if (reverse_x)
    best_i = m - best_i + 1 ;

  if (reverse_y)
    best_j = n - best_j + 1;

345 346
  B[0][0].type = FIN;
  
Mikaël Salson's avatar
Mikaël Salson committed
347 348 349 350 351 352 353 354 355
  // In the matrix positions start at 1, but start positions may start at 0
  best_i -= 1;   
  best_j -= 1;

  return best_score ;
}

void DynProg::backtrack()
{
356
  // Tables for displaying the alignment
Mikaël Salson's avatar
Mikaël Salson committed
357 358 359 360
  gap1 = new int[x.size()+1];
  linkgap = new int[x.size()+1];
  gap2 = new int[y.size()+1];
      
361
  for (unsigned int i = 0; i <=x.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
362 363 364
    gap1[i] = 0;
    linkgap[i] = 0;
  }
365
  for (unsigned int i = 0; i <= y.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
366 367 368 369 370 371 372
    gap2[i] = 0;
  }
  
  
  int g1=x.size();
  int g2=y.size();
  
373 374
  int i=best_i+1;
  int j=best_j+1;
375

376
  // Compute backtrack strings
Mikaël Salson's avatar
Mikaël Salson committed
377
  
378 379
  ostringstream back_x;
  ostringstream back_y;
Mikaël Salson's avatar
Mikaël Salson committed
380 381
  ostringstream back_tr;
  
382 383 384 385 386 387
  while (1) {

    if ((i<0) || (j<0))
      { 
        cout << "Invalid backtrack: " << i << "," << j << endl ;
        exit(1);
Mikaël Salson's avatar
Mikaël Salson committed
388
      }
389 390 391

    if  (B[i][j].type == FIN)
      break ;
Mikaël Salson's avatar
Mikaël Salson committed
392
      
393
    // cout << "bt " << i << "/" << j << " " << B[i][j].type << " " << B[i][j].i << "," << B[i][j].j << endl ;
394 395 396
    
    int next_i = B[i][j].i;
    int next_j = B[i][j].j;
Mikaël Salson's avatar
Mikaël Salson committed
397

398 399
    // The maximum number of characters implied by this operation
    int max_k = max(i - next_i, j - next_j);
Mikaël Salson's avatar
Mikaël Salson committed
400
      
401 402 403
    for (int k=0; k<max_k; k++)
      {
        linkgap[g1]=g2;
404 405 406

        // Some character(s) from x, then fill with spaces
        if (i-k > next_i)
407
          {
408
            back_x << x[i-1-k];
409 410 411 412
            g1--;
          }
        else
          {
413
            back_x << " " ;
414 415
            gap1[g1]++ ;
          }
416 417 418

        // Some character(s) from y, then fill with spaces
        if (j-k > next_j)
419
          {
420
            back_y << y[j-1-k];
421 422 423 424
            g2--;
          }
        else
          {
425
            back_y << " " ;
426 427
            gap2[g2]++ ;
          }
428 429 430 431 432 433

        // The operation character, then fill with spaces
        if (k == max_k-1)          
          back_tr << B[i][j].type ;
        else
          back_tr << " " ;
434
      }
Mikaël Salson's avatar
Mikaël Salson committed
435
    
436 437
    i = next_i;
    j = next_j;
Mikaël Salson's avatar
Mikaël Salson committed
438 439
  }

440 441 442
  // Format backtrack strings
  
  string str1, str2, str3;
443
  str1 = back_x.str();
Mikaël Salson's avatar
Mikaël Salson committed
444 445 446 447
  str1 =string (str1.rbegin(), str1.rend());
  str1 = str1;
  str2=back_tr.str();
  str2 = string (str2.rbegin(), str2.rend());
448
  str3 = back_y.str();
Mikaël Salson's avatar
Mikaël Salson committed
449
  str3 = string (str3.rbegin(), str3.rend());
450 451

  ostringstream back;    
452 453 454
  back << setw(3) << i   << " " << str1.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << " " << " " << str2.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << j   << " " << str3.substr(0, BACKSIZE-8) << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
455 456 457 458 459
  for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){
    back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl;
  }
460
  back << "score: " << best_score << endl;
Mikaël Salson's avatar
Mikaël Salson committed
461
  
462 463
  first_i=i;
  first_j=j;
Mikaël Salson's avatar
Mikaël Salson committed
464 465
  
  str_back=back.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
466 467

  // cout << str_back << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501
}


float identity_percent(int score)
{
  int match = (score + IdentityDirty.match/2) / IdentityDirty.match ;

  int mismatch = (score - match * IdentityDirty.match) / IdentityDirty.mismatch ;

  float pcent = (100. * (float) match) / (match + mismatch) ;

  // cout << score << "  " << match << "  " << mismatch << "  " << pcent << endl ; 

  return pcent ;
}


ostream& operator<<(ostream& out, const DynProg& dp)
{
  out << "       " ;

  for (int j=0; j<dp.n; j++)
    out << setw(4) << dp.y[j] << " ";

  out << endl ;

  for (int i=0; i<=dp.m; i++)
    {
      if (i)
	out << dp.x[i-1] << " " ;
      else
	out << "  " ;

      for (int j=0; j<=dp.n; j++)
502
        {
503 504
          if (dp.B[i][j].score)
            out << setw(4) << dp.B[i][j].score << dp.B[i][j].type ;
505 506 507
          else
            out << "    " << dp.B[i][j].type ;
        }
Mikaël Salson's avatar
Mikaël Salson committed
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555
      out << endl ;      
    }
  
  out << "best: " << dp.best_i << "," << dp.best_j << endl;

  return out ;
}


Cost strToCost(string str, Cost default_cost){

  if (str.length()!=0){
    string::size_type stTemp = str.find(',');
	  
    std::list<int> val;
    
    while(stTemp != string::npos)
    {
	  val.push_back(atoi(str.substr(0, stTemp).c_str() ) );
	  str = str.substr(stTemp + 1);
	  stTemp = str.find(',');
    }
    val.push_back(atoi(str.c_str() ) );

    if (val.size()==5){ 
      int val1=val.front();
      val.pop_front();
      int val2=val.front();
      val.pop_front();
      int val3=val.front();
      val.pop_front();
      int val4=val.front();
      val.pop_front();
      int val5=val.front();
      val.pop_front();
      Cost result = Cost(val1, val2, val3, val4, val5);
      cout << "use custom Cost "<< result << endl;
      return result;
    }
    
    cout << "incorrect Cost format, use default "<< default_cost<<endl;
  }
  
  return default_cost;
}