dynprog.cpp 12.9 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include "dynprog.h"
#include "tools.h"
#include <cassert>
#include <list>
#include <cstdlib>
#include <string>

31 32 33 34 35 36 37 38
#define SUBST '|'
#define MISMATCH '.'
#define INSER 'i'
#define DELET 'd'
#define HOMO2X 'h'
#define HOMO2Y 'h'
#define FIN ' '
#define BEGIN 'B'
Mikaël Salson's avatar
Mikaël Salson committed
39
#define BACKSIZE 120
Mikaël Salson's avatar
Mikaël Salson committed
40 41 42 43 44 45 46 47 48 49 50 51

using namespace std;


Cost::Cost(int match, int mismatch, int indel, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = indel ;
  this -> deletion = indel ;
  this -> deletion_end = del_end ;
  this -> homopolymer = (homopolymer == MINUS_INF ? indel: homopolymer);
52 53 54 55

  this -> open_insertion = this -> open_deletion = -15 ;
  this -> extend_insertion = this -> extend_deletion = -1 ;
  this -> affine_gap = true ;
Mikaël Salson's avatar
Mikaël Salson committed
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
}


ostream& operator<<(ostream& out, const Cost& cost)
{
  out << "(" << cost.match 
      << ", " << cost.mismatch
      << "/" << cost.insertion
      << "/" << cost.deletion
      << ", " << cost.deletion_end
      << ", " << cost.homopolymer
      << ")" ;
  return out;
}

Cost::Cost()
{
}

int Cost::substitution(char a, char b)
{
  return (a == b) ? match : mismatch ;
}

/*
int Cost::ins(char current, char next)
{
  return (next == current) ? homopolymer : insertion ;
}

int Cost::del(char current, char next)
{
  return (next == current) ? homopolymer : deletion ;
}
*/

int Cost::homo2(char xa, char xb, char y)
{
  return ((xa == xb) && (xb == y)) ? homopolymer : MINUS_INF ;
}


DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost, const bool reverse_x, const bool reverse_y)
{
  this -> x = reverse_x ? reverse(x) : x ;
  this -> y = reverse_y ? reverse(y) : y ;

  this -> reverse_x = reverse_x ;
  this -> reverse_y = reverse_y ;

  m = x.size();
  n = y.size();
  // cout << "Dynprog " << x << "(" << m << ") / " << y << "(" << n << ")" << endl ;

110
  B = new operation*[m+1];
Mikaël Salson's avatar
Mikaël Salson committed
111
  for (int i = 0; i <= m; i++) {
112
    B[i] = new operation[n+1];
Mikaël Salson's avatar
Mikaël Salson committed
113
  }
114 115 116 117 118 119 120 121
  Bins = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bins[i] = new operation[n+1];
  }
  Bdel = new operation*[m+1];
  for (int i = 0; i <= m; i++) {
    Bdel[i] = new operation[n+1];
  }  
Mikaël Salson's avatar
Mikaël Salson committed
122 123 124 125 126 127 128
  this -> mode = mode;
  this -> cost = cost;

  this -> best_i = -1 ;
  this -> best_j = -1 ;
  this -> first_i = -1 ;
  this -> first_j = -1 ;
Mikaël Salson's avatar
Mikaël Salson committed
129 130 131 132
  
  gap1=NULL;
  gap2=NULL;
  linkgap=NULL;
Mikaël Salson's avatar
Mikaël Salson committed
133 134 135 136 137

  init();
}

DynProg::~DynProg() {
138

Mikaël Salson's avatar
Mikaël Salson committed
139 140
  for (int i = 0; i <= m; i++) {
    delete [] B[i];
141 142
    delete [] Bins[i];
    delete [] Bdel[i];
Mikaël Salson's avatar
Mikaël Salson committed
143
  }
144
  delete [] B;  
145 146
  delete [] Bins;
  delete [] Bdel;  
Mikaël Salson's avatar
Mikaël Salson committed
147 148 149
  delete [] gap1;
  delete [] gap2;
  delete [] linkgap;
Mikaël Salson's avatar
Mikaël Salson committed
150 151 152
}

void DynProg::init()
153 154 155 156 157 158
{
  for (int i=1; i<=m; i++)
    {
      B[i][0].type = INSER ;
      B[i][0].i = i-1 ;
      B[i][0].j = 0 ;
159
      Bdel[i][0].score = Bins[i][0].score = MINUS_INF ;
160 161 162 163 164 165 166
    }

  for (int j=1; j<=n; j++)
    {
      B[0][j].type = DELET ;    
      B[0][j].i = 0 ;
      B[0][j].j = j-1 ;
167
      Bdel[0][j].score = Bins[0][j].score = MINUS_INF ;
168 169
    }

Mikaël Salson's avatar
Mikaël Salson committed
170 171
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    for (int i=0; i<=m; i++)
172
      B[i][0].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
173 174
  else if (mode == GlobalButMostlyLocal)
    for (int i=0; i<=m; i++)
175
      B[i][0].score = i * cost.insertion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
176 177
  else // Global, SemiGlobal
    for (int i=0; i<=m; i++)
178
      B[i][0].score = i * cost.insertion ;
Mikaël Salson's avatar
Mikaël Salson committed
179 180 181
   
  if (mode == SemiGlobal || mode == SemiGlobalTrans || mode == Local || mode == LocalEndWithSomeDeletions)
    for (int j=0; j<=n; j++)
182
      B[0][j].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
183 184
  else if (mode == GlobalButMostlyLocal)
    for (int j=0; j<=n; j++)
185
      B[0][j].score = j * cost.deletion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
186
  else // Global
Mikaël Salson's avatar
Mikaël Salson committed
187
    for (int j=0; j<=n; j++)
188
      B[0][j].score = j * cost.deletion ;
Mikaël Salson's avatar
Mikaël Salson committed
189 190
}

191 192 193 194 195 196 197 198 199 200 201
inline void try_operation(operation &best, int type, int i, int j, int score)
{
  if (score > best.score)
    {
      best.type = type ;
      best.i = i ;
      best.j = j ;
      best.score = score ;
    }
}

Mikaël Salson's avatar
Mikaël Salson committed
202 203 204
int DynProg::compute()
{
  best_score = MINUS_INF ;
205 206 207 208
  best_i = 0 ;
  best_j = 0 ;
  
  operation best ;
Mikaël Salson's avatar
Mikaël Salson committed
209 210 211 212

  for (int i=1; i<=m; i++)
    for (int j=1; j<=n; j++)
    {
213
      best.score = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
214

215
      // The edit operations, with their backtracking information and their score
216 217

      // Match, mismatch
218
      try_operation(best, SUBST, i-1, j-1, B[i-1][j-1].score + cost.substitution(x[i-1], y[j-1]));
219

220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
      if (!cost.affine_gap)
        {
          // Regular indels
          try_operation(best, INSER, i-1, j  , B[i-1][j  ].score + cost.insertion);
          try_operation(best, DELET, i  , j-1, B[i  ][j-1].score + cost.deletion);
        }
      else
        {
          // Gotoh affine gaps - insertion
          Bins[i][j].score = MINUS_INF ;
          try_operation(Bins[i][j], 'o', i-1, j, B[i-1][j].score + cost.open_insertion);
          try_operation(Bins[i][j], 'x', Bins[i-1][j].i, j, Bins[i-1][j].score + cost.extend_insertion);
          try_operation(best, INSER, Bins[i][j].i, j ,Bins[i][j].score);

          // Gotoh affine gaps - deletions
          Bdel[i][j].score = MINUS_INF ;
          try_operation(Bdel[i][j], 'o', i, j-1, B[i][j-1].score + cost.open_deletion);
          try_operation(Bdel[i][j], 'x', i, Bdel[i][j-1].j, Bdel[i-1][j].score + cost.extend_deletion);
          try_operation(best, DELET, i, Bdel[i][j].j,  Bdel[i][j].score);
        }

      // Homopolymers
242 243
      try_operation(best, HOMO2X, i-2, j-1, i > 1 ? B[i-2][j-1].score + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF);
      try_operation(best, HOMO2Y, i-1, j-2, j > 1 ? B[i-1][j-2].score + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF);
244
      
245
      // Local alignment
246 247
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	try_operation(best, BEGIN, 0, 0, 0);
Mikaël Salson's avatar
Mikaël Salson committed
248

249 250
      if ((best.type == SUBST) && (x[i-1] != y[j-1]))
        best.type = MISMATCH ;
Mikaël Salson's avatar
Mikaël Salson committed
251

252 253
      // Fill the score and the backtrack information
      B[i][j] = best ;
Mikaël Salson's avatar
Mikaël Salson committed
254
      
255 256
      // cout << " " << i << "," << j << " " << best_op.type << " " <<  best_op.i << "," << best_op.j << " "  << best_op.score << " " << best << endl ;

Mikaël Salson's avatar
Mikaël Salson committed
257 258
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
259
	  int tbest = best.score ;
Mikaël Salson's avatar
Mikaël Salson committed
260 261 262 263 264 265 266 267 268 269 270

	  if (mode == LocalEndWithSomeDeletions)
	    tbest += cost.deletion_end*(n-j);   
     
	  if (tbest > best_score)
	    {
	      best_score = tbest ;
	      best_i = i ;
	      best_j = j ;
	    }
	    
271
	  if (best.score == 0){
272
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
273 274
	  }
	}
Mikaël Salson's avatar
Mikaël Salson committed
275

Mikaël Salson's avatar
Mikaël Salson committed
276 277
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
278
      	  if (best.score == 0){
279
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
280 281 282 283 284
	  }
	}
      
    }

285 286
  // End. Find best_i and best_j, put FIN keywords where the backtrack should stop
  
Mikaël Salson's avatar
Mikaël Salson committed
287 288
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    {
289 290
      for (int j=0; j<=n; j++){
	B[0][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
291 292
      }
      for (int i=0; i<=m; i++){
293
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
294 295 296 297 298 299 300 301
      }
      
    }
  if (mode == SemiGlobal)
    {
      best_i = m ;
      
      for (int j=1; j<=n; j++)
302
	if (B[m][j].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
303
	  {
304
	    best_score = B[m][j].score ;
Mikaël Salson's avatar
Mikaël Salson committed
305 306 307
	    best_j = j ;
	  }
      for (int i=0; i<=m; i++){
308
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
309 310 311 312 313 314 315 316
      }
    }

  if (mode == SemiGlobalTrans)
    {
      best_j = n ;
      
      for (int i=1; i<=m; i++)
317
	if (B[i][n].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
318
	  {
319
	    best_score = B[i][n].score ;
Mikaël Salson's avatar
Mikaël Salson committed
320 321 322 323
	    best_i = i ;
	  }
	  
      for (int i=0; i<=n; i++){
324
	B[0][i].type = FIN;
325
      }     
Mikaël Salson's avatar
Mikaël Salson committed
326 327
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
328
  if ((mode == Global) || (mode == GlobalButMostlyLocal))
Mikaël Salson's avatar
Mikaël Salson committed
329 330 331
    {
      best_i = m ;
      best_j = n ;
332
      best_score = B[m][n].score;
Mikaël Salson's avatar
Mikaël Salson committed
333 334 335 336 337 338 339 340
    }

  if (reverse_x)
    best_i = m - best_i + 1 ;

  if (reverse_y)
    best_j = n - best_j + 1;

341 342
  B[0][0].type = FIN;
  
Mikaël Salson's avatar
Mikaël Salson committed
343 344 345 346 347 348 349 350 351
  // In the matrix positions start at 1, but start positions may start at 0
  best_i -= 1;   
  best_j -= 1;

  return best_score ;
}

void DynProg::backtrack()
{
352
  // Tables for displaying the alignment
Mikaël Salson's avatar
Mikaël Salson committed
353 354 355 356
  gap1 = new int[x.size()+1];
  linkgap = new int[x.size()+1];
  gap2 = new int[y.size()+1];
      
357
  for (unsigned int i = 0; i <=x.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
358 359 360
    gap1[i] = 0;
    linkgap[i] = 0;
  }
361
  for (unsigned int i = 0; i <= y.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
362 363 364 365 366 367 368
    gap2[i] = 0;
  }
  
  
  int g1=x.size();
  int g2=y.size();
  
369 370
  int i=best_i+1;
  int j=best_j+1;
371

372
  // Compute backtrack strings
Mikaël Salson's avatar
Mikaël Salson committed
373
  
374 375
  ostringstream back_x;
  ostringstream back_y;
Mikaël Salson's avatar
Mikaël Salson committed
376 377
  ostringstream back_tr;
  
378 379 380 381 382 383
  while (1) {

    if ((i<0) || (j<0))
      { 
        cout << "Invalid backtrack: " << i << "," << j << endl ;
        exit(1);
Mikaël Salson's avatar
Mikaël Salson committed
384
      }
385 386 387

    if  (B[i][j].type == FIN)
      break ;
Mikaël Salson's avatar
Mikaël Salson committed
388
      
389
    // cout << "bt " << i << "/" << j << " " << B[i][j].type << " " << B[i][j].i << "," << B[i][j].j << endl ;
390 391 392
    
    int next_i = B[i][j].i;
    int next_j = B[i][j].j;
Mikaël Salson's avatar
Mikaël Salson committed
393

394 395
    // The maximum number of characters implied by this operation
    int max_k = max(i - next_i, j - next_j);
Mikaël Salson's avatar
Mikaël Salson committed
396
      
397 398 399
    for (int k=0; k<max_k; k++)
      {
        linkgap[g1]=g2;
400 401 402

        // Some character(s) from x, then fill with spaces
        if (i-k > next_i)
403
          {
404
            back_x << x[i-1-k];
405 406 407 408
            g1--;
          }
        else
          {
409
            back_x << " " ;
410 411
            gap1[g1]++ ;
          }
412 413 414

        // Some character(s) from y, then fill with spaces
        if (j-k > next_j)
415
          {
416
            back_y << y[j-1-k];
417 418 419 420
            g2--;
          }
        else
          {
421
            back_y << " " ;
422 423
            gap2[g2]++ ;
          }
424 425 426 427 428 429

        // The operation character, then fill with spaces
        if (k == max_k-1)          
          back_tr << B[i][j].type ;
        else
          back_tr << " " ;
430
      }
Mikaël Salson's avatar
Mikaël Salson committed
431
    
432 433
    i = next_i;
    j = next_j;
Mikaël Salson's avatar
Mikaël Salson committed
434 435
  }

436 437 438
  // Format backtrack strings
  
  string str1, str2, str3;
439
  str1 = back_x.str();
Mikaël Salson's avatar
Mikaël Salson committed
440 441 442 443
  str1 =string (str1.rbegin(), str1.rend());
  str1 = str1;
  str2=back_tr.str();
  str2 = string (str2.rbegin(), str2.rend());
444
  str3 = back_y.str();
Mikaël Salson's avatar
Mikaël Salson committed
445
  str3 = string (str3.rbegin(), str3.rend());
446 447

  ostringstream back;    
448 449 450
  back << setw(3) << i   << " " << str1.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << " " << " " << str2.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << j   << " " << str3.substr(0, BACKSIZE-8) << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
451 452 453 454 455
  for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){
    back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl;
  }
456
  back << "score: " << best_score << endl;
Mikaël Salson's avatar
Mikaël Salson committed
457
  
458 459
  first_i=i;
  first_j=j;
Mikaël Salson's avatar
Mikaël Salson committed
460 461
  
  str_back=back.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
462 463

  // cout << str_back << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
}


float identity_percent(int score)
{
  int match = (score + IdentityDirty.match/2) / IdentityDirty.match ;

  int mismatch = (score - match * IdentityDirty.match) / IdentityDirty.mismatch ;

  float pcent = (100. * (float) match) / (match + mismatch) ;

  // cout << score << "  " << match << "  " << mismatch << "  " << pcent << endl ; 

  return pcent ;
}


ostream& operator<<(ostream& out, const DynProg& dp)
{
  out << "       " ;

  for (int j=0; j<dp.n; j++)
    out << setw(4) << dp.y[j] << " ";

  out << endl ;

  for (int i=0; i<=dp.m; i++)
    {
      if (i)
	out << dp.x[i-1] << " " ;
      else
	out << "  " ;

      for (int j=0; j<=dp.n; j++)
498
        {
499 500
          if (dp.B[i][j].score)
            out << setw(4) << dp.B[i][j].score << dp.B[i][j].type ;
501 502 503
          else
            out << "    " << dp.B[i][j].type ;
        }
Mikaël Salson's avatar
Mikaël Salson committed
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
      out << endl ;      
    }
  
  out << "best: " << dp.best_i << "," << dp.best_j << endl;

  return out ;
}


Cost strToCost(string str, Cost default_cost){

  if (str.length()!=0){
    string::size_type stTemp = str.find(',');
	  
    std::list<int> val;
    
    while(stTemp != string::npos)
    {
	  val.push_back(atoi(str.substr(0, stTemp).c_str() ) );
	  str = str.substr(stTemp + 1);
	  stTemp = str.find(',');
    }
    val.push_back(atoi(str.c_str() ) );

    if (val.size()==5){ 
      int val1=val.front();
      val.pop_front();
      int val2=val.front();
      val.pop_front();
      int val3=val.front();
      val.pop_front();
      int val4=val.front();
      val.pop_front();
      int val5=val.front();
      val.pop_front();
      Cost result = Cost(val1, val2, val3, val4, val5);
      cout << "use custom Cost "<< result << endl;
      return result;
    }
    
    cout << "incorrect Cost format, use default "<< default_cost<<endl;
  }
  
  return default_cost;
}