dynprog.cpp 11.5 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include "dynprog.h"
#include "tools.h"
#include <cassert>
#include <list>
#include <cstdlib>
#include <string>

31 32 33 34 35 36 37 38
#define SUBST '|'
#define MISMATCH '.'
#define INSER 'i'
#define DELET 'd'
#define HOMO2X 'h'
#define HOMO2Y 'h'
#define FIN ' '
#define BEGIN 'B'
Mikaël Salson's avatar
Mikaël Salson committed
39
#define BACKSIZE 120
Mikaël Salson's avatar
Mikaël Salson committed
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105

using namespace std;


Cost::Cost(int match, int mismatch, int indel, int del_end, int homopolymer)
{
  this -> match = match ;
  this -> mismatch = mismatch ;
  this -> insertion = indel ;
  this -> deletion = indel ;
  this -> deletion_end = del_end ;
  this -> homopolymer = (homopolymer == MINUS_INF ? indel: homopolymer);
}


ostream& operator<<(ostream& out, const Cost& cost)
{
  out << "(" << cost.match 
      << ", " << cost.mismatch
      << "/" << cost.insertion
      << "/" << cost.deletion
      << ", " << cost.deletion_end
      << ", " << cost.homopolymer
      << ")" ;
  return out;
}

Cost::Cost()
{
}

int Cost::substitution(char a, char b)
{
  return (a == b) ? match : mismatch ;
}

/*
int Cost::ins(char current, char next)
{
  return (next == current) ? homopolymer : insertion ;
}

int Cost::del(char current, char next)
{
  return (next == current) ? homopolymer : deletion ;
}
*/

int Cost::homo2(char xa, char xb, char y)
{
  return ((xa == xb) && (xb == y)) ? homopolymer : MINUS_INF ;
}


DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& cost, const bool reverse_x, const bool reverse_y)
{
  this -> x = reverse_x ? reverse(x) : x ;
  this -> y = reverse_y ? reverse(y) : y ;

  this -> reverse_x = reverse_x ;
  this -> reverse_y = reverse_y ;

  m = x.size();
  n = y.size();
  // cout << "Dynprog " << x << "(" << m << ") / " << y << "(" << n << ")" << endl ;

106
  B = new operation*[m+1];
Mikaël Salson's avatar
Mikaël Salson committed
107
  for (int i = 0; i <= m; i++) {
108
    B[i] = new operation[n+1];
Mikaël Salson's avatar
Mikaël Salson committed
109
  }
Mikaël Salson's avatar
Mikaël Salson committed
110
  
Mikaël Salson's avatar
Mikaël Salson committed
111 112 113 114 115 116 117
  this -> mode = mode;
  this -> cost = cost;

  this -> best_i = -1 ;
  this -> best_j = -1 ;
  this -> first_i = -1 ;
  this -> first_j = -1 ;
Mikaël Salson's avatar
Mikaël Salson committed
118 119 120 121
  
  gap1=NULL;
  gap2=NULL;
  linkgap=NULL;
Mikaël Salson's avatar
Mikaël Salson committed
122 123 124 125 126

  init();
}

DynProg::~DynProg() {
127

Mikaël Salson's avatar
Mikaël Salson committed
128 129 130
  for (int i = 0; i <= m; i++) {
    delete [] B[i];
  }
131
  delete [] B;  
Mikaël Salson's avatar
Mikaël Salson committed
132 133 134
  delete [] gap1;
  delete [] gap2;
  delete [] linkgap;
Mikaël Salson's avatar
Mikaël Salson committed
135 136 137
}

void DynProg::init()
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
{
  for (int i=1; i<=m; i++)
    {
      B[i][0].type = INSER ;
      B[i][0].i = i-1 ;
      B[i][0].j = 0 ;
    }

  for (int j=1; j<=n; j++)
    {
      B[0][j].type = DELET ;    
      B[0][j].i = 0 ;
      B[0][j].j = j-1 ;
    }

Mikaël Salson's avatar
Mikaël Salson committed
153 154
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    for (int i=0; i<=m; i++)
155
      B[i][0].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
156 157
  else if (mode == GlobalButMostlyLocal)
    for (int i=0; i<=m; i++)
158
      B[i][0].score = i * cost.insertion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
159 160
  else // Global, SemiGlobal
    for (int i=0; i<=m; i++)
161
      B[i][0].score = i * cost.insertion ;
Mikaël Salson's avatar
Mikaël Salson committed
162 163 164
   
  if (mode == SemiGlobal || mode == SemiGlobalTrans || mode == Local || mode == LocalEndWithSomeDeletions)
    for (int j=0; j<=n; j++)
165
      B[0][j].score = 0 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
166 167
  else if (mode == GlobalButMostlyLocal)
    for (int j=0; j<=n; j++)
168
      B[0][j].score = j * cost.deletion / 2 ;
Mathieu Giraud's avatar
Mathieu Giraud committed
169
  else // Global
Mikaël Salson's avatar
Mikaël Salson committed
170
    for (int j=0; j<=n; j++)
171
      B[0][j].score = j * cost.deletion ;
Mikaël Salson's avatar
Mikaël Salson committed
172 173
}

174 175 176 177 178 179 180 181 182 183 184
inline void try_operation(operation &best, int type, int i, int j, int score)
{
  if (score > best.score)
    {
      best.type = type ;
      best.i = i ;
      best.j = j ;
      best.score = score ;
    }
}

Mikaël Salson's avatar
Mikaël Salson committed
185 186 187
int DynProg::compute()
{
  best_score = MINUS_INF ;
188 189 190 191
  best_i = 0 ;
  best_j = 0 ;
  
  operation best ;
Mikaël Salson's avatar
Mikaël Salson committed
192 193 194 195

  for (int i=1; i<=m; i++)
    for (int j=1; j<=n; j++)
    {
196
      best.score = MINUS_INF ;
Mikaël Salson's avatar
Mikaël Salson committed
197

198
      // The edit operations, with their backtracking information and their score
Mathieu Giraud's avatar
Mathieu Giraud committed
199
      
200
      try_operation(best, SUBST, i-1, j-1, B[i-1][j-1].score + cost.substitution(x[i-1], y[j-1]));
201
      
202 203
      try_operation(best, INSER, i-1, j  , B[i-1][j  ].score + cost.insertion);
      try_operation(best, DELET, i  , j-1, B[i  ][j-1].score + cost.deletion);
204

205 206
      try_operation(best, HOMO2X, i-2, j-1, i > 1 ? B[i-2][j-1].score + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF);
      try_operation(best, HOMO2Y, i-1, j-2, j > 1 ? B[i-1][j-2].score + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF);
207 208 209
      
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	try_operation(best, BEGIN, 0, 0, 0);
Mikaël Salson's avatar
Mikaël Salson committed
210

211 212
      if ((best.type == SUBST) && (x[i-1] != y[j-1]))
        best.type = MISMATCH ;
Mikaël Salson's avatar
Mikaël Salson committed
213

214 215
      // Fill the score and the backtrack information
      B[i][j] = best ;
Mikaël Salson's avatar
Mikaël Salson committed
216
      
217 218
      // cout << " " << i << "," << j << " " << best_op.type << " " <<  best_op.i << "," << best_op.j << " "  << best_op.score << " " << best << endl ;

Mikaël Salson's avatar
Mikaël Salson committed
219 220
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
221
	  int tbest = best.score ;
Mikaël Salson's avatar
Mikaël Salson committed
222 223 224 225 226 227 228 229 230 231 232

	  if (mode == LocalEndWithSomeDeletions)
	    tbest += cost.deletion_end*(n-j);   
     
	  if (tbest > best_score)
	    {
	      best_score = tbest ;
	      best_i = i ;
	      best_j = j ;
	    }
	    
233
	  if (best.score == 0){
234
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
235 236
	  }
	}
Mikaël Salson's avatar
Mikaël Salson committed
237

Mikaël Salson's avatar
Mikaël Salson committed
238 239
      if (mode == Local || mode == LocalEndWithSomeDeletions)
	{
240
      	  if (best.score == 0){
241
	    B[i][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
242 243 244 245 246
	  }
	}
      
    }

247 248
  // End. Find best_i and best_j, put FIN keywords where the backtrack should stop
  
Mikaël Salson's avatar
Mikaël Salson committed
249 250
  if (mode == Local || mode == LocalEndWithSomeDeletions)
    {
251 252
      for (int j=0; j<=n; j++){
	B[0][j].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
253 254
      }
      for (int i=0; i<=m; i++){
255
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
256 257 258 259 260 261 262 263
      }
      
    }
  if (mode == SemiGlobal)
    {
      best_i = m ;
      
      for (int j=1; j<=n; j++)
264
	if (B[m][j].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
265
	  {
266
	    best_score = B[m][j].score ;
Mikaël Salson's avatar
Mikaël Salson committed
267 268 269
	    best_j = j ;
	  }
      for (int i=0; i<=m; i++){
270
	B[i][0].type = FIN;
Mikaël Salson's avatar
Mikaël Salson committed
271 272 273 274 275 276 277 278
      }
    }

  if (mode == SemiGlobalTrans)
    {
      best_j = n ;
      
      for (int i=1; i<=m; i++)
279
	if (B[i][n].score > best_score)
Mikaël Salson's avatar
Mikaël Salson committed
280
	  {
281
	    best_score = B[i][n].score ;
Mikaël Salson's avatar
Mikaël Salson committed
282 283 284 285
	    best_i = i ;
	  }
	  
      for (int i=0; i<=n; i++){
286
	B[0][i].type = FIN;
287
      }     
Mikaël Salson's avatar
Mikaël Salson committed
288 289
    }

Mathieu Giraud's avatar
Mathieu Giraud committed
290
  if ((mode == Global) || (mode == GlobalButMostlyLocal))
Mikaël Salson's avatar
Mikaël Salson committed
291 292 293
    {
      best_i = m ;
      best_j = n ;
294
      best_score = B[m][n].score;
Mikaël Salson's avatar
Mikaël Salson committed
295 296 297 298 299 300 301 302
    }

  if (reverse_x)
    best_i = m - best_i + 1 ;

  if (reverse_y)
    best_j = n - best_j + 1;

303 304
  B[0][0].type = FIN;
  
Mikaël Salson's avatar
Mikaël Salson committed
305 306 307 308 309 310 311 312 313
  // In the matrix positions start at 1, but start positions may start at 0
  best_i -= 1;   
  best_j -= 1;

  return best_score ;
}

void DynProg::backtrack()
{
314
  // Tables for displaying the alignment
Mikaël Salson's avatar
Mikaël Salson committed
315 316 317 318
  gap1 = new int[x.size()+1];
  linkgap = new int[x.size()+1];
  gap2 = new int[y.size()+1];
      
319
  for (unsigned int i = 0; i <=x.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
320 321 322
    gap1[i] = 0;
    linkgap[i] = 0;
  }
323
  for (unsigned int i = 0; i <= y.size(); i++) {
Mikaël Salson's avatar
Mikaël Salson committed
324 325 326 327 328 329 330
    gap2[i] = 0;
  }
  
  
  int g1=x.size();
  int g2=y.size();
  
331 332
  int i=best_i+1;
  int j=best_j+1;
333

334
  // Compute backtrack strings
Mikaël Salson's avatar
Mikaël Salson committed
335
  
336 337
  ostringstream back_x;
  ostringstream back_y;
Mikaël Salson's avatar
Mikaël Salson committed
338 339
  ostringstream back_tr;
  
340 341 342 343 344 345
  while (1) {

    if ((i<0) || (j<0))
      { 
        cout << "Invalid backtrack: " << i << "," << j << endl ;
        exit(1);
Mikaël Salson's avatar
Mikaël Salson committed
346
      }
347 348 349

    if  (B[i][j].type == FIN)
      break ;
Mikaël Salson's avatar
Mikaël Salson committed
350
      
351
    // cout << "bt " << i << "/" << j << " " << B[i][j].type << " " << B[i][j].i << "," << B[i][j].j << endl ;
352 353 354
    
    int next_i = B[i][j].i;
    int next_j = B[i][j].j;
Mikaël Salson's avatar
Mikaël Salson committed
355

356 357
    // The maximum number of characters implied by this operation
    int max_k = max(i - next_i, j - next_j);
Mikaël Salson's avatar
Mikaël Salson committed
358
      
359 360 361
    for (int k=0; k<max_k; k++)
      {
        linkgap[g1]=g2;
362 363 364

        // Some character(s) from x, then fill with spaces
        if (i-k > next_i)
365
          {
366
            back_x << x[i-1-k];
367 368 369 370
            g1--;
          }
        else
          {
371
            back_x << " " ;
372 373
            gap1[g1]++ ;
          }
374 375 376

        // Some character(s) from y, then fill with spaces
        if (j-k > next_j)
377
          {
378
            back_y << y[j-1-k];
379 380 381 382
            g2--;
          }
        else
          {
383
            back_y << " " ;
384 385
            gap2[g2]++ ;
          }
386 387 388 389 390 391

        // The operation character, then fill with spaces
        if (k == max_k-1)          
          back_tr << B[i][j].type ;
        else
          back_tr << " " ;
392
      }
Mikaël Salson's avatar
Mikaël Salson committed
393
    
394 395
    i = next_i;
    j = next_j;
Mikaël Salson's avatar
Mikaël Salson committed
396 397
  }

398 399 400
  // Format backtrack strings
  
  string str1, str2, str3;
401
  str1 = back_x.str();
Mikaël Salson's avatar
Mikaël Salson committed
402 403 404 405
  str1 =string (str1.rbegin(), str1.rend());
  str1 = str1;
  str2=back_tr.str();
  str2 = string (str2.rbegin(), str2.rend());
406
  str3 = back_y.str();
Mikaël Salson's avatar
Mikaël Salson committed
407
  str3 = string (str3.rbegin(), str3.rend());
408 409

  ostringstream back;    
410 411 412
  back << setw(3) << i   << " " << str1.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << " " << " " << str2.substr(0, BACKSIZE-8) << endl;
  back << setw(3) << j   << " " << str3.substr(0, BACKSIZE-8) << endl << endl;
Mikaël Salson's avatar
Mikaël Salson committed
413 414 415 416 417
  for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){
    back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
    back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl;
  }
418
  back << "score: " << best_score << endl;
Mikaël Salson's avatar
Mikaël Salson committed
419
  
420 421
  first_i=i;
  first_j=j;
Mikaël Salson's avatar
Mikaël Salson committed
422 423
  
  str_back=back.str();
Mathieu Giraud's avatar
Mathieu Giraud committed
424 425

  // cout << str_back << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459
}


float identity_percent(int score)
{
  int match = (score + IdentityDirty.match/2) / IdentityDirty.match ;

  int mismatch = (score - match * IdentityDirty.match) / IdentityDirty.mismatch ;

  float pcent = (100. * (float) match) / (match + mismatch) ;

  // cout << score << "  " << match << "  " << mismatch << "  " << pcent << endl ; 

  return pcent ;
}


ostream& operator<<(ostream& out, const DynProg& dp)
{
  out << "       " ;

  for (int j=0; j<dp.n; j++)
    out << setw(4) << dp.y[j] << " ";

  out << endl ;

  for (int i=0; i<=dp.m; i++)
    {
      if (i)
	out << dp.x[i-1] << " " ;
      else
	out << "  " ;

      for (int j=0; j<=dp.n; j++)
460
        {
461 462
          if (dp.B[i][j].score)
            out << setw(4) << dp.B[i][j].score << dp.B[i][j].type ;
463 464 465
          else
            out << "    " << dp.B[i][j].type ;
        }
Mikaël Salson's avatar
Mikaël Salson committed
466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513
      out << endl ;      
    }
  
  out << "best: " << dp.best_i << "," << dp.best_j << endl;

  return out ;
}


Cost strToCost(string str, Cost default_cost){

  if (str.length()!=0){
    string::size_type stTemp = str.find(',');
	  
    std::list<int> val;
    
    while(stTemp != string::npos)
    {
	  val.push_back(atoi(str.substr(0, stTemp).c_str() ) );
	  str = str.substr(stTemp + 1);
	  stTemp = str.find(',');
    }
    val.push_back(atoi(str.c_str() ) );

    if (val.size()==5){ 
      int val1=val.front();
      val.pop_front();
      int val2=val.front();
      val.pop_front();
      int val3=val.front();
      val.pop_front();
      int val4=val.front();
      val.pop_front();
      int val5=val.front();
      val.pop_front();
      Cost result = Cost(val1, val2, val3, val4, val5);
      cout << "use custom Cost "<< result << endl;
      return result;
    }
    
    cout << "incorrect Cost format, use default "<< default_cost<<endl;
  }
  
  return default_cost;
}