Commit e7573a2d authored by Mathieu Giraud's avatar Mathieu Giraud

core/dynprog.{cpp,h}: refactoring the score computation and backtrack loops

Edit operations, with their score computation and their backtracking information,
are now defined in a unique location.
The code should thus be more flexible and easier to read and maintain.
parent 6f50dfe8
...@@ -28,12 +28,14 @@ ...@@ -28,12 +28,14 @@
#include <cstdlib> #include <cstdlib>
#include <string> #include <string>
#define SUBST 0 #define SUBST '|'
#define INSER 1 #define MISMATCH '.'
#define DELET 2 #define INSER 'i'
#define HOMO2X 3 #define DELET 'd'
#define HOMO2Y 4 #define HOMO2X 'h'
#define FIN 5 #define HOMO2Y 'h'
#define FIN ' '
#define BEGIN 'B'
#define BACKSIZE 120 #define BACKSIZE 120
using namespace std; using namespace std;
...@@ -106,9 +108,9 @@ DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost& ...@@ -106,9 +108,9 @@ DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost&
S[i] = new int[n+1]; S[i] = new int[n+1];
} }
B = new backtrack_info*[m+1]; B = new operation*[m+1];
for (int i = 0; i <= m; i++) { for (int i = 0; i <= m; i++) {
B[i] = new backtrack_info[n+1]; B[i] = new operation[n+1];
} }
this -> mode = mode; this -> mode = mode;
...@@ -130,20 +132,32 @@ DynProg::~DynProg() { ...@@ -130,20 +132,32 @@ DynProg::~DynProg() {
for (int i = 0; i <= m; i++) { for (int i = 0; i <= m; i++) {
delete [] S[i]; delete [] S[i];
} }
delete [] S; delete [] S;
for (int i = 0; i <= m; i++) { for (int i = 0; i <= m; i++) {
delete [] B[i]; delete [] B[i];
} }
delete [] B; delete [] B;
delete [] gap1; delete [] gap1;
delete [] gap2; delete [] gap2;
delete [] linkgap; delete [] linkgap;
} }
void DynProg::init() void DynProg::init()
{ {
for (int i=1; i<=m; i++)
{
B[i][0].type = INSER ;
B[i][0].i = i-1 ;
B[i][0].j = 0 ;
}
for (int j=1; j<=n; j++)
{
B[0][j].type = DELET ;
B[0][j].i = 0 ;
B[0][j].j = j-1 ;
}
if (mode == Local || mode == LocalEndWithSomeDeletions) if (mode == Local || mode == LocalEndWithSomeDeletions)
for (int i=0; i<=m; i++) for (int i=0; i<=m; i++)
S[i][0] = 0 ; S[i][0] = 0 ;
...@@ -165,34 +179,55 @@ void DynProg::init() ...@@ -165,34 +179,55 @@ void DynProg::init()
S[0][j] = j * cost.deletion ; S[0][j] = j * cost.deletion ;
} }
inline void try_operation(operation &best, int type, int i, int j, int score)
{
if (score > best.score)
{
best.type = type ;
best.i = i ;
best.j = j ;
best.score = score ;
}
}
int DynProg::compute() int DynProg::compute()
{ {
best_score = MINUS_INF ; best_score = MINUS_INF ;
best_i = 0 ;
best_j = 0 ;
operation best ;
for (int i=1; i<=m; i++) for (int i=1; i<=m; i++)
for (int j=1; j<=n; j++) for (int j=1; j<=n; j++)
{ {
int subst = S[i-1][j-1] + cost.substitution(x[i-1], y[j-1]); best.score = MINUS_INF ;
int inser = S[i-1][j] + cost.insertion; // The edit operations, with their backtracking information and their score
int delet = S[i][j-1] + cost.deletion;
// Also dealing with homopolymers (in a dirty way!) try_operation(best, SUBST, i-1, j-1, S[i-1][j-1] + cost.substitution(x[i-1], y[j-1]));
// int inser = S[i-1][j] + (i > 1 ? cost.ins(x[i-2], x[i-1]) : cost.ins(0,1));
// int delet = S[i][j-1] + (j > 1 ? cost.del(y[j-2], y[j-1]) : cost.del(0,1)); try_operation(best, INSER, i-1, j , S[i-1][j ] + cost.insertion);
try_operation(best, DELET, i , j-1, S[i ][j-1] + cost.deletion);
try_operation(best, HOMO2X, i-2, j-1, i > 1 ? S[i-2][j-1] + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF);
try_operation(best, HOMO2Y, i-1, j-2, j > 1 ? S[i-1][j-2] + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF);
if (mode == Local || mode == LocalEndWithSomeDeletions)
try_operation(best, BEGIN, 0, 0, 0);
int best = max(max(subst, inser), delet); if ((best.type == SUBST) && (x[i-1] != y[j-1]))
best.type = MISMATCH ;
// Homopolymers (2 <-> 1) // Fill the score and the backtrack information
int homo2x = i > 1 ? S[i-2][j-1] + cost.homo2(x[i-2], x[i-1], y[j-1]) : MINUS_INF ; S[i][j] = best.score ;
int homo2y = j > 1 ? S[i-1][j-2] + cost.homo2(y[j-2], y[j-1], x[i-1]) : MINUS_INF ; B[i][j] = best ;
best = max(max(homo2x, homo2y), best);
// cout << " " << i << "," << j << " " << best_op.type << " " << best_op.i << "," << best_op.j << " " << best_op.score << " " << best << endl ;
if (mode == Local || mode == LocalEndWithSomeDeletions) if (mode == Local || mode == LocalEndWithSomeDeletions)
{ {
best = max(0, best); int tbest = best.score ;
int tbest = best ;
if (mode == LocalEndWithSomeDeletions) if (mode == LocalEndWithSomeDeletions)
tbest += cost.deletion_end*(n-j); tbest += cost.deletion_end*(n-j);
...@@ -204,55 +239,26 @@ int DynProg::compute() ...@@ -204,55 +239,26 @@ int DynProg::compute()
best_j = j ; best_j = j ;
} }
if (best == 0){ if (best.score == 0){
B[i][j].type = FIN; B[i][j].type = FIN;
} }
} }
S[i][j] = best ;
if (best==subst){
B[i][j].i = i-1 ;
B[i][j].j = j-1 ;
B[i][j].type = SUBST ;
}
if (best==inser){
B[i][j].i = i-1 ;
B[i][j].j = j ;
B[i][j].type = INSER ;
}
if (best==delet){
B[i][j].i = i ;
B[i][j].j = j-1 ;
B[i][j].type = DELET ;
}
if (best==homo2x){
B[i][j].i = i-2 ;
if (B[i][j].i < 0) B[i][j].i = 0 ;
B[i][j].j = j-1 ;
B[i][j].type = HOMO2X ;
}
if (best==homo2y){
B[i][j].i = i-1 ;
B[i][j].j = j-2 ;
if (B[i][j].j < 0) B[i][j].j = 0 ;
B[i][j].type = HOMO2Y ;
}
if (mode == Local || mode == LocalEndWithSomeDeletions) if (mode == Local || mode == LocalEndWithSomeDeletions)
{ {
if (best == 0){ if (best.score == 0){
B[i][j].type = FIN; B[i][j].type = FIN;
} }
} }
} }
// End. Find best_i and best_j, put FIN keywords where the backtrack should stop
if (mode == Local || mode == LocalEndWithSomeDeletions) if (mode == Local || mode == LocalEndWithSomeDeletions)
{ {
for (int i=0; i<=n; i++){ for (int j=0; j<=n; j++){
B[0][i].type = FIN; B[0][j].type = FIN;
} }
for (int i=0; i<=m; i++){ for (int i=0; i<=m; i++){
B[i][0].type = FIN; B[i][0].type = FIN;
...@@ -269,11 +275,6 @@ int DynProg::compute() ...@@ -269,11 +275,6 @@ int DynProg::compute()
best_score = S[m][j] ; best_score = S[m][j] ;
best_j = j ; best_j = j ;
} }
for (int i=1; i<=n; i++){
B[0][i].i = 0 ;
B[0][i].j = i-1 ;
B[0][i].type = DELET;
}
for (int i=0; i<=m; i++){ for (int i=0; i<=m; i++){
B[i][0].type = FIN; B[i][0].type = FIN;
} }
...@@ -292,32 +293,14 @@ int DynProg::compute() ...@@ -292,32 +293,14 @@ int DynProg::compute()
for (int i=0; i<=n; i++){ for (int i=0; i<=n; i++){
B[0][i].type = FIN; B[0][i].type = FIN;
} }
for (int i=0; i<=m; i++){
B[i][0].i = i-1 ;
B[i][0].j = 0 ;
B[i][0].type = INSER;
}
} }
if ((mode == Global) || (mode == GlobalButMostlyLocal)) if ((mode == Global) || (mode == GlobalButMostlyLocal))
{ {
best_i = m ; best_i = m ;
best_j = n ; best_j = n ;
best_score = S[m][n]; best_score = S[m][n];
for (int i=0; i<=m; i++){
B[i][0].i = i-1 ;
B[i][0].j = 0 ;
B[i][0].type = INSER;
}
for (int i=1; i<=n; i++){
B[0][i].i = 0 ;
B[0][i].j = i-1 ;
B[0][i].type = DELET;
}
B[0][0].type = FIN;
} }
if (reverse_x) if (reverse_x)
...@@ -326,6 +309,8 @@ int DynProg::compute() ...@@ -326,6 +309,8 @@ int DynProg::compute()
if (reverse_y) if (reverse_y)
best_j = n - best_j + 1; best_j = n - best_j + 1;
B[0][0].type = FIN;
// In the matrix positions start at 1, but start positions may start at 0 // In the matrix positions start at 1, but start positions may start at 0
best_i -= 1; best_i -= 1;
best_j -= 1; best_j -= 1;
...@@ -351,86 +336,83 @@ void DynProg::backtrack() ...@@ -351,86 +336,83 @@ void DynProg::backtrack()
int g1=x.size(); int g1=x.size();
int g2=y.size(); int g2=y.size();
int ti=best_i+1; int i=best_i+1;
int tj=best_j+1; int j=best_j+1;
int tmpi, tmpj; int tmpi, tmpj;
tmpi=ti; tmpi=i;
tmpj=tj; tmpj=j;
str_back = ""; // Compute backtrack strings
string str1, str2, str3;
ostringstream back_s1; ostringstream back_s1;
ostringstream back_s2; ostringstream back_s2;
ostringstream back_tr; ostringstream back_tr;
ostringstream back; ostringstream back;
while (1) {
while ( B[ti][tj].type != FIN ){
if ((i<0) || (j<0))
tmpi=B[ti][tj].i; {
tmpj=B[ti][tj].j; cout << "Invalid backtrack: " << i << "," << j << endl ;
exit(1);
if (B[ti][tj].type == SUBST ){
linkgap[g1]=g2;
back_s1 << x[ti-1];
g1--;
back_s2 << y[tj-1];
g2--;
if(x[ti-1]==y[tj-1]) {
back_tr << "|";
}else{
back_tr << ":";
} }
if (B[i][j].type == FIN)
break ;
} // cout << "bt " << i << "/" << j << " " << B[i][j].type << " " << B[i][j].i << "," << B[i][j].j << endl ;
if (B[ti][tj].type == INSER ){
linkgap[g1]=g2; tmpi=B[i][j].i;
back_s1 << x[ti-1]; tmpj=B[i][j].j;
g1--;
back_s2 << " ";
gap2[g2]++;
back_tr << ".";
}
if (B[ti][tj].type == DELET){
linkgap[g1]=g2;
back_s1 << " ";
gap1[g1]++;
back_s2 << y[tj-1];
g2--;
back_tr << ".";
}
if (B[ti][tj].type == HOMO2X ){
linkgap[g1]=g2;
back_s1 << x[ti-1] << x[ti-2];
g1--;
linkgap[g1]=g2;
g1--;
back_s2 << " " << y[tj-1];
gap2[g2]++;
g2--;
back_tr << " h";
} int max_k = max(i - tmpi, j - tmpj);
if (B[ti][tj].type == HOMO2Y ){ for (int k=0; k<max_k; k++)
linkgap[g1]=g2; {
back_s1 << " " << x[ti-1] ; linkgap[g1]=g2;
gap1[g1]++;
g1--; if (i-k > tmpi)
back_s2 << y[tj-1] << y[tj-2] ; {
g2--; back_s1 << x[i-1-k];
g2--; g1--;
back_tr << " h"; }
} else
{
back_s1 << " " ;
gap1[g1]++ ;
}
if (j-k > tmpj)
{
back_s2 << y[j-1-k];
g2--;
}
else
{
back_s2 << " " ;
gap2[g2]++ ;
}
if (k < max_k-1)
back_tr << " " ;
}
ti=tmpi; back_tr << B[i][j].type ;
tj=tmpj;
i=tmpi;
j=tmpj;
/*
back_s1 << " " ;
back_s2 << " " ;
back_tr << " " ;
*/
} }
// Format backtrack strings
string str1, str2, str3;
str1=back_s1.str(); str1=back_s1.str();
str1 =string (str1.rbegin(), str1.rend()); str1 =string (str1.rbegin(), str1.rend());
str1 = str1; str1 = str1;
...@@ -439,17 +421,17 @@ void DynProg::backtrack() ...@@ -439,17 +421,17 @@ void DynProg::backtrack()
str3=back_s2.str(); str3=back_s2.str();
str3 = string (str3.rbegin(), str3.rend()); str3 = string (str3.rbegin(), str3.rend());
back << ti << " >> " << str1.substr(0, BACKSIZE-8) << endl; back << i << " >> " << str1.substr(0, BACKSIZE-8) << endl;
back << " diff " << str2.substr(0, BACKSIZE-8) << endl; back << " diff " << str2.substr(0, BACKSIZE-8) << endl;
back << tj << " >> " << str3.substr(0, BACKSIZE-8) << endl << endl; back << j << " >> " << str3.substr(0, BACKSIZE-8) << endl << endl;
for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){ for (size_t k=0 ; (BACKSIZE-8+k*BACKSIZE)< str1.length() ; k++){
back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl; back << str1.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl; back << str2.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl;
back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl; back << str3.substr(BACKSIZE-8+k*BACKSIZE, BACKSIZE) << endl << endl;
} }
first_i=ti; first_i=i;
first_j=tj; first_j=j;
str_back=back.str(); str_back=back.str();
...@@ -488,8 +470,12 @@ ostream& operator<<(ostream& out, const DynProg& dp) ...@@ -488,8 +470,12 @@ ostream& operator<<(ostream& out, const DynProg& dp)
out << " " ; out << " " ;
for (int j=0; j<=dp.n; j++) for (int j=0; j<=dp.n; j++)
out << setw(4) << dp.S[i][j] << " "; {
if (dp.S[i][j])
out << setw(4) << dp.S[i][j] << dp.B[i][j].type ;
else
out << " " << dp.B[i][j].type ;
}
out << endl ; out << endl ;
} }
......
...@@ -16,10 +16,12 @@ using namespace std; ...@@ -16,10 +16,12 @@ using namespace std;
float identity_percent(int score); float identity_percent(int score);
typedef struct { typedef struct {
char type;
int i; int i;
int j; int j;
int type; int score;
} backtrack_info; } operation;
class Cost class Cost
{ {
...@@ -108,8 +110,8 @@ class DynProg ...@@ -108,8 +110,8 @@ class DynProg
friend ostream& operator<<(ostream& out, const DynProg& dp); friend ostream& operator<<(ostream& out, const DynProg& dp);
int **S; int **S; // Score matrix
backtrack_info **B; operation **B; // Backtrack matrix
int *gap1 ; int *gap1 ;
int *linkgap ; int *linkgap ;
int *gap2 ; int *gap2 ;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment