Commit f73e2878 authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/refactor-align-against-collection' into 'dev'

Refactor align_against_collection(), streamlined handling of "reverse" positions

See merge request !706
parents 9080bc30 3800381f
Pipeline #150617 failed with stages
in 7 minutes and 22 seconds
...@@ -441,12 +441,6 @@ int DynProg::compute(bool onlyBottomTriangle, int onlyBottomTriangleShift) ...@@ -441,12 +441,6 @@ int DynProg::compute(bool onlyBottomTriangle, int onlyBottomTriangleShift)
best_score = B[m][n].score; best_score = B[m][n].score;
} }
if (reverse_x)
best_i = m - best_i + 1 ;
if (reverse_y)
best_j = n - best_j + 1;
B[0][0].type = FIN; B[0][0].type = FIN;
// In the matrix positions start at 1, but start positions may start at 0 // In the matrix positions start at 1, but start positions may start at 0
...@@ -491,13 +485,6 @@ void DynProg::backtrack() ...@@ -491,13 +485,6 @@ void DynProg::backtrack()
int i=best_i+1; int i=best_i+1;
int j=best_j+1; int j=best_j+1;
// Retake good i/j when there were reversed strings
if (reverse_x)
i = m - i + 1 ;
if (reverse_y)
j = n - j + 1;
if ((i<0) || (i>m) || (j<0) || (j>n)) if ((i<0) || (i>m) || (j<0) || (j>n))
{ {
cout << "Invalid backtrack starting point: " << i << "," << j << endl ; cout << "Invalid backtrack starting point: " << i << "," << j << endl ;
......
...@@ -51,6 +51,16 @@ AlignBox::AlignBox(string _key, string _color) { ...@@ -51,6 +51,16 @@ AlignBox::AlignBox(string _key, string _color) {
ref_label = ""; ref_label = "";
} }
void AlignBox::reverse() {
int start_ = start;
start = seq_length - end - 1;
end = seq_length - start_ - 1;
int del_left_ = del_left;
del_left = del_right;
del_right = del_left_;
}
int AlignBox::getLength() { int AlignBox::getLength() {
return end - start + 1 ; return end - start + 1 ;
} }
...@@ -867,11 +877,12 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id ...@@ -867,11 +877,12 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
{ {
int best_score = MINUS_INF ; int best_score = MINUS_INF ;
box->rep = &rep;
box->ref_nb = MINUS_INF ; box->ref_nb = MINUS_INF ;
int best_best_i = (int) string::npos ; box->start = (int) string::npos ;
box->end = (int) string::npos ;
int best_best_j = (int) string::npos ; int best_best_j = (int) string::npos ;
int best_first_i = (int) string::npos ;
int best_first_j = (int) string::npos ;
vector<pair<int, int> > score_r; vector<pair<int, int> > score_r;
...@@ -898,35 +909,46 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id ...@@ -898,35 +909,46 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
if (score > best_score) if (score > best_score)
{ {
dp.backtrack(); dp.backtrack();
best_score = score ; best_score = score ;
best_best_i = dp.best_i ;
best_best_j = dp.best_j ; // Reference identification
best_first_i = dp.first_i ; box->ref_nb = r ;
best_first_j = dp.first_j ;
box->ref_nb = r ; // Alignment positions *on the read*
box->ref_label = rep.label(r) ; box->start = dp.first_i; // start position
box->marked_pos = dp.marked_pos_i ; box->end = dp.best_i ; // end position
} box->marked_pos = dp.marked_pos_i ; // marked position
// Alignment positions *on the reference*
box->del_left = dp.first_j; // around start position
best_best_j = dp.best_j; // around end position
}
score_r.push_back(make_pair(score, r)); score_r.push_back(make_pair(score, r));
// #define DEBUG_SEGMENT // #define DEBUG_SEGMENT
#ifdef DEBUG_SEGMENT #ifdef DEBUG_SEGMENT
cout << rep.label(r) << " " << score << " " << dp.best_i << endl ; cout << rep.label(r) << " " << score << " " << dp.first_i << " " << dp.best_i << endl ;
#endif #endif
} }
int length = best_best_i; // end position of the alignment in the read sort(score_r.begin(),score_r.end(),comp_pair);
int del_end = rep.sequence(box->ref_nb).size() - best_best_j; box->score = score_r;
if (reverse_ref || reverse_both) {
length = read.length() - length - 1; box->ref_label = rep.label(box->ref_nb) ;
del_end = best_best_j; box->ref = rep.sequence(box->ref_nb);
box->del_right = box->ref.size() - best_best_j - 1;
box->seq_length = read.length();
if (reverse_both) {
box->reverse();
} }
length = min(length, (int) rep.sequence(box->ref_nb).size());
length += del_end; // Should we run a full DP?
// length is an estimation of the number of aligned nucleotides. It would be better with #2138 int length = min(box->getLength(), (int) box->ref.size());
// length is an estimation of the number of aligned nucleotides.
int min_number_of_matches = min(int(length * FRACTION_ALIGNED_AT_WORST), length - BOTTOM_TRIANGLE_SHIFT); // Minimal number of matches we can have with a triangle int min_number_of_matches = min(int(length * FRACTION_ALIGNED_AT_WORST), length - BOTTOM_TRIANGLE_SHIFT); // Minimal number of matches we can have with a triangle
int max_number_of_insertions = length - min_number_of_matches; int max_number_of_insertions = length - min_number_of_matches;
int score_with_limit_number_of_indels = min_number_of_matches * segment_cost.match + max_number_of_insertions * segment_cost.insertion; int score_with_limit_number_of_indels = min_number_of_matches * segment_cost.match + max_number_of_insertions * segment_cost.insertion;
...@@ -939,26 +961,12 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id ...@@ -939,26 +961,12 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
return; return;
} }
sort(score_r.begin(),score_r.end(),comp_pair);
box->ref = rep.sequence(box->ref_nb);
box->del_right = reverse_both ? best_best_j : box->ref.size() - best_best_j - 1;
box->del_left = best_first_j;
box->start = best_first_i;
box->rep = &rep;
box->score = score_r;
#ifdef DEBUG_SEGMENT #ifdef DEBUG_SEGMENT
cout << "best: " << box->ref_label << " " << best_score ; cout << "reverse_both " << reverse_both << " reverse_left " << reverse_ref << " local " << local << endl;
cout << "del/del2/begin:" << (box->del_right) << "/" << (box->del_left) << "/" << (box->start) << endl; cout << "best: " << *box << " read length: " << read.length() << " ref length: " << box->ref.size() << endl;
cout << endl;
#endif #endif
if (reverse_ref)
// Why -1 here and +1 in dynprog.cpp /// best_i = m - best_i + 1 ;
best_best_i = read.length() - best_best_i - 1 ;
box->end = best_best_i ;
} }
string format_del(int deletions) string format_del(int deletions)
...@@ -1079,10 +1087,6 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c, ...@@ -1079,10 +1087,6 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
} }
align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID, reverse_J, !reverse_J, false, align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID, reverse_J, !reverse_J, false,
box_J, segment_cost, false, standardised_threshold_evalue); box_J, segment_cost, false, standardised_threshold_evalue);
// J was run with '!reverseJ', we copy the box informations from right to left
// Should this directly be handled in align_against_collection() ?
box_J->start = box_J->end ;
box_J->del_left = box_J->del_right;
/* E-values */ /* E-values */
evalue_left = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first); evalue_left = multiplier * sequence.size() * germline->rep_5.totalSize() * segment_cost.toPValue(box_V->score[0].first);
......
...@@ -96,15 +96,32 @@ class AlignBox ...@@ -96,15 +96,32 @@ class AlignBox
string key; string key;
string color; string color;
int del_left;
/**
* Alignment positions *on the read*
*/
int start; int start;
int end; int end;
int marked_pos; // Marked position, for Cys104 and Phe118/Trp118
int seq_length;
/**
* Alignment positions *compared to reference sequence*
*/
int del_left;
int del_right; int del_right;
AlignBox(string key = "", string color=""); AlignBox(string key = "", string color="");
string getSequence(string sequence); string getSequence(string sequence);
void addToOutput(CloneOutput *clone, int alternative_genes); void addToOutput(CloneOutput *clone, int alternative_genes);
/**
* Mirror the AlignBox (over a a sequence length seq_length)
*/
void reverse();
/** /**
* Returns 'V', 'D', 'J', or possibly '5', '4', '3', '?', depending on the ref_label and on the key * Returns 'V', 'D', 'J', or possibly '5', '4', '3', '?', depending on the ref_label and on the key
*/ */
...@@ -132,9 +149,6 @@ class AlignBox ...@@ -132,9 +149,6 @@ class AlignBox
string ref_label; string ref_label;
string ref; string ref;
/* Marked position, for Cys104 and Phe118/Trp118 */
int marked_pos;
/* Scores and identifiers of other possible reference sequence */ /* Scores and identifiers of other possible reference sequence */
vector<pair<int, int> > score; vector<pair<int, int> > score;
}; };
......
...@@ -89,10 +89,6 @@ int main(int argc, const char** argv) ...@@ -89,10 +89,6 @@ int main(int argc, const char** argv)
align_against_collection(read, interestingV, -1, false, false, false, &box_V, VDJ); align_against_collection(read, interestingV, -1, false, false, false, &box_V, VDJ);
align_against_collection(read, interestingJ, -1, false, true, false, &box_J, VDJ); align_against_collection(read, interestingJ, -1, false, true, false, &box_J, VDJ);
// This should be handled directly into align_against_collection
box_J.start = box_J.end ;
box_J.del_left = box_J.del_right;
box_J.end = read.size() - 1;
int align_V_length = min(GENE_ALIGN, box_V.end - box_V.start + 1); int align_V_length = min(GENE_ALIGN, box_V.end - box_V.start + 1);
int align_J_length = min(GENE_ALIGN, (int)read.size() - box_J.start + 1); int align_J_length = min(GENE_ALIGN, (int)read.size() - box_J.start + 1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment