...
  View open merge request
Commits (9)
......@@ -357,11 +357,57 @@ int DynProg::compute(bool onlyBottomTriangle, int onlyBottomTriangleShift)
{
int tbest = best.score ;
if (mode == LocalEndWithSomeDeletions)
tbest += cost.deletion_end*(n-j);
if (mode == LocalEndWithSomeDeletions) {
int context_length = 10;
pair<operation, int> back = go_back(context_length, B[i][j]);
int score_back = back.first.score;
// Try to guess the nb of differences (here only mismatches) based on scores
// Just a consequence of the formula (n - e) * match + e * mismatch = score
int nb_mismatch_left_part = max(0., (score_back - (back.first.j+1) * cost.match) * 1. / (cost.mismatch - cost.match));
int standardized_mismatch_left_part = nb_mismatch_left_part * back.second / max(1, (back.first.j + 1));
float percent_error_right_part = 1 - nb_mismatch_left_part * 1. / max(1, (back.first.j+1));
// Heuristically guess the number of mismatches in the back.second last aligned letters
int nb_mismatch_right_part = max(0., ((tbest - score_back) - back.second * cost.match)*1. / (cost.mismatch - cost.match));
float proba_error = 0;
// proba_error : probability that the error scheme at the end is different from what we had before
// probability that we had more errors in the right part than the left part
for (int i = standardized_mismatch_left_part+1; i <= nb_mismatch_right_part; i++) {
proba_error += pow(1 - percent_error_right_part, i) * pow(percent_error_right_part, back.second - i) * nChoosek(back.second, i);
}
float del_end_cost = cost.deletion_end * (1 - proba_error) + cost.deletion * proba_error;
#ifdef DEBUG_DEL_END_SCORE
// if (n-j <= 15 && m -i <= 5) {
PRINT_VAR(nb_mismatch);
PRINT_VAR(back.second);
PRINT_VAR(nb_mismatch_left_part);
PRINT_VAR(back.first.j);
PRINT_VAR(standardized_mismatch_left_part);
PRINT_VAR(tbest);
PRINT_VAR(percent_error_right_part);
// PRINT_VAR(percent_score_back);
PRINT_VAR(score_back);
PRINT_VAR(proba_error);
PRINT_VAR(del_end_cost);
PRINT_VAR(i);
PRINT_VAR(j);
cerr << (n-j) << " deletions at the end" << endl;
cerr << "score would be " << tbest + del_end_cost * (n - j) << endl << endl;
// }
#endif
tbest += del_end_cost*(n-j);
}
if (tbest > best_score)
{
#ifdef DEBUG_DEL_END_SCORE
cerr << " *** This is the best score for now " << tbest << " *** " << endl;
PRINT_VAR(i);
PRINT_VAR(j);
#endif
best_score = tbest ;
best_i = i ;
best_j = j ;
......@@ -447,6 +493,14 @@ int DynProg::compute(bool onlyBottomTriangle, int onlyBottomTriangleShift)
return best_score ;
}
pair<operation, int> DynProg::go_back(int nb_positions, operation cell) {
int nb;
for (nb = 0; nb < nb_positions && cell.type != FIN && cell.i >= 0 && cell.j >= 0; nb++) {
cell = B[cell.i][cell.j];
}
return make_pair(cell, nb);
}
int DynProg::best_score_on_i(int i, int *best_j)
{
int best_score = MINUS_INF ;
......
......@@ -74,7 +74,7 @@ const Cost Levenshtein = Cost(0, -1, -1);
const Cost DNA = Cost(+5, -4, -10);
// Vidjil costs
const Cost VDJ = Cost(+4, -6, -10, -1);
const Cost VDJ = Cost(+40, -100, -160, -3);
const Cost VDJaffine = Cost(+4, -6, -15, -1, -1, MINUS_INF);
const Cost IdentityDirty = Cost(+1000, -1, -1); // pour avoir une estimation de longueur de l'alignement, utilise dans compare-all
......@@ -147,6 +147,15 @@ class DynProg
int compute(bool onlyBottomTriangle = false, int onlyBottomTriangleShift = 0);
void backtrack();
/**
* Go back in the alignment by nb_positions (at most) by starting at
* the given cell.
* @return the corresponding cell in the matrix and the number of
* steps that have been performed (<= nb_positions)
*/
pair<operation, int> go_back(int nb_positions, operation cell);
void SemiGlobal_hits_threshold(vector<int> &hits, int threshold, int shift_pos, int verbose);
string SemiGlobal_extract_best();
......
# For the moment 9/CTA/2
>TRGV2*01 12/CCCCTA/2 TRGJ1*02 [TRG]
GGAAGGCCCCACAGCGTCTTCAGTACTATGACTCCTACAACTCCAAGGTTGTGTTGGAATCAGGAGTCAGTCCAGGGAAGTATTATACTTACGCAAGCACAAGGAACAACTTGAGATTGATACTGCGAAATCTAATTGAAAATGACTCTGGGGTCTATTACTGTGCCCCC
CTA
ATTATAAGAAACTCTTTGGCAGTGGAACAACAC
# For the moment 0//10
>TRGV4*01 5/ATGGG/10 TRGJP1*01 [TRG]
GGAAGGCCCCACAGCGTCTTCTGTACTATGACTCCTACACCTCCAGCGTTGTGTTGGAATCAGGAATCAGCCCAGGGAAGTATGATACTTATGGAAGCACAAGGAAGAACTTGAGAATGATACTGCGAAATCTTATTGAAAATGACTCTGGAGTCTATTACTGTGCCACCTGGGCTCGGTTGGTTCAAGATATTTGCTGAAGGGACTAAGCTCATAGTAACTTCGCCTGGTAA
# IGHV3-30 GGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGCTATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTAGAGTGGGTGGCAGTTATATCATATGATGGAAGTAATAAATACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAG
# ||||||||||||||||||||||||||||||||||||||||
# |||||||||| |||||| ||||||||||||||||||||||
# read GGAGGTCCCTAAGACTCCCCTGTGCAGCCTCTGGATTCACTGGTCACCGTCTCCTCAGGTAAG
# |||||||||||||||||||
# IGHJ4 ACTACTTTGACTACTGGGGCCAAGGAACCCTGGTCACCGTCTCCTCAG
......
#include "core/dynprog.h"
void testEndDeletions() {
string v = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
vector<string> seq =
{
"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
"AAAAAAAAGAAAGAAAAGAAAGAAAAGAAAAGAAAGAAAGAAAGAAAGAA",
"AAAAAAAAGAAAGAAAAGAAAGAAAAGAAAAGAAAGAAAGAAAGAAAGGG",
"AAAAAAAAGAAAGAAAAGAAAGAAAAGAAAAGAAAGAAAGAAAGAAAGGA",
"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAA",
"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAGAAGAAGAGAGAGAGGGA",
};
vector<size_t> del_ends = {0, 0, 3, 3, 3, 20};
for (size_t i = 0; i < seq.size(); i++) {
DynProg dp(v, seq[i], DynProg::LocalEndWithSomeDeletions, VDJ);
dp.compute();
TAP_TEST_EQUAL(seq[i].length()-1 - dp.best_j, del_ends[i], TEST_DYNPROG_DEL_END, "dp.best_j = " << dp.best_j <<"\ti=" << i);
}
}
void testDynProg() {
testEndDeletions();
}
......@@ -15,6 +15,7 @@
#include "testWindowsStorage.cpp"
#include "testReadStorage.cpp"
#include "testAutomaton.cpp"
#include "testDynProg.cpp"
int main(void) {
TAP_START(NB_TESTS);
......@@ -35,6 +36,7 @@ int main(void) {
testWindowStorage();
testReadStorage();
testAutomaton();
testDynProg();
TAP_END_TEST_EXIT
}
......@@ -172,6 +172,9 @@ enum {
TEST_BRS_GET_NB_SCORES,
TEST_BRS_GET_BEST_READS,
/* Dynprog */
TEST_DYNPROG_DEL_END,
/* Bugs */
TEST_BUG_SEGMENTATION,
TEST_SEGMENT_POSITION,
......@@ -337,6 +340,10 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ, "Test KmerRepresentativeComputer computations with a required sequence");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REVCOMP, "Test KmerRepresentativeComputer computations on a dataset and its revcomp");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_QUALITY, "Test KmerRepresentativeComputer quality computations");
RECORD_TAP_TEST(TEST_DYNPROG_DEL_END, "Test number of end deletions for Dynamic Programming");
RECORD_TAP_TEST(TEST_BUG_SEGMENTATION, "Test segmentation bug");
RECORD_TAP_TEST(TEST_SEGMENT_POSITION, "Test V,D,J position");
RECORD_TAP_TEST(TEST_KMER_SEGMENT_OVERLAP, "Test kmer segmentation with an overlap");
......