Une MAJ de sécurité est nécessaire sur notre version actuelle. Elle sera effectuée lundi 02/08 entre 12h30 et 13h. L'interruption de service devrait durer quelques minutes (probablement moins de 5 minutes).

Commit 2099c49f authored by Mikaël Salson's avatar Mikaël Salson Committed by Mathieu Giraud
Browse files

tools: Remove Ns at the end of a sequence

We remove the longest prefix (resp. suffix) ending with a
N (resp. starting) whose N content is greater or equal to
RATIO_TOO_MANY_N.

This is particularly useful for the representative: because of spaced
seeds we may have N in the representative, especially on the ends of the
representative (when only one seed matched at the extremity of the
sequence).
parent 8d340098
......@@ -3,6 +3,7 @@
#include "read_score.h"
#include "read_chooser.h"
#include "stats.h"
#include "tools.h"
#include <cstring>
#include <iostream>
......@@ -208,6 +209,7 @@ void KmerRepresentativeComputer::compute() {
if (!cover_longest_run[i])
representative.sequence[i] = 'N';
}
trimSequence(representative.sequence, pos_longest_run, length_longest_run);
}
delete [] cover_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
......@@ -273,3 +275,4 @@ bool KmerRepresentativeComputer::tryToExtendRepresentative(const vector<Kmer> co
}
return was_extended;
}
......@@ -10,6 +10,9 @@ using namespace std;
#define DEFAULT_STABILITY_LIMIT 30
#define THRESHOLD_BAD_COVERAGE .5 /* Threshold below which the representatie
coverage is considered bad */
/**
* Compute a representative sequence from a list of sequences.
* The sequences are supposed to share a common juction.
......@@ -137,6 +140,11 @@ public:
bool *cover,
int direction);
/**
* Remove the ends of the representative if they contain too many N.
* The values of start_pos and length will be updated correspondingly.
*/
void trimRepresentative(string &sequence, size_t &start_pos, size_t &length);
};
#endif
......@@ -291,6 +291,43 @@ double nChoosek(unsigned n, unsigned k)
return nChoosek_stored[n][k];
}
void trimSequence(string &sequence, size_t &start_pos, size_t &length) {
size_t n = length;
size_t number_of_N = 1;
size_t start;
size_t i = start = sequence.find('N', start_pos);
size_t previous_N = i;
// Remove N at the start
for (; i < start_pos+n
&& number_of_N*1. / (i-start_pos+1) > RATIO_TOO_MANY_N;
i = sequence.find('N', i+1), number_of_N++) {
previous_N = i;
}
// We had at least one N fulfilling our conditions
if (previous_N != i) {
previous_N++; // We ignore the N
length -= (previous_N - start_pos);
start_pos = previous_N;
}
// Remove N at the end
i = start = sequence.rfind('N', start_pos + length - 1);
number_of_N = 1;
previous_N = i;
for (; i > start_pos
&& number_of_N * 1. / ((start_pos + length - 1) - i) > RATIO_TOO_MANY_N;
i = sequence.rfind('N', i-1), number_of_N++) {
previous_N = i;
}
if (previous_N != i) {
previous_N--;
length -= (start_pos + length - 1) - previous_N;
}
}
void output_label_average(ostream &out, string label, long long int nb, double average, int precision)
{
out << " ";
......
......@@ -13,6 +13,12 @@
#define MAX_SEED_SIZE 50 // Spaced seed buffer
#define FIRST_POS 1 // Numbering of the base pairs for external output
#define RATIO_TOO_MANY_N .3 /* Ratio above which we consider there are too
many Ns in the sequence and the
corresponding subsequence should be
trimmed */
#include <sstream>
#include <iostream>
#include <iomanip>
......@@ -194,6 +200,14 @@ extern double nChoosek_stored[NB_N_CHOOSE_K_STORED][NB_N_CHOOSE_K_STORED];
*/
double nChoosek(unsigned n, unsigned k);
/**
* Remove the ends of the sequence if they contain too many N.
* The sequence will be considered starting at position start_pos
* for length letters.
* The values will be updated correspondingly after trimming.
*/
void trimSequence(string &sequence, size_t &start_pos, size_t &length);
const Sequence NULL_SEQUENCE = create_sequence("", "", "NULL", "");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment