Commit 8d340098 authored by Mikaël Salson's avatar Mikaël Salson Committed by Mathieu Giraud
Browse files

representative.cpp: Use multiple seeds to compute representative

Since spaced seeds may be used, we need to compute the positions that
are covered by actual letters (#). Some positions may be missing in
which case we will put a N in the representative.
parent 83ff2886
......@@ -87,22 +87,29 @@ void KmerRepresentativeComputer::compute() {
// First create an index on the set of reads
IKmerStore<Kmer> *index = new MapKmerStore<Kmer>(getSeed(), revcomp);
string seeds[] = {"##########", // The first seed should be a contiguous seed.
"##-##-##-##-##-",
"#-##-##-##-##-#",
"-##-##-##-##-##",
"--#--#--#--#--#--#--#--#--#--#"};
size_t nb_seeds = 5;
// Add sequences to the index, allowing extended nucleotides (false)
for (list<Sequence>::iterator it=sequences.begin(); it != sequences.end(); ++it) {
index->insert(it->sequence, it->label, false);
for (size_t i = 0; i < nb_seeds; i++)
index->insert(it->sequence, it->label, false, 0, seeds[i]);
}
size_t max = sequences.size();
// Create a read chooser to have the sequences sorted by length
ReadQualityScore *rlc = new ReadQualityScore();
ReadChooser rc(sequences, *rlc);
delete rlc;
delete rlc;
// Traverse the sequences to get the desired representative
size_t pos_longest_run = 0;
size_t length_longest_run = 0;
size_t seq_index_longest_run = 1;
Sequence sequence_longest_run;
bool *cover_longest_run = NULL;
int sequence_used_for_quality = 0;
int window_quality_sum [required.length()];
memset(window_quality_sum, 0, required.length()*sizeof(int));
......@@ -139,27 +146,41 @@ void KmerRepresentativeComputer::compute() {
continue;
}
bool cover[sequence.sequence.size()];
memset(cover, false, sequence.sequence.size()*sizeof(bool));
size_t pos_end_required = pos_required + required.length();
vector<Kmer> counts = index->getResults(sequence.sequence);
memset(&cover[pos_required], true, required.length()*sizeof(bool));
vector<Kmer> counts[nb_seeds];
for (size_t i = 0; i < nb_seeds; i++)
counts[i] = index->getResults(sequence.sequence, false, seeds[i]);
size_t length_run = 0;
size_t i = pos_required;
bool was_extended = true;
// Extend to the left, starting from 'pos_required'
while (i > 0 && isSufficienlyExpressed(counts[i-1].count, max)) {
while (i > 0 && was_extended) {
was_extended = tryToExtendRepresentative(counts, seeds, nb_seeds, i, cover, -1);
if (was_extended) {
i--;
length_run++;
}
}
// Extend to the right, starting from 'pos_required'
for (i = pos_required; i < counts.size(); i++) {
while (i < counts.size() &&
isSufficienlyExpressed(counts[i].count, max)) {
length_run++;
i = pos_required ;
was_extended = true;
while (i+1 < counts[0].size() && was_extended) {
was_extended = tryToExtendRepresentative(counts, seeds, nb_seeds, i, cover, 1);
if (was_extended) {
i++;
length_run++;
}
}
if (length_run)
// Take into account the whole k-mer, not just the starting positions
length_run += k - 1;
......@@ -168,17 +189,27 @@ void KmerRepresentativeComputer::compute() {
pos_longest_run = i - (length_run - k + 1);
sequence_longest_run = sequence;
seq_index_longest_run = seq;
if (cover_longest_run)
delete [] cover_longest_run;
cover_longest_run = new bool[sequence.sequence.size()];
memcpy(cover_longest_run, cover, sizeof(bool) * sequence.sequence.size());
}
// We have a requirement (ie. a non empty string). We reached it, exit.
if (pos_required != pos_end_required)
break;
length_run = 0;
}
}
if (length_longest_run) {
is_computed = true;
representative = sequence_longest_run;
if (nb_seeds > 1) {
for (size_t i = 0; i < sequence_longest_run.sequence.size(); i++) {
if (!cover_longest_run[i])
representative.sequence[i] = 'N';
}
}
delete [] cover_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
representative.label = representative.label + "-[" + string_of_int(pos_longest_run) + ","
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]"
......@@ -213,3 +244,32 @@ void KmerRepresentativeComputer::compute() {
}
delete index;
}
bool KmerRepresentativeComputer::tryToExtendRepresentative(const vector<Kmer> counts[],
string seeds[],
size_t nb_seeds,
size_t i,
bool *cover,
int direction) {
bool was_extended = false;
for (size_t current_seed = 0; current_seed < nb_seeds && ! was_extended; current_seed++) {
if (isSufficienlyExpressed(counts[current_seed][i+direction].count, sequences.size())) {
i += direction;
if (nb_seeds > 1) {
size_t seed_length = seeds[current_seed].size();
for (size_t pos = 0; pos < seed_length; pos++)
cover[i+pos] |= (seeds[current_seed][pos] == '#');
} else {
if (direction == -1)
cover[i] = true;
else {
cover[i + seeds[current_seed].size() - 1] = true;
}
}
was_extended = true;
}
}
return was_extended;
}
......@@ -4,6 +4,7 @@
#include <cassert>
#include <list>
#include "fasta.h"
#include "kmerstore.h"
using namespace std;
......@@ -121,6 +122,20 @@ public:
* @pre setCoverageReferenceLength() must have been called previously
*/
void compute();
private:
/**
* Check if we can extend the representative to the left (direction == -1)
* or to the right (direction == 1), given the kmer counts, the seeds used
* and starting from position i.
* The cover is updated accordingly if the representative is extended.
*/
bool tryToExtendRepresentative(const vector<Kmer> counts[],
string seeds[],
size_t nb_seeds,
size_t i,
bool *cover,
int direction);
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment