Commit 8f1c1377 authored by Mikaël Salson's avatar Mikaël Salson

kmeraffect: The affectations now have a length.

This doubles the space used by affectations. But this allows to compute the index load
on an index which has several types of seeds. This may also solve the problem
with revcomp with Aho-Corasick index (where affectations were not symmetric).

There is now an additionnal parameter for the affectations.
parent 0fb9cab6
......@@ -190,6 +190,7 @@ void PointerACAutomaton<Info>::insert(const seqtype &sequence, const string &lab
if (seed.empty())
seed = this->seed;
size_t seed_span = seed.length();
size_t seed_w = seed_weight(seed);
for(size_t i = start_indexing ; i + seed_span < end_indexing + 1 ; i++) {
seqtype substr = sequence.substr(i, seed_span);
......@@ -204,11 +205,11 @@ void PointerACAutomaton<Info>::insert(const seqtype &sequence, const string &lab
}
for (seqtype &seq: sequences) {
insert(seq, Info(label, 1));
insert(seq, Info(label, 1, seed_w));
}
if (! Info::hasRevcompSymetry()) {
for (seqtype &seq: sequences_rev) {
insert(seq, Info(label, -1));
insert(seq, Info(label, -1, seed_w));
}
}
}
......
......@@ -31,6 +31,10 @@ char affect_char(const affect_t &affect) {
return (affect.c & ((1 << 7)-1));
}
size_t affect_length(const affect_t &affect) {
return affect.length;
}
// KmerAffect class
bool operator==(const affect_t &a1, const affect_t &a2) {
......@@ -98,8 +102,9 @@ KmerAffect::KmerAffect(const KmerAffect &ka, bool reverse) {
}
KmerAffect::KmerAffect(const string &label,
int strand) {
int strand, size_t length) {
affect.c = label[0];
affect.length = length;
if (strand == 1)
affect.c |= (1 << 7);
}
......@@ -144,6 +149,10 @@ string KmerAffect::getLabel() const {
return string(1, affect_char(affect));
}
unsigned char KmerAffect::getLength() const {
return affect_length(affect);
}
KmerAffect KmerAffect::getUnknown() {
return AFFECT_UNKNOWN;
}
......@@ -172,6 +181,9 @@ string KmerAffect::toStringSigns() const {
return ::toStringSigns(affect);
}
void KmerAffect::setLength(unsigned char length) {
affect.length = length;
}
bool operator==(const KmerAffect &a1, const KmerAffect &a2) {
return a1.affect == a2.affect;
......
......@@ -23,6 +23,7 @@ typedef struct affect_s affect_t;
struct affect_s {
// 7 lsb are the label, the msb is the strand (0 -> -, 1 -> +)
char c;
unsigned char length;
};
/**
......@@ -35,6 +36,11 @@ int affect_strand(const affect_t &affect);
*/
char affect_char(const affect_t &affect);
/**
* @return the length of the kmer associated with the affectation
*/
size_t affect_length(const affect_t &affect);
bool operator==(const affect_t &a1, const affect_t &a2);
bool operator<(const affect_t &a1, const affect_t &a2);
bool operator>(const affect_t &a1, const affect_t &a2);
......@@ -75,9 +81,11 @@ public:
/**
* Construct an affectation as stated by the parameters
* @post affect_strand(affect) == strand AND affect_char(affect) == kmer[0]
* @post affect_strand(affect) == strand AND affect_char(affect) == kmer[0] AND
* affect_length(affect) == length
*/
KmerAffect(const string &label, int strand=1);
KmerAffect(const string &label, int strand, size_t length);
/**
* Add another affectation to the current one.
* @post The current affectation is not modified if the parameter is the same
......@@ -114,6 +122,11 @@ public:
*/
string getLabel() const;
/**
* @return the length of such an affectation
*/
unsigned char getLength() const;
/**
* @return the unknown affectation
*/
......@@ -143,6 +156,8 @@ public:
string toString() const;
string toStringValues()const;
string toStringSigns() const;
void setLength(unsigned char length);
};
......@@ -165,23 +180,23 @@ ostream &operator<<(ostream &os, const KmerAffect &kmer);
* Constant defining any not-unknown affectation
* Could be used by .getIndexLoad(), but now any non-AFFECT_UNKNOWN kmer will work.
*/
const KmerAffect AFFECT_NOT_UNKNOWN = KmerAffect(AFFECT_NOT_UNKNOWN_SYMBOL, 0);
const KmerAffect AFFECT_NOT_UNKNOWN = KmerAffect(AFFECT_NOT_UNKNOWN_SYMBOL, 0, 1);
/**
* Constant defining the unknown affectation (not known yet)
*/
const KmerAffect AFFECT_UNKNOWN = KmerAffect(AFFECT_UNKNOWN_SYMBOL, 0);
const KmerAffect AFFECT_UNKNOWN = KmerAffect(AFFECT_UNKNOWN_SYMBOL, 0, 1);
/**
* Constant defining the ambiguous affectation (many possibilities)
*/
const KmerAffect AFFECT_AMBIGUOUS = KmerAffect(AFFECT_AMBIGUOUS_SYMBOL, 1);
const KmerAffect AFFECT_AMBIGUOUS = KmerAffect(AFFECT_AMBIGUOUS_SYMBOL, 1, 1);
const KmerAffect AFFECT_V = KmerAffect("V", 1);
const KmerAffect AFFECT_J = KmerAffect("J", 1);
const KmerAffect AFFECT_V = KmerAffect("V", 1, 1);
const KmerAffect AFFECT_J = KmerAffect("J", 1, 1);
const KmerAffect AFFECT_V_BWD = KmerAffect("V", -1);
const KmerAffect AFFECT_J_BWD = KmerAffect("J", -1);
const KmerAffect AFFECT_V_BWD = KmerAffect("V", -1, 1);
const KmerAffect AFFECT_J_BWD = KmerAffect("J", -1, 1);
////////////////////////////////////////////////////////////////////////////////////////////////////
......
......@@ -32,7 +32,7 @@ Kmer::Kmer():count(0) {}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
Kmer::Kmer(const string &label, int strand) {
Kmer::Kmer(const string &label, int strand, size_t length) {
count = 1;
}
#pragma GCC diagnostic pop
......
......@@ -21,7 +21,7 @@ public:
/**
* This constructor is used via a IKmerStore<Kmer> index (hence the argument list)
*/
Kmer(const string &label, int strand=1);
Kmer(const string &label, int strand=1, size_t length=0);
Kmer &operator+=(const Kmer &);
......@@ -258,10 +258,10 @@ void IKmerStore<T>::insert(Fasta& input,
insert(input.sequence(r), label, true, keep_only, seed);
}
labels.push_back(make_pair(T(label, 1), input)) ;
labels.push_back(make_pair(T(label, 1, seed_weight(seed)), input)) ;
if (revcomp_indexed && ! T::hasRevcompSymetry()) {
labels.push_back(make_pair(T(label, -1), input)) ;
labels.push_back(make_pair(T(label, -1, seed_weight(seed)), input)) ;
}
}
......@@ -283,6 +283,7 @@ void IKmerStore<T>::insert(const seqtype &sequence,
seed = this->seed;
size_t seed_span = seed.length();
size_t seed_w = seed_weight(seed);
size_t size_indexing = end_indexing - start_indexing;
if (size_indexing > max_size_indexing) {
max_size_indexing = size_indexing;
......@@ -307,13 +308,13 @@ void IKmerStore<T>::insert(const seqtype &sequence,
if (this_kmer.isNull()) {
nb_kmers_inserted++;
}
this_kmer += T(label, strand);
this_kmer += T(label, strand, seed_w);
if (revcomp_indexed && ! T::hasRevcompSymetry()) {
seqtype rc_kmer = spaced(revcomp(substr), seed);
T &this_rc_kmer = this->get(rc_kmer);
if (this_rc_kmer.isNull())
nb_kmers_inserted++;
this_rc_kmer += T(label, -1);
this_rc_kmer += T(label, -1, seed_w);
}
}
}
......
......@@ -490,12 +490,12 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
return ;
} else if (nb_strand[0] > RATIO_STRAND * nb_strand[1]) {
strand = -1;
before = KmerAffect(germline->affect_3, -1);
after = KmerAffect(germline->affect_5, -1);
before = KmerAffect(germline->affect_3, -1, seed_weight(germline->seed));
after = KmerAffect(germline->affect_5, -1, seed_weight(germline->seed));
} else if (nb_strand[1] > RATIO_STRAND * nb_strand[0]) {
strand = 1;
before = KmerAffect(germline->affect_5, 1);
after = KmerAffect(germline->affect_3, 1);
before = KmerAffect(germline->affect_5, 1, seed_weight(germline->seed));
after = KmerAffect(germline->affect_3, 1, seed_weight(germline->seed));
} else {
// Ambiguous information: we have positive and negative strands
// and there is not enough difference to put them apart.
......
......@@ -26,26 +26,26 @@ void testAffectAnalyser1() {
for (int i = 2; i < nb_seq-1; i++) {
// i starts at 2 because AAAA is not found: there is an ambiguity with
// AAAA coming from AAAACAAAACAAAAC or AAAAAAAAAAAAAAA
KmerAffect current_affect(seq[2*i+1], 1);
KmerAffect current_affect(seq[2*i+1], 1, k);
TAP_TEST(kaa.count(current_affect) == 0, TEST_AA_COUNT, "");
TAP_TEST(ckaa.count(current_affect) == 0, TEST_COUNT_AA_COUNT, ckaa.count(current_affect));
TAP_TEST(kaa.first(current_affect) == (int)string::npos, TEST_AA_FIRST, "");
TAP_TEST(kaa.last(current_affect) == (int)string::npos, TEST_AA_LAST, "");
}
for (int i = 0; i < 2; i++) {
KmerAffect current_affect(seq[2*i+1], 1);
KmerAffect current_affect(seq[2*i+1], 1, k);
TAP_TEST(kaa.count(current_affect) == 2, TEST_AA_COUNT, kaa.count(current_affect));
TAP_TEST(ckaa.count(current_affect) == 2, TEST_COUNT_AA_COUNT, ckaa.count(current_affect));
TAP_TEST(kaa.getAffectation(kaa.first(current_affect)) == current_affect, TEST_AA_GET_AFFECT, "");
TAP_TEST(kaa.getAffectation(kaa.last(current_affect)) == current_affect, TEST_AA_GET_AFFECT, "");
}
TAP_TEST(kaa.count(KmerAffect(seq[2*(nb_seq-1)+1], 1)) == 1, TEST_AA_COUNT, "");
TAP_TEST((kaa.first(KmerAffect(seq[2*(nb_seq-1)+1], 1))
== kaa.last(KmerAffect(seq[2*(nb_seq-1)+1], 1)))
TAP_TEST(kaa.count(KmerAffect(seq[2*(nb_seq-1)+1], 1, k)) == 1, TEST_AA_COUNT, "");
TAP_TEST((kaa.first(KmerAffect(seq[2*(nb_seq-1)+1], 1, k))
== kaa.last(KmerAffect(seq[2*(nb_seq-1)+1], 1, k)))
== 1, TEST_AA_FIRST, "");
TAP_TEST(ckaa.max(forbidden) == KmerAffect("C lots of", 1)
|| ckaa.max(forbidden) == KmerAffect("G lots of", 1),
TAP_TEST(ckaa.max(forbidden) == KmerAffect("C lots of", 1, k)
|| ckaa.max(forbidden) == KmerAffect("G lots of", 1, k),
TEST_COUNT_AA_MAX, "max is " << ckaa.max(forbidden));
TAP_TEST(ckaa.max() == KmerAffect::getUnknown(),
......@@ -57,8 +57,8 @@ void testAffectAnalyser1() {
TAP_TEST(kaa.getDistinctAffectations().size() == 5, TEST_AA_GET_DISTINCT_AFFECT, "");
KmerAffect cAffect = KmerAffect(seq[1], 1);
KmerAffect gAffect = KmerAffect(seq[3], 1);
KmerAffect cAffect = KmerAffect(seq[1], 1, k);
KmerAffect gAffect = KmerAffect(seq[3], 1, k);
TAP_TEST(ckaa.countBefore(cAffect, 0) == 0, TEST_COUNT_AA_COUNT_BEFORE, "");
TAP_TEST(ckaa.countBefore(gAffect, 0) == 0, TEST_COUNT_AA_COUNT_BEFORE, "");
TAP_TEST(ckaa.countAfter(cAffect, 10) == 0, TEST_COUNT_AA_COUNT_BEFORE, "");
......@@ -88,8 +88,8 @@ void testAffectAnalyser1() {
TAP_TEST(ckaa.lastMax(cAffect, gAffect) == 11 - (int)index->smallestAnalysableLength() + 1, TEST_COUNT_AA_LAST_MAX, ckaa.lastMax(cAffect, gAffect));
// Test affectation with two affects that are not in the sequence
KmerAffect aAffect = KmerAffect(seq[5], 1);
KmerAffect tAffect = KmerAffect(seq[7], 1);
KmerAffect aAffect = KmerAffect(seq[5], 1, k);
KmerAffect tAffect = KmerAffect(seq[7], 1, k);
TAP_TEST(ckaa.firstMax(aAffect, tAffect) == -1, TEST_COUNT_AA_FIRST_MAX, "");
TAP_TEST(ckaa.lastMax(aAffect, tAffect) == - 1,
TEST_COUNT_AA_LAST_MAX, "");
......@@ -125,7 +125,7 @@ void testAffectAnalyser2() {
TAP_TEST(kaa.getSequence() == "TTTTTGGGGG", TEST_AA_GET_SEQUENCE, "actual: ");
TAP_TEST(ckaa.getSequence() == "TTTTTGGGGG", TEST_AA_GET_SEQUENCE, "actual: " << ckaa.getSequence());
TAP_TEST(kaa.getAffectation(1+k - index->smallestAnalysableLength()) == KmerAffect(seq[2*(nb_seq-1)+1], -1), TEST_AA_GET_AFFECT, "");
TAP_TEST(kaa.getAffectation(1+k - index->smallestAnalysableLength()) == KmerAffect(seq[2*(nb_seq-1)+1], -1, k), TEST_AA_GET_AFFECT, "");
TAP_TEST(kaa.count(kaa.getAffectation(1+k - index->smallestAnalysableLength())) == 1, TEST_AA_GET_AFFECT, "");
TAP_TEST(ckaa.count(kaa.getAffectation(1+k - index->smallestAnalysableLength())) == 1, TEST_COUNT_AA_COUNT, "");
TAP_TEST(kaa.getAffectation(0+k - index->smallestAnalysableLength()) == kaa.getAffectation(10 - index->smallestAnalysableLength()), TEST_AA_GET_AFFECT, "");
......@@ -136,7 +136,7 @@ void testAffectAnalyser2() {
TAP_TEST(kaa.getDistinctAffectations().size() == 3, TEST_AA_GET_DISTINCT_AFFECT, "");
TAP_TEST(ckaa.max(forbidden) == KmerAffect(seq[2*(nb_seq-1)+1], -1),
TAP_TEST(ckaa.max(forbidden) == KmerAffect(seq[2*(nb_seq-1)+1], -1, k),
TEST_COUNT_AA_MAX, "max is " << ckaa.max(forbidden));
TAP_TEST(ckaa.max() == KmerAffect::getUnknown(),
......@@ -169,8 +169,8 @@ void testAffectAnalyserMaxes() {
string sequence = "ACCCCAGGGGGA";
CountKmerAffectAnalyser ckaa(*index, sequence);
KmerAffect cAffect = KmerAffect("C", 1);
KmerAffect gAffect = KmerAffect("G", 1);
KmerAffect cAffect = KmerAffect("C", 1, k);
KmerAffect gAffect = KmerAffect("G", 1, k);
set<KmerAffect> forbidden;
forbidden.insert(KmerAffect::getAmbiguous());
......
......@@ -2,15 +2,19 @@
#include <core/kmeraffect.h>
void testAffect() {
affect_t Vminus = {'V'}, Vplus = {'V'};
affect_t Vminus = {'V', 1}, Vplus = {'V', 2};
Vplus.c |= 1 << 7;
affect_t Jplus = {'J'};
affect_t Jplus = {'J', 3};
Jplus.c |= 1 << 7;
TAP_TEST(affect_strand(Vminus) == -1, TEST_AFFECT_STRAND, "");
TAP_TEST(affect_strand(Vplus) == 1, TEST_AFFECT_STRAND, "");
TAP_TEST(affect_length(Vminus) == 1, TEST_AFFECT_LENGTH, "");
TAP_TEST(affect_length(Vplus) == 2, TEST_AFFECT_LENGTH, "");
TAP_TEST(affect_length(Jplus) == 3, TEST_AFFECT_LENGTH, "");
TAP_TEST(affect_char(Vminus) == 'V', TEST_AFFECT_CHAR, "");
TAP_TEST(affect_char(Vplus) == 'V', TEST_AFFECT_CHAR, "");
TAP_TEST(affect_char(Jplus) == 'J', TEST_AFFECT_CHAR, "");
......@@ -54,13 +58,13 @@ void testAffect() {
}
void testKmerAffectClass() {
affect_t Vminus = {'V'}, Vplus = {'V'};
affect_t Vminus = {'V', 4}, Vplus = {'V', 4};
Vplus.c |= 1 << 7;
affect_t Jplus = {'J'};
affect_t Jplus = {'J', 4};
Jplus.c |= 1 << 7;
KmerAffect KAVp("V", 1);
KmerAffect KAVm("V", -1);
KmerAffect KAJp("J", 1);
KmerAffect KAVp("V", 1, 4);
KmerAffect KAVm("V", -1, 4);
KmerAffect KAJp("J", 1, 4);
TAP_TEST(KAVp.affect == Vplus, TEST_KMERAFFECT_CONSTRUCTOR, "");
TAP_TEST(KAVm.affect == Vminus, TEST_KMERAFFECT_CONSTRUCTOR, "");
......@@ -72,6 +76,8 @@ void testKmerAffectClass() {
TAP_TEST(copy2.getLabel() == KAVp.getLabel(), TEST_KMERAFFECT_CONSTRUCTOR_COPY_REVERSE, "");
TAP_TEST(copy1.getStrand() == KAVp.getStrand(), TEST_KMERAFFECT_CONSTRUCTOR_COPY_REVERSE, "");
TAP_TEST(copy2.getStrand() == -KAVp.getStrand(), TEST_KMERAFFECT_CONSTRUCTOR_COPY_REVERSE, "");
TAP_TEST(copy1.getLength() == KAVp.getLength(), TEST_KMERAFFECT_CONSTRUCTOR_COPY_REVERSE, "");
TAP_TEST(copy2.getLength() == KAVp.getLength(), TEST_KMERAFFECT_CONSTRUCTOR_COPY_REVERSE, "");
KmerAffect test = KAVp;
TAP_TEST(test.affect == KAVp.affect, TEST_KMERAFFECT_AFFECTATION, "");
......@@ -117,9 +123,9 @@ void testKmerAffectClass() {
void testKmerAffectComparison() {
KmerAffect Vminus("V", -1);
KmerAffect Vplus("V", 1);
KmerAffect Jplus("J", 1);
KmerAffect Vminus("V", -1, 4);
KmerAffect Vplus("V", 1, 4);
KmerAffect Jplus("J", 1, 4);
TAP_TEST(! (Vminus == Vplus), TEST_KMERAFFECT_COMPARISON, "");
TAP_TEST(Vminus != Vplus, TEST_KMERAFFECT_COMPARISON, "");
......
......@@ -338,8 +338,7 @@ void testProbability() {
germline.new_index();
germline.finish();
TAP_TEST(germline.index->getIndexLoad() == .75, TEST_GET_INDEX_LOAD, "");
TAP_TEST(germline.index->getIndexLoad(KmerAffect(germline.affect_5, 1, 4)) == .5, TEST_GET_INDEX_LOAD, "");
TAP_TEST(germline.index->getIndexLoad(AFFECT_NOT_UNKNOWN) == .75, TEST_GET_INDEX_LOAD, ".getIndexLoad with AFFECT_NOT_UNKNOWN");
TAP_TEST(germline.index->getIndexLoad(AFFECT_UNKNOWN) == .25, TEST_GET_INDEX_LOAD, ".getIndexLoad with AFFECT_UNKNOWN");
......
......@@ -53,6 +53,7 @@ enum {
/* KmerAffect */
TEST_AFFECT_STRAND,
TEST_AFFECT_CHAR,
TEST_AFFECT_LENGTH,
TEST_AFFECT_COMPARISON,
TEST_AFFECT_TO_STRING,
TEST_AFFECT_OUT,
......@@ -217,6 +218,7 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_AC_GET_RESULTS, "Testing getResults with Aho-Corasick");
RECORD_TAP_TEST(TEST_AFFECT_STRAND, "affect_strand()");
RECORD_TAP_TEST(TEST_AFFECT_LENGTH, "affect_length()");
RECORD_TAP_TEST(TEST_AFFECT_CHAR, "affect_char()");
RECORD_TAP_TEST(TEST_AFFECT_COMPARISON, "Comparison operators for affect_t");
RECORD_TAP_TEST(TEST_AFFECT_TO_STRING, "toString() for affect_t");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment