diff --git a/algo/core/filter.cpp b/algo/core/filter.cpp index 0772554d4c69e0aa7d72a02573c4d03cb8c1d8b7..33c28c104c35890711a10389bd1edcd15aa5166a 100644 --- a/algo/core/filter.cpp +++ b/algo/core/filter.cpp @@ -1,4 +1,5 @@ #include "filter.h" +#include "math.hpp" FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed) : originalBioReader(origin){ this->filtered_sequences_nb = 0; @@ -57,7 +58,7 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed){ based on it. */ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton( - seqtype &seq, int kmer_threshold){ + seqtype &seq, int kmer_threshold, int pvalue){ BioReader result; map mapAho; @@ -105,21 +106,32 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton( // Use a set to use the comparator and sort function set, Comparator> setOfWords(mapAho.begin(), mapAho.end(), compFunctor); // Iterate over the pair and not the map - int nbKmers = 0, previousOccurences = 0; + int nbKmers = 0; + int nb_kmers_limit = -1; // Limit number of kmers, defined when the last gene of interest is reached + for(pair element : setOfWords){ // Add corresponding sequences to the BioReader if(!element.first.isGeneric()){ continue; } - if(nbKmers == kmer_threshold && previousOccurences == element.second){ - //Keep the same amount of genes + if(nbKmers == kmer_threshold && nb_kmers_limit <= element.second){ + // We have reached our limit of number of genes recovered but we + // continue taking sequences are they have a similar number of + // matching k-mers. }else if(nbKmers < kmer_threshold){ nbKmers++; }else{ break; } transferBioReaderSequences(originalBioReader, result, element.first); - previousOccurences = element.second; + if (nbKmers == kmer_threshold && nb_kmers_limit == -1) { + int maxlen = getSizeLongestTransferredSequence(result, element.first); + nb_kmers_limit = compute_nb_kmers_limit(element.first.getLength(), element.second, maxlen, pvalue); + if (nb_kmers_limit == 0) { + this->filtered_sequences_nb += originalBioReader.size(); + return originalBioReader; + } + } } } this->filtered_sequences_nb += (result.size () == 0) ? originalBioReader.size() : result.size(); @@ -138,6 +150,22 @@ void FilterWithACAutomaton::transferBioReaderSequences(const BioReader &src, Bio } } +int FilterWithACAutomaton::getSizeLongestTransferredSequence(const BioReader &reader, KmerAffect k) const{ + char asciiChar = k.getLabel().at(0); + unsigned int asciiNum = int(asciiChar); + + if(asciiNum > indexes->size() || !k.isGeneric()){ + throw invalid_argument("Incorrect K-mer transmitted."); + } + + size_t longest = 0; + for(int i = 0; i < indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER + 1) - indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER); ++i){ + if (longest < reader.sequence(reader.size() - i - 1).length()) + longest = reader.sequence(reader.size() - i - 1).length(); + } + return longest; +} + vector* FilterWithACAutomaton::getIndexes() const{ return this->indexes; } diff --git a/algo/core/filter.h b/algo/core/filter.h index 98a5179928c3ad00cccb7f4037fe73a35dc7f711..97fe741d478dfbcd7a73c3ad1c86bf1ac324b758 100644 --- a/algo/core/filter.h +++ b/algo/core/filter.h @@ -22,7 +22,7 @@ class FilterWithACAutomaton { ~FilterWithACAutomaton(); - /* + /** This function will filter a BioReader @param idxAho: A pointer to a pair containing an int vector pointer and an AbstractACAutomaton pointer parametrized by KmerAffect. @@ -39,9 +39,11 @@ class FilterWithACAutomaton { it will filter on the "kmer_threshold" number of K-mers. For Example if kmer_threshold = 10, it will filter on the 10 most significant K-mers returned by getMultiResults. - */ + @param pvalue: The pvalue to be used for determining the minimal number of kmers. + This pvalu must actually be given as an integer (90, for .9, 999 for .999…) + */ BioReader filterBioReaderWithACAutomaton( - seqtype &seq, int kmer_threshold = NO_LIMIT_VALUE); + seqtype &seq, int kmer_threshold = NO_LIMIT_VALUE, int pvalue=999); /* This function takes a BioReader as a parameter and returns a couple containing an int vector pointer and an automaton @@ -114,5 +116,12 @@ class FilterWithACAutomaton { void transferBioReaderSequences(const BioReader &src, BioReader &dst, const KmerAffect k) const; friend ostream &operator<<(ostream&, const FilterWithACAutomaton&); + + private: + /** + * Get the size of the longest sequence among the sequences that were just + * transferred to the BioReader reader. + */ + int getSizeLongestTransferredSequence(const BioReader &reader, KmerAffect k) const; }; #endif diff --git a/algo/core/math.cpp b/algo/core/math.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f987a26b73ebfa55f8dba3d7cd3f701c3abc730b --- /dev/null +++ b/algo/core/math.cpp @@ -0,0 +1,26 @@ +#include "math.hpp" +#include +#include +#include +#include +#include "tools.h" + +using namespace std; + +const static map Z_SCORES = {{90, 1.2816}, {95, 1.6449}, {99, 2.3263}, {999, 3.0902}}; + +int compute_nb_kmers_limit(int kmer_size, int nb_occ, int sequence_length, int p_value) { + if (Z_SCORES.count(p_value) == 0) { + throw invalid_argument("You can't use a p_value of "+to_string(p_value)); + } + float empirical_proba = nb_occ * 1. / (sequence_length - kmer_size + 1); + if (empirical_proba > 1) + return nb_occ; + + float min_proba = empirical_proba - Z_SCORES.at(p_value) * sqrt(empirical_proba * (1 - empirical_proba) / (sequence_length - kmer_size + 1)); + if (min_proba <= 0) + return 0; + + // “Converts” the probability to a number of k-mers + return min_proba * (sequence_length - kmer_size + 1); +} diff --git a/algo/core/math.hpp b/algo/core/math.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4b4e5a9577257fdc1aed0475df2a0749e3cd59a2 --- /dev/null +++ b/algo/core/math.hpp @@ -0,0 +1,19 @@ +#ifndef MATH_HPP +#define MATH_HPP + +/** + * Compute the minimal number of k-mers that is not significantly different from nb_occ + * Given that the k-mers used are those used for `affect` on the sequence `sequence`. + * @param kmer_size: size of the kmer + * @param nb_occ: Number of occurrences found on this sequence + * @param sequence_length: The sequence length + * @param p_value: The p_value to be used. The lower the p_value, the lower the value will be returned + * Beware! The p_value must be given as an integer to prevent issues with float equalities. + * Therefore a pvalue of .95 should be given as 95 and a p-value of .999 should be given as 999. + * Not all p-values are possible. Please refer to the keys of Z_SCORES. + * @return The lower bound of the confidence interval assumuing that the center of the confidence + * interval is given by `nb_occ`. In all cases the returned value will be >= 0. + */ +int compute_nb_kmers_limit(int kmer_size, int nb_occ, int sequence_length, int p_value=99); + +#endif diff --git a/algo/tests/should-get-tests/stanford-Z.should-get b/algo/tests/should-get-tests/stanford-Z.should-get index a157c801d4694e84862a94bc5365cbdd917deab5..d2de3c338807c5703f3c7f90144bbc18dae07361 100644 --- a/algo/tests/should-get-tests/stanford-Z.should-get +++ b/algo/tests/should-get-tests/stanford-Z.should-get @@ -6,4 +6,4 @@ $ Clone 13 is correctly analyzed $ Statistics on -Z 1:Statistics on clone analysis -rb1: IGH 1[5-7][0-9]{2}/ 1[0-2][0-9]{3} 15..% +rb1: IGH 3[0-2][0-9]{2}/ 1[0-2][0-9]{3} 28..% diff --git a/algo/tests/unit-tests/testFilter.cpp b/algo/tests/unit-tests/testFilter.cpp index 61e1f688c683bed783b43857974fc04f5ee8ba97..ed0ec8b228c1c2f7308c27c5a384fddf753dd07c 100644 --- a/algo/tests/unit-tests/testFilter.cpp +++ b/algo/tests/unit-tests/testFilter.cpp @@ -626,6 +626,54 @@ void testOriginalBioReaderIsReturned(){ delete f; } +void testPvalueRoleInFiltration() { + BioReader germline; + // seq+endseq are a substring of a de Bruijn sequence for k=4 + // (ie. all k-mers are different) + // http://www.hakank.org/comb/debruijn.cgi?k=4&n=4 + string seq = "ACTTAGAGATAGCCAGCGAGCTAGGCAGGGAGGTAGTCAGTGAGTTATA"; + string endseq = "TCCATCGATCTATGCATGGATGTATTCATTG"; + int nb_seq = 10; + // Add sequences with an increasing number of As + for (int i = 0; i < nb_seq; i++) { + // We add the Ts to have a long enough reference sequence, otherwise it + // would bias the p-value computation + germline.add({"seq"+to_string(i), "seq"+to_string(i), seq.substr(0, 10 + i*4)+"TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "", 0}); + } + germline.add({"seq"+to_string(nb_seq), "seq"+to_string(nb_seq), seq+"TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "", 0}); + + FilterWithACAutomaton filter(germline, "####"); + + cerr << "First filtering" << endl; + string query = seq + endseq; + // seq have 49 nt, so about 46 k-mers should match. + + // With a 90% confidence interval, we will get sequences with ≥40 k-mers + // Thus we'll get + // seq10 (whole sequence) + // seq9 46nt (without T homopolymer) → 42 k-mers + BioReader result = filter.filterBioReaderWithACAutomaton(query, 1, 90); + TAP_TEST_EQUAL(result.size(), 2, TEST_FILTER_BIOREADER_PVALUE, ""); + TAP_TEST_EQUAL(result.label(0), "seq10", TEST_FILTER_BIOREADER_PVALUE, ""); + TAP_TEST_EQUAL(result.label(1), "seq9", TEST_FILTER_BIOREADER_PVALUE, ""); + + cerr << "Second filtering" << endl; + // The p-value should give us a threshold around 32 k-mers, which could be reached first for + // sequence seq7 → 4 sequences in total + result = filter.filterBioReaderWithACAutomaton(query, 1, 999); + TAP_TEST_EQUAL(result.size(), 4, TEST_FILTER_BIOREADER_PVALUE, ""); + TAP_TEST_EQUAL(result.label(3), "seq7", TEST_FILTER_BIOREADER_PVALUE, ""); + + // The third sequence with most kmers will be seq8 + // which is of length 42nt → 39 k-mers + // With a 90% confidence interval, we will get sequences with ≥33 k-mers + // → from seq7 (38nt and 35 k-mers) to seq10 + result = filter.filterBioReaderWithACAutomaton(query, 3, 90); + TAP_TEST_EQUAL(result.size(), 4, TEST_FILTER_BIOREADER_PVALUE, ""); + TAP_TEST_EQUAL(result.label(3), "seq7", TEST_FILTER_BIOREADER_PVALUE, ""); + +} + void testFilter(){ testAutomatonBuilderFilteringBioReader(); testFilterBioReaderWithACAutomaton(); @@ -634,4 +682,5 @@ void testFilter(){ testExAequoKmersWhenSignificantParameter(); testTransferBioReaderSequences(); testOriginalBioReaderIsReturned(); + testPvalueRoleInFiltration(); } diff --git a/algo/tests/unit-tests/testMath.cpp b/algo/tests/unit-tests/testMath.cpp new file mode 100644 index 0000000000000000000000000000000000000000..59c3f28f19ef43dd9cf8099cac2f0640be81f267 --- /dev/null +++ b/algo/tests/unit-tests/testMath.cpp @@ -0,0 +1,20 @@ +#include + +void testComputeNbKmers() { + // When all k-mers match, probability of error is 0, thus we return the same + // number of occurrences as in input + TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 99), 100, TEST_MATH_LIMIT_KMERS,""); + TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 90), 100, TEST_MATH_LIMIT_KMERS,""); + TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 95), 100, TEST_MATH_LIMIT_KMERS,""); + TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 999), 100, TEST_MATH_LIMIT_KMERS,""); + + + TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 10, 109, 999), 0, TEST_MATH_LIMIT_KMERS,""); + TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 1, 109, 999), 0, TEST_MATH_LIMIT_KMERS,""); + TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 20, 109, 999), 7, TEST_MATH_LIMIT_KMERS,""); + +} + +void testMath() { + testComputeNbKmers(); +} diff --git a/algo/tests/unit-tests/tests.cpp b/algo/tests/unit-tests/tests.cpp index c1b9ae9f2964ffd5f86eabb27ec55fa8e6156eed..d6b4013d7c8f512ab6ad2b84048e782779ad7746 100644 --- a/algo/tests/unit-tests/tests.cpp +++ b/algo/tests/unit-tests/tests.cpp @@ -16,6 +16,7 @@ #include "testWindowsStorage.cpp" #include "testReadStorage.cpp" #include "testAutomaton.cpp" +#include "testMath.cpp" int main(void) { TAP_START(NB_TESTS); @@ -36,6 +37,7 @@ int main(void) { testWindowStorage(); testReadStorage(); testAutomaton(); + testMath(); TAP_END_TEST_EXIT } diff --git a/algo/tests/unit-tests/tests.h b/algo/tests/unit-tests/tests.h index 369e7fd755bfae5a23cddf7ffe72b4e640967ff4..8600eef8e09828ba53709cb2148a150ecb6823a0 100644 --- a/algo/tests/unit-tests/tests.h +++ b/algo/tests/unit-tests/tests.h @@ -27,6 +27,7 @@ enum { /* Filter tests */ TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER, TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, + TEST_FILTER_BIOREADER_PVALUE, TEST_CREATE_SEQUENCE_LABEL_FULL, TEST_CREATE_SEQUENCE_LABEL, @@ -178,6 +179,9 @@ enum { TEST_BRS_GET_NB_SCORES, TEST_BRS_GET_BEST_READS, + /* Math */ + TEST_MATH_LIMIT_KMERS, + /* Bugs */ TEST_BUG_SEGMENTATION, TEST_SEGMENT_POSITION, @@ -217,6 +221,7 @@ inline void declare_tests() { "Check the automaton and the index vector produced while filtering a BioReader"); RECORD_TAP_TEST(TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON, "Filter a BioReader using the getMultiResults of an automaton"); + RECORD_TAP_TEST(TEST_FILTER_BIOREADER_PVALUE, "Filtering BioReader with p-value"); RECORD_TAP_TEST(TEST_CREATE_SEQUENCE_LABEL_FULL, "create_sequence: label_full field"); RECORD_TAP_TEST(TEST_CREATE_SEQUENCE_LABEL, "create_sequence: label field"); @@ -350,6 +355,7 @@ inline void declare_tests() { RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE, "Test KmerRepresentativeComputer computations"); RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ, "Test KmerRepresentativeComputer computations with a required sequence"); RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REVCOMP, "Test KmerRepresentativeComputer computations on a dataset and its revcomp"); + RECORD_TAP_TEST(TEST_MATH_LIMIT_KMERS, "Test computation on limit of number of k-mers"); RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_QUALITY, "Test KmerRepresentativeComputer quality computations"); RECORD_TAP_TEST(TEST_BUG_SEGMENTATION, "Test segmentation bug"); RECORD_TAP_TEST(TEST_SEGMENT_POSITION, "Test V,D,J position"); diff --git a/algo/vidjil.cpp b/algo/vidjil.cpp index b64af4ba57e87fdb2fb1411ecf2570ec73594432..1fe282d50a84a7f71b797091ff9d6d31930e3111 100644 --- a/algo/vidjil.cpp +++ b/algo/vidjil.cpp @@ -110,7 +110,7 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ; #define DEFAULT_MAX_AUDITIONED 2000 #define DEFAULT_RATIO_REPRESENTATIVE 0.5 -#define DEFAULT_KMER_THRESHOLD 3 +#define DEFAULT_KMER_THRESHOLD 1 #define DEFAULT_EPSILON 0 #define DEFAULT_MINPTS 10