Commit 93c0fcf0 authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-a/3225-filtering-with-close-probability' into 'dev'

Filtering and take into account close sequences

Closes #3225

See merge request !268
parents f1933a2b c8021564
Pipeline #43200 failed with stages
in 6 minutes and 27 seconds
#include "filter.h"
#include "math.hpp"
FilterWithACAutomaton::FilterWithACAutomaton(BioReader &origin, string seed) : originalBioReader(origin){
this->filtered_sequences_nb = 0;
......@@ -57,7 +58,7 @@ void FilterWithACAutomaton::buildACAutomatonToFilterBioReader(string seed){
based on it.
*/
BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
seqtype &seq, int kmer_threshold){
seqtype &seq, int kmer_threshold, int pvalue){
BioReader result;
map<KmerAffect, int> mapAho;
......@@ -105,21 +106,32 @@ BioReader FilterWithACAutomaton::filterBioReaderWithACAutomaton(
// Use a set to use the comparator and sort function
set<pair<KmerAffect, int>, Comparator> setOfWords(mapAho.begin(), mapAho.end(), compFunctor);
// Iterate over the pair and not the map
int nbKmers = 0, previousOccurences = 0;
int nbKmers = 0;
int nb_kmers_limit = -1; // Limit number of kmers, defined when the last gene of interest is reached
for(pair<KmerAffect, int> element : setOfWords){
// Add corresponding sequences to the BioReader
if(!element.first.isGeneric()){
continue;
}
if(nbKmers == kmer_threshold && previousOccurences == element.second){
//Keep the same amount of genes
if(nbKmers == kmer_threshold && nb_kmers_limit <= element.second){
// We have reached our limit of number of genes recovered but we
// continue taking sequences are they have a similar number of
// matching k-mers.
}else if(nbKmers < kmer_threshold){
nbKmers++;
}else{
break;
}
transferBioReaderSequences(originalBioReader, result, element.first);
previousOccurences = element.second;
if (nbKmers == kmer_threshold && nb_kmers_limit == -1) {
int maxlen = getSizeLongestTransferredSequence(result, element.first);
nb_kmers_limit = compute_nb_kmers_limit(element.first.getLength(), element.second, maxlen, pvalue);
if (nb_kmers_limit == 0) {
this->filtered_sequences_nb += originalBioReader.size();
return originalBioReader;
}
}
}
}
this->filtered_sequences_nb += (result.size () == 0) ? originalBioReader.size() : result.size();
......@@ -138,6 +150,22 @@ void FilterWithACAutomaton::transferBioReaderSequences(const BioReader &src, Bio
}
}
int FilterWithACAutomaton::getSizeLongestTransferredSequence(const BioReader &reader, KmerAffect k) const{
char asciiChar = k.getLabel().at(0);
unsigned int asciiNum = int(asciiChar);
if(asciiNum > indexes->size() || !k.isGeneric()){
throw invalid_argument("Incorrect K-mer transmitted.");
}
size_t longest = 0;
for(int i = 0; i < indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER + 1) - indexes->at(asciiNum - SPECIFIC_KMERS_NUMBER); ++i){
if (longest < reader.sequence(reader.size() - i - 1).length())
longest = reader.sequence(reader.size() - i - 1).length();
}
return longest;
}
vector<int>* FilterWithACAutomaton::getIndexes() const{
return this->indexes;
}
......
......@@ -22,7 +22,7 @@ class FilterWithACAutomaton {
~FilterWithACAutomaton();
/*
/**
This function will filter a BioReader
@param idxAho: A pointer to a pair containing an int vector pointer and
an AbstractACAutomaton pointer parametrized by KmerAffect.
......@@ -39,9 +39,11 @@ class FilterWithACAutomaton {
it will filter on the "kmer_threshold" number of K-mers. For
Example if kmer_threshold = 10, it will filter on the 10 most
significant K-mers returned by getMultiResults.
*/
@param pvalue: The pvalue to be used for determining the minimal number of kmers.
This pvalu must actually be given as an integer (90, for .9, 999 for .999…)
*/
BioReader filterBioReaderWithACAutomaton(
seqtype &seq, int kmer_threshold = NO_LIMIT_VALUE);
seqtype &seq, int kmer_threshold = NO_LIMIT_VALUE, int pvalue=999);
/*
This function takes a BioReader as a parameter and returns
a couple containing an int vector pointer and an automaton
......@@ -114,5 +116,12 @@ class FilterWithACAutomaton {
void transferBioReaderSequences(const BioReader &src, BioReader &dst, const KmerAffect k) const;
friend ostream &operator<<(ostream&, const FilterWithACAutomaton&);
private:
/**
* Get the size of the longest sequence among the sequences that were just
* transferred to the BioReader reader.
*/
int getSizeLongestTransferredSequence(const BioReader &reader, KmerAffect k) const;
};
#endif
#include "math.hpp"
#include <map>
#include <cmath>
#include <cassert>
#include <string>
#include "tools.h"
using namespace std;
const static map<int, float> Z_SCORES = {{90, 1.2816}, {95, 1.6449}, {99, 2.3263}, {999, 3.0902}};
int compute_nb_kmers_limit(int kmer_size, int nb_occ, int sequence_length, int p_value) {
if (Z_SCORES.count(p_value) == 0) {
throw invalid_argument("You can't use a p_value of "+to_string(p_value));
}
float empirical_proba = nb_occ * 1. / (sequence_length - kmer_size + 1);
if (empirical_proba > 1)
return nb_occ;
float min_proba = empirical_proba - Z_SCORES.at(p_value) * sqrt(empirical_proba * (1 - empirical_proba) / (sequence_length - kmer_size + 1));
if (min_proba <= 0)
return 0;
// “Converts” the probability to a number of k-mers
return min_proba * (sequence_length - kmer_size + 1);
}
#ifndef MATH_HPP
#define MATH_HPP
/**
* Compute the minimal number of k-mers that is not significantly different from nb_occ
* Given that the k-mers used are those used for `affect` on the sequence `sequence`.
* @param kmer_size: size of the kmer
* @param nb_occ: Number of occurrences found on this sequence
* @param sequence_length: The sequence length
* @param p_value: The p_value to be used. The lower the p_value, the lower the value will be returned
* Beware! The p_value must be given as an integer to prevent issues with float equalities.
* Therefore a pvalue of .95 should be given as 95 and a p-value of .999 should be given as 999.
* Not all p-values are possible. Please refer to the keys of Z_SCORES.
* @return The lower bound of the confidence interval assumuing that the center of the confidence
* interval is given by `nb_occ`. In all cases the returned value will be >= 0.
*/
int compute_nb_kmers_limit(int kmer_size, int nb_occ, int sequence_length, int p_value=99);
#endif
......@@ -6,4 +6,4 @@ $ Clone 13 is correctly analyzed
$ Statistics on -Z
1:Statistics on clone analysis
rb1: IGH 1[5-7][0-9]{2}/ 1[0-2][0-9]{3} 15..%
rb1: IGH 3[0-2][0-9]{2}/ 1[0-2][0-9]{3} 28..%
......@@ -626,6 +626,54 @@ void testOriginalBioReaderIsReturned(){
delete f;
}
void testPvalueRoleInFiltration() {
BioReader germline;
// seq+endseq are a substring of a de Bruijn sequence for k=4
// (ie. all k-mers are different)
// http://www.hakank.org/comb/debruijn.cgi?k=4&n=4
string seq = "ACTTAGAGATAGCCAGCGAGCTAGGCAGGGAGGTAGTCAGTGAGTTATA";
string endseq = "TCCATCGATCTATGCATGGATGTATTCATTG";
int nb_seq = 10;
// Add sequences with an increasing number of As
for (int i = 0; i < nb_seq; i++) {
// We add the Ts to have a long enough reference sequence, otherwise it
// would bias the p-value computation
germline.add({"seq"+to_string(i), "seq"+to_string(i), seq.substr(0, 10 + i*4)+"TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "", 0});
}
germline.add({"seq"+to_string(nb_seq), "seq"+to_string(nb_seq), seq+"TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "", 0});
FilterWithACAutomaton filter(germline, "####");
cerr << "First filtering" << endl;
string query = seq + endseq;
// seq have 49 nt, so about 46 k-mers should match.
// With a 90% confidence interval, we will get sequences with ≥40 k-mers
// Thus we'll get
// seq10 (whole sequence)
// seq9 46nt (without T homopolymer) → 42 k-mers
BioReader result = filter.filterBioReaderWithACAutomaton(query, 1, 90);
TAP_TEST_EQUAL(result.size(), 2, TEST_FILTER_BIOREADER_PVALUE, "");
TAP_TEST_EQUAL(result.label(0), "seq10", TEST_FILTER_BIOREADER_PVALUE, "");
TAP_TEST_EQUAL(result.label(1), "seq9", TEST_FILTER_BIOREADER_PVALUE, "");
cerr << "Second filtering" << endl;
// The p-value should give us a threshold around 32 k-mers, which could be reached first for
// sequence seq7 → 4 sequences in total
result = filter.filterBioReaderWithACAutomaton(query, 1, 999);
TAP_TEST_EQUAL(result.size(), 4, TEST_FILTER_BIOREADER_PVALUE, "");
TAP_TEST_EQUAL(result.label(3), "seq7", TEST_FILTER_BIOREADER_PVALUE, "");
// The third sequence with most kmers will be seq8
// which is of length 42nt → 39 k-mers
// With a 90% confidence interval, we will get sequences with ≥33 k-mers
// → from seq7 (38nt and 35 k-mers) to seq10
result = filter.filterBioReaderWithACAutomaton(query, 3, 90);
TAP_TEST_EQUAL(result.size(), 4, TEST_FILTER_BIOREADER_PVALUE, "");
TAP_TEST_EQUAL(result.label(3), "seq7", TEST_FILTER_BIOREADER_PVALUE, "");
}
void testFilter(){
testAutomatonBuilderFilteringBioReader();
testFilterBioReaderWithACAutomaton();
......@@ -634,4 +682,5 @@ void testFilter(){
testExAequoKmersWhenSignificantParameter();
testTransferBioReaderSequences();
testOriginalBioReaderIsReturned();
testPvalueRoleInFiltration();
}
#include <core/math.hpp>
void testComputeNbKmers() {
// When all k-mers match, probability of error is 0, thus we return the same
// number of occurrences as in input
TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 99), 100, TEST_MATH_LIMIT_KMERS,"");
TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 90), 100, TEST_MATH_LIMIT_KMERS,"");
TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 95), 100, TEST_MATH_LIMIT_KMERS,"");
TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 100, 109, 999), 100, TEST_MATH_LIMIT_KMERS,"");
TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 10, 109, 999), 0, TEST_MATH_LIMIT_KMERS,"");
TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 1, 109, 999), 0, TEST_MATH_LIMIT_KMERS,"");
TAP_TEST_EQUAL(compute_nb_kmers_limit(10, 20, 109, 999), 7, TEST_MATH_LIMIT_KMERS,"");
}
void testMath() {
testComputeNbKmers();
}
......@@ -16,6 +16,7 @@
#include "testWindowsStorage.cpp"
#include "testReadStorage.cpp"
#include "testAutomaton.cpp"
#include "testMath.cpp"
int main(void) {
TAP_START(NB_TESTS);
......@@ -36,6 +37,7 @@ int main(void) {
testWindowStorage();
testReadStorage();
testAutomaton();
testMath();
TAP_END_TEST_EXIT
}
......@@ -27,6 +27,7 @@ enum {
/* Filter tests */
TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON,
TEST_FILTER_BIOREADER_PVALUE,
TEST_CREATE_SEQUENCE_LABEL_FULL,
TEST_CREATE_SEQUENCE_LABEL,
......@@ -178,6 +179,9 @@ enum {
TEST_BRS_GET_NB_SCORES,
TEST_BRS_GET_BEST_READS,
/* Math */
TEST_MATH_LIMIT_KMERS,
/* Bugs */
TEST_BUG_SEGMENTATION,
TEST_SEGMENT_POSITION,
......@@ -217,6 +221,7 @@ inline void declare_tests() {
"Check the automaton and the index vector produced while filtering a BioReader");
RECORD_TAP_TEST(TEST_FILTER_BIOREADER_WITH_AC_AUTOMATON,
"Filter a BioReader using the getMultiResults of an automaton");
RECORD_TAP_TEST(TEST_FILTER_BIOREADER_PVALUE, "Filtering BioReader with p-value");
RECORD_TAP_TEST(TEST_CREATE_SEQUENCE_LABEL_FULL, "create_sequence: label_full field");
RECORD_TAP_TEST(TEST_CREATE_SEQUENCE_LABEL, "create_sequence: label field");
......@@ -350,6 +355,7 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE, "Test KmerRepresentativeComputer computations");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ, "Test KmerRepresentativeComputer computations with a required sequence");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REVCOMP, "Test KmerRepresentativeComputer computations on a dataset and its revcomp");
RECORD_TAP_TEST(TEST_MATH_LIMIT_KMERS, "Test computation on limit of number of k-mers");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_QUALITY, "Test KmerRepresentativeComputer quality computations");
RECORD_TAP_TEST(TEST_BUG_SEGMENTATION, "Test segmentation bug");
RECORD_TAP_TEST(TEST_SEGMENT_POSITION, "Test V,D,J position");
......
......@@ -110,7 +110,7 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_MAX_AUDITIONED 2000
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
#define DEFAULT_KMER_THRESHOLD 3
#define DEFAULT_KMER_THRESHOLD 1
#define DEFAULT_EPSILON 0
#define DEFAULT_MINPTS 10
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment