Commit eb16fce9 authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/920-automaton-filter' into 'dev'

optimize FineSegmenter, filtering BioReader with an Aho-Corasick automaton

Closes #920 and #3271

See merge request !199
parents a4a21e32 83554466
Pipeline #28217 passed with stages
in 48 seconds
......@@ -8,7 +8,7 @@
#include <queue>
#include <utility>
#include "tools.h"
#include <set>
#include <map>
using namespace std;
......@@ -80,6 +80,18 @@ public:
*/
virtual void *next(void *state, char c) = 0;
/**
* This function returns the number of times every Info appears in the
* given sequence.
* It returns a map containing the number of occurences per Info.
* @param seq: The sequence to be queried. It is passed through
* the automaton to identify matching k-mers and extract
* the corresponding Info.
* @param false: unused.
* @param seed: unused.
*/
virtual map<Info,int> getMultiResults
(const seqtype &seq, bool no_revcomp=false, string seed = "") = 0;
};
#define DNA_ALPHABET_SIZE 4
......@@ -111,32 +123,38 @@ public:
template <class Info>
class PointerACAutomaton: public AbstractACAutomaton<Info> {
private:
bool multiple_info;
void free_automaton(pointer_state<Info> *);
void init(string seed, bool revcomp);
void init(string seed, bool revcomp, bool multiple_info);
public:
using IKmerStore<Info>::insert;
/**
* @param revcomp: should the revcomp of the sequences also be indexed
* @param multiple_info: should all the Info be stored in the automaton or
* only a single value summarizing them all.
*
* The default seed will be a contiguous seed of 10 letters. But the seed
* can be specified when inserting sequences. This should be the preferred
* choice as one may want to have different seeds depending on the
* sequences.
*/
PointerACAutomaton(bool revcomp=false);
PointerACAutomaton(bool revcomp=false, bool multiple_info=false);
/**
* @param seed: the seed to be used for indexing
* @param revcomp: indexing revcomp too ?
* @param multiple_info: storing all info?
*/
PointerACAutomaton(string seed, bool revcomp=false);
PointerACAutomaton(string seed, bool revcomp=false, bool multiple_info=false);
/**
* @param k: the size of the contiguous seed
* @param revcomp: indexing revcomp too ?
* @param multiple_info: storing all info?
*/
PointerACAutomaton(int k, bool revcomp=false);
PointerACAutomaton(int k, bool revcomp=false, bool multiple_info=false);
~PointerACAutomaton();
......@@ -183,6 +201,8 @@ public:
// From IKmerStore
vector<Info> getResults(const seqtype &seq, bool no_revcomp=false, string seed = "");
map<Info,int> getMultiResults(const seqtype &seq, bool no_revcomp=false, string seed = "");
Info& get(seqtype &word) ;
Info& operator[](seqtype& word);
......
......@@ -3,7 +3,7 @@
#include "automaton.h"
#include <stack>
#include <set>
//////////////////// IMPLEMENTATIONS ////////////////////
template <class Info>
......@@ -58,22 +58,22 @@ void *AbstractACAutomaton<Info>::goto_state(const string &seq, void *starting_st
///////////////////////
template <class Info>
PointerACAutomaton<Info>::PointerACAutomaton(bool revcomp):AbstractACAutomaton<Info>(){
init("##########",revcomp);
PointerACAutomaton<Info>::PointerACAutomaton(bool revcomp, bool multiple_info):AbstractACAutomaton<Info>(){
init("##########",revcomp, multiple_info);
}
template <class Info>
PointerACAutomaton<Info>::PointerACAutomaton(string seed, bool revcomp):AbstractACAutomaton<Info>() {
init(seed, revcomp);
PointerACAutomaton<Info>::PointerACAutomaton(string seed, bool revcomp, bool multiple_info):AbstractACAutomaton<Info>() {
init(seed, revcomp, multiple_info);
}
template <class Info>
PointerACAutomaton<Info>::PointerACAutomaton(int k, bool revcomp):AbstractACAutomaton<Info>() {
init(seed_contiguous(k), revcomp);
PointerACAutomaton<Info>::PointerACAutomaton(int k, bool revcomp, bool multiple_info):AbstractACAutomaton<Info>() {
init(seed_contiguous(k), revcomp, multiple_info);
}
template <class Info>
void PointerACAutomaton<Info>::init(string seed, bool revcomp) {
void PointerACAutomaton<Info>::init(string seed, bool revcomp, bool multiple_info) {
if (revcomp && Info::hasRevcompSymetry()) {
cerr << "PointerACAutomaton cannot deal with revcomp symmetry at the moment."
<< endl;
......@@ -86,6 +86,7 @@ void PointerACAutomaton<Info>::init(string seed, bool revcomp) {
this->s = seed.length();
this->revcomp_indexed = revcomp;
this->max_size_indexing = 0;
this->multiple_info = multiple_info;
}
template <class Info>
......@@ -182,12 +183,16 @@ void PointerACAutomaton<Info>::insert(const seqtype &seq, Info info) {
}
}
state->is_final = true;
assert(info.getLength() <= MAX_KMER_SIZE);
if (state->informations.front().isNull()) {
this->nb_kmers_inserted++;
this->kmers_inserted[info]++;
state->informations.front() += info;
} else {
if (this->multiple_info)
state->informations.push_back(info);
else
state->informations.front() += info;
}
state->informations.front() += info;
}
template <class Info>
......@@ -270,6 +275,32 @@ vector<Info> PointerACAutomaton<Info>::getResults(const seqtype &seq, bool no_re
return result;
}
template <class Info>
map<Info, int> PointerACAutomaton<Info>::getMultiResults(const seqtype &seq, bool no_revcomp, string seed) {
UNUSED(no_revcomp);
UNUSED(seed);
pointer_state<Info>* current_state = getInitialState();
size_t seq_len = seq.length();
map<Info, int> results;
for(size_t i = 0;i < seq_len;++i) {
current_state = (pointer_state<Info> *)next(current_state, seq[i]);
set<Info> informations(current_state->informations.begin(),
current_state->informations.end());
for(auto const& info : informations){
/* If map contain info, increase its occurence. */
if(results.count(info) > 0){
results[info] = results[info] + 1;
}
/* Otherwise add info into map with a value of 1. */
else{
results.insert(pair<Info,int>(info,1));
}
}
}
return results;
}
template <class Info>
Info& PointerACAutomaton<Info>::get(seqtype &word) {
pointer_state<Info> *state = (pointer_state<Info> *)this->goto_state(word);
......
#include "filter.h"
pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>* buildACAutomatonToFilterBioReader
(BioReader &origin, string seed){
pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>* result;
vector<int>* indexes;
PointerACAutomaton<KmerAffect>* aho;
char asciiChar;
int asciiNumber;
string currentLabel;
string previousLabel;
if(origin.size() < 1){
return nullptr;
}
result = new pair<vector<int>*,AbstractACAutomaton<KmerAffect>*>();
aho = new PointerACAutomaton<KmerAffect>(seed, false, true);
indexes = new vector<int>();
aho->insert(origin.sequence(0),std::string("") + char(1), true, 0, seed);
asciiNumber = 1;
indexes->push_back(0);
previousLabel = extractGeneName(origin.label(0));
int i;
for(i = 1;i < origin.size(); ++i){
currentLabel = extractGeneName(origin.label(i));
if(currentLabel != previousLabel){
indexes->push_back(i);
asciiNumber++;
}
if(asciiNumber > 127){
delete result; delete aho; delete indexes;
return nullptr;
}
asciiChar = char(asciiNumber);
aho->insert(origin.sequence(i),std::string("") + asciiChar, true, 0, seed);
previousLabel = currentLabel;
}
indexes->push_back(origin.size());
aho->build_failure_functions();
result->first = indexes;
result->second = aho;
return result;
}
/*
Takes a built automaton and a vector of indexes and build a BioReader
based on it.
*/
BioReader filterBioReaderWithACAutomaton(
pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>* idxAho,
BioReader &origin, seqtype &seq,
int kmer_threshold){
BioReader result;
AbstractACAutomaton<KmerAffect>* aho;
vector<int>* indexes;
map<KmerAffect, int> mapAho;
KmerAffect tmpKmer;
unsigned int asciiNum;
char asciiChar;
if(!idxAho || kmer_threshold < 0){
return origin;
}
indexes = idxAho->first;
aho = idxAho->second;
mapAho = aho->getMultiResults(seq);
//All k-mers selected : iterate over all map
if(kmer_threshold == ALL_KMERS_VALUE || kmer_threshold > (int)mapAho.size()){
for(auto const mx: mapAho){
tmpKmer = mx.first;
asciiChar = tmpKmer.getLabel().at(0);
asciiNum = int(asciiChar);
if(asciiNum > indexes->size() - 1){
break;
}
for(int i = indexes->at(asciiNum - 1); i < indexes->at(asciiNum); ++i){
result.add(origin.read(i));
}
}
/* The most significant k-mers selected : iterate over a portion of the
sorted map */
}else{
/* sort map */
typedef function<bool(pair<KmerAffect, int>, pair<KmerAffect, int>)> Comparator;
Comparator compFunctor = [](pair<KmerAffect, int> elem1 ,pair<KmerAffect, int> elem2){
return (elem1.second == elem2.second) ? elem1.first > elem2.first : elem1.second > elem2.second;
};
// Use a set to use the comparator and sort function
set<pair<KmerAffect, int>, Comparator> setOfWords(mapAho.begin(), mapAho.end(), compFunctor);
set<pair<KmerAffect, int>, Comparator>::iterator setIt = setOfWords.begin();
// Iterate over the pair and not the map
int nbKmers = 0;
for(pair<KmerAffect, int> element : setOfWords){
// Add corresponding sequences to the BioReader
if(nbKmers < kmer_threshold){
nbKmers++;
/* Check if next K-mer has same occurence */
if(nbKmers == kmer_threshold){
std::advance(setIt, nbKmers);
pair<KmerAffect, int> pNext = *setIt;
int nextKmerOccurs = pNext.second;
if(nextKmerOccurs == element.second){
nbKmers--;
}
}
tmpKmer = element.first;
asciiChar = tmpKmer.getLabel().at(0);
asciiNum = int(asciiChar);
if(asciiNum > indexes->size() - 1){
break;
}
for(int i = indexes->at(asciiNum - 1); i < indexes->at(asciiNum); ++i){
result.add(origin.read(i));
}
}
else{
/* Enough K-mers used for filtering, no need to go further */
break;
}
}
}
return (result.size() == 0) ? origin : result;
}
#ifndef FILTER_H
#define FILTER_H
#include <climits> //for UINT_MAX in filterBioReader
#include "bioreader.hpp"
#include "automaton.hpp"
/*
This number is used in filterBioReaderWithACAutomaton to define
the minimum number of occurence that genes must have to be in the
filtered BioReader.
*/
#define BIOREADER_MIN 3
/*
The number of required genes that must be greater or equal to BIOREADER_MIN
to filter a BioReader.
*/
#define REQUIRED_GOOD_GENES 1
/*
This function will filter a BioReader
@param idxAho: A pointer to a pair containing an int vector pointer and
an AbstractACAutomaton pointer parametrized by KmerAffect.
The int vector represents indexes of a BioReader and the
automaton is build with single char labels put in KmerAffect.
To know more about them, read doc of
buildACAutomatonToFilterBioReader function.
@param origin : The BioReader object we want to filter.
@param seq : The sequence that will be aligned against the genes.
@param kmer_threshold : The threshold to K-mers used during the filtering.
Since it's an optional arguments, if not specified it will
filter on every K-mers returned by getMultiResults. Otherwise
it will filter on the "kmer_threshold" number of K-mers. For
Example if kmer_threshold = 10, it will filter on the 10 most
significant K-mers returned by getMultiResults
*/
BioReader filterBioReaderWithACAutomaton(
pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>* idxAho,
BioReader &origin, seqtype &seq,
int kmer_threshold = NO_LIMIT_VALUE);
/*
This function takes a BioReader as a parameter and returns
a couple containing an int vector pointer and an automaton
object pointer specifying the automaton used..
For now the automaton used is Aho-Corasick but to prevent future
errors the returned type is an AbstractACAutomaton. The index
vector contains the indexes of the genes families.
For example if the BioReader has the following genes:
IGHV-01*01 (index 0) (New Family !)
IGHV-02*01 (index 1) (New Family !)
IGHV-02*02 (index 2)
IGHV-03*01 (index 3) (New Family !)
IGHV-03*02 (index 4)
IGHV-03*03 (index 5)
IGHV-03*04 (index 6)
IGHV-04*04 (index 7) (New Family !)
IGHV-05*01 (index 8) (New Family !)
IGHV-05*02 (index 9)
The following vector<int> is returned :
[0, 1, 3, 7, 8, 9]
Note : The first case always contains the number '0' and the last one
contains the number of genes in the BioReader (minus 1).
Regarding the automaton, it is built using KmerAffect. We take these
Kmer because they can handle informations on a character. We set a
different character to a Kmer for each group of genes.
For example we will build the automaton using the KmerAffect like this:
KmerAffect_0's label = 'a' (New Family !)
KmerAffect_1's label = 'b' (New Family !)
KmerAffect_2's label = 'b'
KmerAffect_3's label = 'c' (New Family !)
KmerAffect_4's label = 'c'
KmerAffect_5's label = 'c'
KmerAffect_6's label = 'c'
KmerAffect_7's label = 'd' (New Family !)
KmerAffect_8's label = 'e' (New Family !)
There is no KmerAffect_9's because as said previously, the last number
in the vector<index> represent the total number of genes, and not a
family itself.
In the previous example KmerAffectX's label start from 'a' (n°97 in ascii
chart), but in reality to store more informations, the label start from
the ascii character NUL (n°0 in ascii chart) and increase for each new
family of genes met.
Note : There is no ascii character for '_', '?' and '*' since they
respectively represent an "AFFECT_UNKNOWN_SYMBOL", an "AFFECT_AMBIGUOUS_SYMPBOL"
and an "AFFECT_NOT_UNKNOWN_SYMBOL".
The param "seed" is used while inserting sequences in the automaton. By default
the seed has a size of 10.
*/
pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>*
buildACAutomatonToFilterBioReader(BioReader &origin, string seed);
#endif
......@@ -6,7 +6,7 @@
void Germline::init(string _code, char _shortcut,
string seed,
int max_indexing)
int max_indexing, bool build_automaton)
{
seg_method = SEG_METHOD_53 ;
code = _code ;
......@@ -24,21 +24,21 @@ void Germline::init(string _code, char _shortcut,
affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
automaton_5 = build_automaton ? buildACAutomatonToFilterBioReader(rep_5, seed) : nullptr;
}
Germline::Germline(string _code, char _shortcut,
string seed, int max_indexing)
string seed, int max_indexing, bool build_automaton)
{
init(_code, _shortcut, seed, max_indexing);
init(_code, _shortcut, seed, max_indexing, build_automaton);
}
Germline::Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed, int max_indexing)
string f_rep_5, string f_rep_4, string f_rep_3,
string seed, int max_indexing, bool build_automaton)
{
init(_code, _shortcut, seed, max_indexing);
f_reps_5.push_back(f_rep_5);
f_reps_4.push_back(f_rep_4);
......@@ -49,16 +49,17 @@ Germline::Germline(string _code, char _shortcut,
rep_4 = BioReader(f_rep_4, 2, "|");
rep_3 = BioReader(f_rep_3, 2, "|");
init(_code, _shortcut, seed, max_indexing, build_automaton);
if (rep_4.size())
seg_method = SEG_METHOD_543 ;
}
Germline::Germline(string _code, char _shortcut,
list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
string seed, int max_indexing)
list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
string seed, int max_indexing, bool build_automaton)
{
init(_code, _shortcut, seed, max_indexing);
f_reps_5 = _f_reps_5 ;
f_reps_4 = _f_reps_4 ;
......@@ -72,7 +73,9 @@ Germline::Germline(string _code, char _shortcut,
for (list<string>::const_iterator it = f_reps_5.begin(); it != f_reps_5.end(); ++it)
rep_5.add(*it);
init(_code, _shortcut, seed, max_indexing, build_automaton);
for (list<string>::const_iterator it = f_reps_4.begin(); it != f_reps_4.end(); ++it)
rep_4.add(*it);
......@@ -86,22 +89,21 @@ Germline::Germline(string _code, char _shortcut,
Germline::Germline(string _code, char _shortcut,
BioReader _rep_5, BioReader _rep_4, BioReader _rep_3,
string seed, int max_indexing)
string seed, int max_indexing, bool build_automaton)
{
init(_code, _shortcut, seed, max_indexing);
rep_5 = _rep_5 ;
rep_4 = _rep_4 ;
rep_3 = _rep_3 ;
init(_code, _shortcut, seed, max_indexing, build_automaton);
if (rep_4.size())
seg_method = SEG_METHOD_543 ;
}
Germline::Germline(string code, char shortcut, string path, json json_recom,
string seed, int max_indexing)
string seed, int max_indexing, bool build_automaton)
{
init(code, shortcut, seed, max_indexing);
bool regular = (code.find("+") == string::npos);
......@@ -116,7 +118,9 @@ Germline::Germline(string code, char shortcut, string path, json json_recom,
f_reps_5.push_back(path + filename);
rep_5.add(path + filename);
}
init(code, shortcut, seed, max_indexing, build_automaton);
if (json_recom.find("4") != json_recom.end()) {
for (json::iterator it = json_recom["4"].begin();
it != json_recom["4"].end(); ++it)
......@@ -152,6 +156,10 @@ Germline::Germline(string code, char shortcut, string path, json json_recom,
}
}
int Germline::getMaxIndexing(){
return this->max_indexing;
}
void Germline::finish() {
if (index)
index->finish_building();
......@@ -202,6 +210,15 @@ void Germline::override_rep5_rep3_from_labels(KmerAffect left, KmerAffect right)
Germline::~Germline()
{
if(automaton_5){
if(automaton_5->first){
delete automaton_5->first;
}
if(automaton_5->second){
delete automaton_5->second;
}
delete automaton_5;
}
if (index)
{
if (--(index->refs) == 0)
......@@ -262,7 +279,7 @@ void MultiGermline::add_germline(Germline *germline)
}
void MultiGermline::build_from_json(string path, string json_filename_and_filter, int filter,
string default_seed, int default_max_indexing)
string default_seed, int default_max_indexing, bool build_automaton)
{
//extract json_filename and systems_filter
......@@ -345,14 +362,14 @@ void MultiGermline::build_from_json(string path, string json_filename_and_filter
seedMap["9s"] = SEED_9;
seed = (default_seed.size() == 0) ? seedMap[seed] : default_seed;
//for each set of recombination 3/4/5
for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
add_germline(new Germline(code, shortcut, path + "/", *it2,
seed, max_indexing));
seed, max_indexing, build_automaton));
}
}
}
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
......
......@@ -12,6 +12,7 @@
#include "../lib/json.hpp"
#include "kmerstorefactory.hpp"
#include "bioreader.hpp"
#include "filter.h"
#include <climits>
#define DEFAULT_GERMLINE_SEED SEED_S10
......@@ -43,7 +44,7 @@ class Germline {
int max_indexing;
void init(string _code, char _shortcut,
string seed, int max_indexing);
string seed, int max_indexing, bool build_automaton=false);
public:
/*
......@@ -52,24 +53,25 @@ class Germline {
Germline(string _code, char _shortcut,
list <string> f_rep_5, list <string> f_rep_4, list <string> f_rep_3,
string seed="", int max_indexing=0);
string seed="", int max_indexing=0, bool build_automaton=false);
Germline(string _code, char _shortcut,
Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed="", int max_indexing=0);
string seed="", int max_indexing=0, bool build_automaton=false);
Germline(string _code, char _shortcut,
Germline(string _code, char _shortcut,
BioReader _rep_5, BioReader _rep_4, BioReader _rep_3,
string seed="", int max_indexing=0);
string seed="", int max_indexing=0, bool build_automaton=false);
Germline(string _code, char _shortcut,
string seed="", int max_indexing=0);
string seed="", int max_indexing=0, bool build_automaton=false);
Germline(string _code, char shortcut, string path, json json_recom,
string seed="", int max_indexing=0);
string seed="", int max_indexing=0, bool build_automaton=false);
~Germline();
pair<vector<int>*, AbstractACAutomaton<KmerAffect>*>* automaton_5;
int seg_method ;
string code ;
char shortcut ;
......@@ -83,7 +85,9 @@ class Germline {
* Finishes the construction of the germline so that it can be used
*/
void finish();
/* Return the max indexing of a germline */
int getMaxIndexing();
void new_index(IndexTypes type);
void set_index(IKmerStore<KmerAffect> *index);
......@@ -152,7 +156,7 @@ class MultiGermline {
* max_indexing:
*/
void build_from_json(string path, string json_filename_and_filter, int filter,
string default_seed="", int default_max_indexing=0);
string default_seed="", int default_max_indexing=0, bool build_automaton=false);
/**
* Finishes the construction of the multi germline so that it can be used
......
......@@ -60,12 +60,8 @@ bool operator!=(const affect_t &a1, const affect_t &a2) {
}
string toString(const affect_t &a) {
string result;
if((a == AFFECT_UNKNOWN.affect) || (a == AFFECT_AMBIGUOUS.affect))
result = " ";
else
result = (affect_strand(a)==1 ? "+" : "-");
result += string(1,affect_char(a));
result = toStringSigns(a);
result += toStringValues(a);
return result;
}
......
......@@ -180,7 +180,7 @@ ostream &operator<<(ostream &os, const KmerAffect &kmer);
* Constant defining any not-unknown affectation
* Could be used by .getIndexLoad(), but now any non-AFFECT_UNKNOWN kmer will work.
*/
const KmerAffect AFFECT_NOT_UNKNOWN = KmerAffect(AFFECT_NOT_UNKNOWN_SYMBOL, 0, 1);
const KmerAffect AFFECT_NOT_UNKNOWN = KmerAffect(AFFECT_NOT_UNKNOWN_SYMBOL, 0, -1);
/**
* Constant defining the unknown affectation (not known yet)
......@@ -190,7 +190,7 @@ const KmerAffect AFFECT_UNKNOWN = KmerAffect(AFFECT_UNKNOWN_SYMBOL, 0, 1);
/**
* Constant defining the ambiguous affectation (many possibilities)
*/
const KmerAffect AFFECT_AMBIGUOUS = KmerAffect(AFFECT_AMBIGUOUS_SYMBOL, 1, 1);
const KmerAffect AFFECT_AMBIGUOUS = KmerAffect(AFFECT_AMBIGUOUS_SYMBOL, 1, -1);
const KmerAffect AFFECT_V = KmerAffect("V", 1);
const KmerAffect AFFECT_J = KmerAffect("J", 1);
......
......@@ -42,6 +42,10 @@ Kmer &Kmer::operator+=(const Kmer &kmer) {
return *this;
}
string Kmer::getLabel() const{
return "";
}
size_t Kmer::getLength() const{
return 10;
}
......
......@@ -43,6 +43,8 @@ public:
* @return true if the element is the same as when initialised with default constructor.
*/
bool isNull() const;
string getLabel() const;
/**
* @return true iff the kmer is unknown (which doesn't make sense here but
......
......@@ -29,7 +29,10 @@
#include <cstring>
#include <string>
#include "windowExtractor.h"
#include <set>
#include "automaton.h"
#include <map>
#include <string>
#define NO_FORBIDDEN_ID (-1)
AlignBox::AlignBox(string _key, string _color) {
......@@ -947,7 +950,7 @@ string format_del(int deletions)
return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c, double threshold, double multiplier)
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c, double threshold, double multiplier, int kmer_threshold)
{
box_V = new AlignBox("5");
box_D = new AlignBox("4");
......@@ -1046,12 +1049,16 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
/* Regular 53 Segmentation */
align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
if(kmer_threshold != NO_LIMIT_VALUE){
BioReader filtered = filterBioReaderWithACAutomaton(germline->automaton_5, germline->rep_5, sequence_or_rc, kmer_threshold);
align_against_collection(sequence_or_rc, filtered, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
box_V, segment_cost);
}else{
align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
box_V, segment_cost);
}
align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID, reverse_J, !reverse_J, false,
box_J, segment_cost);
// J was run with '!reverseJ', we copy the box informations from right to left
// Should this directly be handled in align_against_collection() ?
box_J->start = box_J->end ;
......
......@@ -12,6 +12,7 @@
#include "kmeraffect.h"
#include "affectanalyser.h"
#include "../lib/json.hpp"
#include "filter.h"