Commit 5a25394d authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/3293-seeds' into 'dev'

Feature a/3293 seeds

Closes #3293

See merge request !226
parents 5d2c0f4d d289f5b4
Pipeline #32474 passed with stages
in 53 seconds
......@@ -13,9 +13,8 @@ void Germline::init(string _code, char _shortcut,
shortcut = _shortcut ;
index = 0 ;
this->max_indexing = max_indexing;
this->seed = seed;
if (this->seed.size() == 0)
this->seed = DEFAULT_GERMLINE_SEED;
this->seed = expand_seed(seed);
affect_5 = "V" ;
affect_4 = "" ;
......@@ -24,7 +23,7 @@ void Germline::init(string _code, char _shortcut,
affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
affect_4 = string(1, 14 + shortcut) + "-" + code + "D";
affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
filter_5 = build_automaton ? new FilterWithACAutomaton(rep_5, seed) : nullptr;
filter_5 = build_automaton ? new FilterWithACAutomaton(rep_5, this->seed) : nullptr;
}
......@@ -167,7 +166,7 @@ void Germline::finish() {
void Germline::new_index(IndexTypes type)
{
assert(! seed.empty());
assert(! seed.empty() && (seed.find(SEED_YES) != std::string::npos));
bool rc = true ;
index = KmerStoreFactory<KmerAffect>::createIndex(type, seed, rc);
......@@ -353,13 +352,7 @@ void MultiGermline::build_from_json(string path, string json_filename_and_filter
break ;
}
map<string, string> seedMap;
seedMap["13s"] = SEED_S13;
seedMap["12s"] = SEED_S12;
seedMap["10s"] = SEED_S10;
seedMap["9s"] = SEED_9;
seed = (default_seed.size() == 0) ? seedMap[seed] : default_seed;
seed = (default_seed.size() == 0) ? seed : default_seed;
//for each set of recombination 3/4/5
for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
......@@ -390,7 +383,7 @@ void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set
void MultiGermline::build_with_one_index(string seed, bool set_index)
{
bool rc = true ;
index = KmerStoreFactory<KmerAffect>::createIndex(indexType, seed, rc);
index = KmerStoreFactory<KmerAffect>::createIndex(indexType, expand_seed(seed), rc);
index->refs = 1;
insert_in_one_index(index, set_index);
}
......
......@@ -15,8 +15,6 @@
#include "filter.h"
#include <climits>
#define DEFAULT_GERMLINE_SEED SEED_S10
enum SEGMENTATION_METHODS {
SEG_METHOD_53, // Regular or incomplete germlines, 5'-3'
SEG_METHOD_543, // Regular or incomplete germlines, 5'-3', with an additional middle gene (such a D gene)
......
......@@ -19,6 +19,30 @@ int seed_weight(const string &seed)
return count(seed.begin(), seed.end(), SEED_YES);
}
map<string, string> seedMap = {
{"9c", "#########"},
{"10s", "#####-#####"},
{"12s", "######-######"},
{"13s", "#######-######"}
};
string expand_seed(const string &seed)
{
if (seed.size() == 0)
return expand_seed(DEFAULT_SEED);
if (seed.find(SEED_YES) == std::string::npos)
{
if (seedMap.find(seed) == seedMap.end())
throw invalid_argument("Unknown seed: " + seed);
else
return seedMap[seed];
}
return seed ;
}
char spaced_buf[MAX_SEED_SIZE+1];
string spaced(const string &input, const string &seed) {
......@@ -67,6 +91,14 @@ string scientific_string_of_double(double number)
return ss.str();
}
string string_of_map(map <string, string> m, const string &before)
{
stringstream ss;
for (auto x: m)
ss << before << x.first << ":" << x.second;
return ss.str();
}
bool is_extended_nucleotide(char nuc) {
switch(nuc) {
......
......@@ -43,16 +43,14 @@ using namespace std;
#define PRINT_VAR(v) cerr << #v << " = " << v << endl
#define NB_N_CHOOSE_K_STORED 500
#define SEED_YES '#'
// Common seeds
#define SEED_9 "#########"
#define SEED_S10 "#####-#####"
#define SEED_S12 "######-######"
#define SEED_S13 "#######-######"
#define NB_N_CHOOSE_K_STORED 500
#define DEFAULT_SEED "10s"
extern map<string, string> seedMap;
string expand_seed(const string &seed);
string seed_contiguous(int k);
......@@ -112,6 +110,7 @@ bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b);
string string_of_int(int number);
string fixed_string_of_float(float number, int precision);
string scientific_string_of_double(double number);
string string_of_map(map <string, string> m, const string &before);
/**
* @param nuc is A, C, G, T or any extended nucleotide (or lowercase)
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -KA -z 0 -s "#####-#####" -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa -D $VIDJIL_DIR/germline/homo-sapiens/IGHD.fa $VIDJIL_DATA/common-V-D.fa ; cat out/common-V-D.affects
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -KA -z 0 -s 10s -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa -D $VIDJIL_DIR/germline/homo-sapiens/IGHD.fa $VIDJIL_DATA/common-V-D.fa ; cat out/common-V-D.affects
$ Segments the sequence
1: SEG .* -> .* 1
......
......@@ -8,6 +8,10 @@ $ Check default costs
1:segmenter .* "4, -6, -10, -1, -10"
1:clustering .* "1, -4, -4, 0, 0"
$ Show seeds
1: 9c.#########
1: 13s.#######-######
$ Display advanced options
: , experimental options
: custom Cost
......
......@@ -106,7 +106,6 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_K 0
#define DEFAULT_W 50
#define DEFAULT_SEED DEFAULT_GERMLINE_SEED
#define DEFAULT_MAX_AUDITIONED 2000
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
......@@ -354,8 +353,10 @@ int main (int argc, char **argv)
seed_changed = true;
return true;
},
"spaced seeds used for the V/J affectation (default: depends on germline)")
-> group(group) -> level();
"seed, possibly spaced, used for the V/J affectation (default: depends on germline), given either explicitely, either by an alias"
"\n " + string_of_map(seedMap, " ")
)
-> group(group) -> level() -> set_type_name("SEED=" DEFAULT_SEED);
// ----------------------------------------------------------------------------------------------------------------------
......
......@@ -107,7 +107,7 @@
"3": ["TRDD3+down.fa"]
} ],
"parameters": {
"seed": "9s"
"seed": "9c"
}
},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment