Commit 8ef46f07 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge -- streamline Vidjil handling of command-line parameters

Closes #1673 and #2156.
parents 6cb3041b 026af310
......@@ -14,6 +14,8 @@ void Germline::init(string _code, char _shortcut,
index = 0 ;
this->max_indexing = max_indexing;
this->seed = seed;
if (this->seed.size() == 0)
this->seed = DEFAULT_GERMLINE_SEED;
affect_5 = "V" ;
affect_4 = "" ;
......@@ -103,13 +105,14 @@ Germline::Germline(string _code, char _shortcut,
}
Germline::Germline(string code, char shortcut, string path, json json_recom,
string seed, int max_indexing)
int delta_min, string seed, int max_indexing)
{
int delta_min = -10;
if (delta_min == UNSET_DELTA_MIN) {
delta_min = DEFAULT_DELTA_MIN;
if (json_recom.find("4") != json_recom.end()) {
delta_min = 0;
if (json_recom.find("4") != json_recom.end()) {
delta_min = DEFAULT_DELTA_MIN_D;
}
}
init(code, shortcut, delta_min, seed, max_indexing);
......@@ -257,7 +260,8 @@ void MultiGermline::add_germline(Germline *germline)
germlines.push_back(germline);
}
void MultiGermline::build_from_json(string path, string json_filename_and_filter, int filter, int max_indexing)
void MultiGermline::build_from_json(string path, string json_filename_and_filter, int filter, int default_delta_min,
string default_seed, int default_max_indexing)
{
//extract json_filename and systems_filter
......@@ -297,11 +301,26 @@ void MultiGermline::build_from_json(string path, string json_filename_and_filter
//for each germline
for (json::iterator it = j.begin(); it != j.end(); ++it) {
int delta_min = default_delta_min;
int max_indexing = default_max_indexing;
json recom = it.value()["recombinations"];
char shortcut = it.value()["shortcut"].dump()[1];
json json_value = it.value();
json recom = json_value["recombinations"];
char shortcut = json_value["shortcut"].dump()[1];
string code = it.key();
string seed = it.value()["parameters"]["seed"];
json json_parameters = json_value["parameters"];
string seed = json_parameters["seed"];
if (default_delta_min == UNSET_DELTA_MIN) {
if (json_parameters.count("delta_min") > 0) {
delta_min = json_parameters["delta_min"];
}
}
if (default_max_indexing == 0) {
if (json_parameters.count("max_indexing") > 0) {
max_indexing = json_parameters["max_indexing"];
}
}
if (systems_filter.size())
{
......@@ -329,10 +348,12 @@ void MultiGermline::build_from_json(string path, string json_filename_and_filter
seedMap["12s"] = SEED_S12;
seedMap["10s"] = SEED_S10;
seedMap["9s"] = SEED_9;
seed = (default_seed.size() == 0) ? seedMap[seed] : default_seed;
//for each set of recombination 3/4/5
for (json::iterator it2 = recom.begin(); it2 != recom.end(); ++it2) {
add_germline(new Germline(code, shortcut, path + "/", *it2 , seedMap[seed],
add_germline(new Germline(code, shortcut, path + "/", *it2 , delta_min, seed,
max_indexing));
}
}
......
......@@ -12,6 +12,12 @@
#include "../lib/json.hpp"
#include "kmerstorefactory.hpp"
#include "bioreader.hpp"
#include <climits>
#define DEFAULT_DELTA_MIN -10
#define DEFAULT_DELTA_MIN_D 0
#define UNSET_DELTA_MIN INT_MAX
#define DEFAULT_GERMLINE_SEED SEED_S10
enum SEGMENTATION_METHODS {
SEG_METHOD_53, // Regular or incomplete germlines, 5'-3'
......@@ -70,7 +76,7 @@ class Germline {
int max_indexing=0);
Germline(string _code, char shortcut, string path, json json_recom,
string seed="", int max_indexing=0);
int delta_min=UNSET_DELTA_MIN, string seed="", int max_indexing=0);
~Germline();
......@@ -157,7 +163,8 @@ class MultiGermline {
* filter: see GERMLINES_FILTER
* max_indexing:
*/
void build_from_json(string path, string json_filename_and_filter, int filter, int max_indexing);
void build_from_json(string path, string json_filename_and_filter, int filter, int default_delta_min=UNSET_DELTA_MIN,
string default_seed="", int default_max_indexing=0);
/**
* Finishes the construction of the multi germline so that it can be used
......
......@@ -49,6 +49,8 @@ int seed_weight(const string &seed);
// https://stackoverflow.com/posts/3599170/revisions
#define UNUSED(x) ((void)(x))
#define FIRST_IF_UNCHANGED(first, second, changed) ((changed) ? (second) : (first))
/**
* Return a spaced key from a contiguous key and a seed model
* @param input: contiguous key
......
......@@ -105,11 +105,7 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_K 0
#define DEFAULT_W 50
#define DEFAULT_SEED ""
#define DEFAULT_DELTA_MIN -10
#define DEFAULT_DELTA_MIN_D 0
#define DEFAULT_SEED DEFAULT_GERMLINE_SEED
#define DEFAULT_MAX_AUDITIONED 2000
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
......@@ -242,9 +238,10 @@ void usage(char *progname, bool advanced)
cerr << "Detailed output per read (generally not recommended, large files, but may be used for filtering, as in -uu -X 1000)" << endl
<< " -U output segmented reads (in " << SEGMENTED_FILENAME << " file)" << endl
<< " -u output unsegmented reads (in " << UNSEGMENTED_FILENAME << " file)" << endl
<< " -uu output unsegmented reads, gathered by unsegmentation cause, except for very short and 'too few V/J' reads (in *" << UNSEGMENTED_DETAIL_FILENAME << " files)" << endl
<< " -uuu output unsegmented reads, gathered by unsegmentation cause, all reads (in *" << UNSEGMENTED_DETAIL_FILENAME << " files) (use only for debug)" << endl
<< " -u output unsegmented reads, gathered by unsegmentation cause, except for very short and 'too few V/J' reads (in *" << UNSEGMENTED_DETAIL_FILENAME << " files)" << endl
<< " -uu output unsegmented reads, gathered by unsegmentation cause, all reads (in *" << UNSEGMENTED_DETAIL_FILENAME << " files) (use only for debug)" << endl
<< " -uuu output unsegmented reads, all reads, including a " << UNSEGMENTED_FILENAME << " file (use only for debug)" << endl
<< " -K output detailed k-mer affectation on all reads (in " << AFFECTS_FILENAME << " file) (use only for debug, for example -KX 100)" << endl
<< endl
......@@ -315,8 +312,6 @@ int main (int argc, char **argv)
//$$ options: defaults
string germline_system = "" ;
list <string> f_reps_V ;
list <string> f_reps_D ;
list <string> f_reps_J ;
......@@ -325,13 +320,13 @@ int main (int argc, char **argv)
string read_header_separator = DEFAULT_READ_HEADER_SEPARATOR ;
string f_reads = DEFAULT_READS ;
string seed = DEFAULT_SEED ;
bool seed_changed = false;
string f_basename = "";
string out_dir = DEFAULT_OUT_DIR;
string comp_filename = COMP_FILENAME;
int kmer_size = DEFAULT_K ;
int wmer_size = DEFAULT_W ;
IndexTypes indexType = KMER_INDEX;
......@@ -364,6 +359,9 @@ int main (int argc, char **argv)
int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
int trim_sequences = DEFAULT_TRIM;
bool delta_min_changed = false;
bool trim_sequences_changed = false;
bool output_sequences_by_cluster = false;
bool output_segmented = false;
bool output_unsegmented = false;
......@@ -438,7 +436,6 @@ int main (int argc, char **argv)
case 'V':
f_reps_V.push_back(optarg);
germline_system = "custom" ;
break;
case 'D':
......@@ -448,12 +445,10 @@ int main (int argc, char **argv)
case 'J':
f_reps_J.push_back(optarg);
germline_system = "custom" ;
break;
case 'g':
multi_germline = true;
germline_system = "multi" ;
{
string arg = string(optarg);
struct stat buffer;
......@@ -499,7 +494,7 @@ int main (int argc, char **argv)
case 's':
#ifndef NO_SPACED_SEEDS
seed = string(optarg);
kmer_size = seed_weight(seed);
seed_changed = true;
options_s_k++ ;
#else
cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
......@@ -507,8 +502,11 @@ int main (int argc, char **argv)
break;
case 'k':
kmer_size = atoi(optarg);
seed = seed_contiguous(kmer_size);
{
int kmer_size = atoi(optarg);
seed = seed_contiguous(kmer_size);
seed_changed = true;
}
options_s_k++ ;
break;
......@@ -518,6 +516,7 @@ int main (int argc, char **argv)
case 'm':
delta_min = atoi(optarg);
delta_min_changed = true;
break;
case '!':
......@@ -548,6 +547,7 @@ int main (int argc, char **argv)
case 't':
trim_sequences = atoi(optarg);
trim_sequences_changed = true;
break;
case 'v':
......@@ -639,9 +639,9 @@ int main (int argc, char **argv)
break;
case 'u':
output_unsegmented_detail_full = output_unsegmented_detail; // -uuu
output_unsegmented_detail = output_unsegmented; // -uu
output_unsegmented = true ; // -u
output_unsegmented = output_unsegmented_detail_full ; // -uuu
output_unsegmented_detail_full = output_unsegmented_detail; // -uu
output_unsegmented_detail = true; // -u
break;
case 'U':
output_segmented = true;
......@@ -655,7 +655,7 @@ int main (int argc, char **argv)
//$$ options: post-processing+display
if (!germline_system.size())
if (!multi_germline && (!f_reps_V.size() || !f_reps_J.size()))
{
cerr << ERROR_STRING << "At least one germline must be given with -g or -V/(-D)/-J." << endl ;
exit(1);
......@@ -690,22 +690,9 @@ int main (int argc, char **argv)
// Default seeds
#ifndef NO_SPACED_SEEDS
if (kmer_size == DEFAULT_K)
#ifdef NO_SPACED_SEEDS
if (! seed_changed)
{
if (germline_system.find("TRA") != string::npos)
seed = SEED_S13 ;
else if ((germline_system.find("TRB") != string::npos)
|| (germline_system.find("IGH") != string::npos))
seed = SEED_S12 ;
else // TRD, TRG, IGK, IGL, custom, multi
seed = SEED_S10 ;
kmer_size = seed_weight(seed);
}
#else
{
cerr << ERROR_STRING << "Vidjil was compiled with NO_SPACED_SEEDS: please provide a -k option." << endl;
exit(1) ;
}
......@@ -850,7 +837,10 @@ int main (int argc, char **argv)
for (pair <string, string> path_file: multi_germline_paths_and_files)
{
try {
multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR, trim_sequences);
multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_REGULAR,
FIRST_IF_UNCHANGED(UNSET_DELTA_MIN, delta_min, delta_min_changed),
FIRST_IF_UNCHANGED("", seed, seed_changed),
FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed));
} catch (std::exception& e) {
cerr << ERROR_STRING << "Vidjil cannot properly read " << path_file.first << "/" << path_file.second << ": " << e.what() << endl;
exit(1);
......@@ -861,7 +851,7 @@ int main (int argc, char **argv)
{
// Custom germline
Germline *germline;
germline = new Germline(germline_system, 'X',
germline = new Germline("custom", 'X',
f_reps_V, f_reps_D, f_reps_J,
delta_min, seed, trim_sequences);
......@@ -884,14 +874,14 @@ int main (int argc, char **argv)
}
if (multi_germline_unexpected_recombinations_12) {
Germline *pseudo = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, -10, "", trim_sequences);
Germline *pseudo = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, DEFAULT_DELTA_MIN, "", trim_sequences);
pseudo->seg_method = SEG_METHOD_MAX12 ;
pseudo->index = multigermline->index ;
multigermline->germlines.push_back(pseudo);
}
if (multi_germline_unexpected_recombinations_1U) {
Germline *pseudo_u = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, -10, "", trim_sequences);
Germline *pseudo_u = new Germline(PSEUDO_UNEXPECTED, PSEUDO_UNEXPECTED_CODE, DEFAULT_DELTA_MIN, "", trim_sequences);
pseudo_u->seg_method = SEG_METHOD_MAX1U ;
// TODO: there should be more up/downstream regions for the PSEUDO_UNEXPECTED germline. And/or smaller seeds ?
pseudo_u->index = multigermline->index ;
......@@ -901,7 +891,10 @@ int main (int argc, char **argv)
// Should come after the initialization of regular (and possibly pseudo) germlines
{
for (pair <string, string> path_file: multi_germline_paths_and_files)
multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_INCOMPLETE, trim_sequences);
multigermline->build_from_json(path_file.first, path_file.second, GERMLINES_INCOMPLETE,
FIRST_IF_UNCHANGED(UNSET_DELTA_MIN, delta_min, delta_min_changed),
FIRST_IF_UNCHANGED("", seed, seed_changed),
FIRST_IF_UNCHANGED(0, trim_sequences, trim_sequences_changed));
if ((! multigermline->one_index_per_germline) && (command != CMD_GERMLINES)) {
multigermline->insert_in_one_index(multigermline->index, true);
}
......@@ -979,6 +972,8 @@ int main (int argc, char **argv)
int total_length = 0 ;
int s = index->getS();
int kmer_size = seed_weight(seed);
while (reads->hasNext())
{
reads->next();
......
......@@ -593,7 +593,7 @@ See [[http://git.vidjil.org/blob/master/doc/browser.org][browser.org]] for infor
** Filtering reads
It is possible to extract all segmented or unsegmented reads, possibly to give them to other software.
Runing Vidjil with =-U= gives a file =out/basename.unsegmented.vdj.fa=, with all segmented reads.
Runing Vidjil with =-U= gives a file =out/basename.segmented.vdj.fa=, with all segmented reads.
On datasets generated with rather specific V(D)J primers, this is generally not recommended, as it may generate a large file.
However, the =-U= option is very useful for whole RNA-Seq or capture datasets that contain few reads with V(D)J recombinations.
......@@ -603,7 +603,7 @@ Similarly, options are available to get the unsegmented reads:
it may be interesting to further study RNA-Seq datasets.
- =-uu= gives the same set of files, including *all* unsegmented reads (including =UNSEG too short= and =UNSEG too few V/J=),
and =-uuu= further outputs all these reads in a file =out/basename.segmented.vdj.fa=.
and =-uuu= further outputs all these reads in a file =out/basename.unsegmented.vdj.fa=.
Again, as these options may generate large files, they are generally not recommended.
However, they are very useful in some situations, especially to understand why some dataset gives poor segmentation result.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment