Commit c6878803 authored by Mikaël Salson's avatar Mikaël Salson Committed by Mathieu Giraud
Browse files

vidjil: add -t option

This option is passed to the germline, which in turn passes it to the kmerstore (in the insert method).
It is used to consider only one end of a V/J gene.
parent a399961f
...@@ -116,6 +116,8 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ; ...@@ -116,6 +116,8 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_CLUSTER_COST Cluster #define DEFAULT_CLUSTER_COST Cluster
#define DEFAULT_SEGMENT_COST VDJ #define DEFAULT_SEGMENT_COST VDJ
#define DEFAULT_TRIM 100
// error // error
#define ERROR_STRING "[error] " #define ERROR_STRING "[error] "
...@@ -180,6 +182,7 @@ void usage(char *progname, bool advanced) ...@@ -180,6 +182,7 @@ void usage(char *progname, bool advanced)
<< " -M <int> maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -D: " << DEFAULT_DELTA_MAX_D << ")" << endl << " -M <int> maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -D: " << DEFAULT_DELTA_MAX_D << ")" << endl
<< " -w <int> w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")" << endl << " -w <int> w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")" << endl
<< " -e <float> maximal e-value for determining if a segmentation can be trusted (default: " << THRESHOLD_NB_EXPECTED << ")" << endl << " -e <float> maximal e-value for determining if a segmentation can be trusted (default: " << THRESHOLD_NB_EXPECTED << ")" << endl
<< " -t <int> trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt (default: " << DEFAULT_TRIM << ")" << endl
<< endl << endl
<< "Labeled windows (these windows will be kept even if -r/-% thresholds are not reached)" << endl << "Labeled windows (these windows will be kept even if -r/-% thresholds are not reached)" << endl
...@@ -316,6 +319,7 @@ int main (int argc, char **argv) ...@@ -316,6 +319,7 @@ int main (int argc, char **argv)
// Admissible delta between left and right segmentation points // Admissible delta between left and right segmentation points
int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
int delta_max = DEFAULT_DELTA_MAX ; // Fine int delta_max = DEFAULT_DELTA_MAX ; // Fine
int trim_sequences = DEFAULT_TRIM;
bool output_sequences_by_cluster = false; bool output_sequences_by_cluster = false;
bool output_segmented = false; bool output_segmented = false;
...@@ -348,7 +352,7 @@ int main (int argc, char **argv) ...@@ -348,7 +352,7 @@ int main (int argc, char **argv)
//$$ options: getopt //$$ options: getopt
while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:M:N:s:b:Sn:o:L%:y:z:uUK3E:")) != EOF) while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:M:N:s:b:Sn:o:L%:y:z:uUK3E:t:")) != EOF)
switch (c) switch (c)
{ {
...@@ -485,6 +489,10 @@ int main (int argc, char **argv) ...@@ -485,6 +489,10 @@ int main (int argc, char **argv)
output_sequences_by_cluster = true; output_sequences_by_cluster = true;
break; break;
case 't':
trim_sequences = atoi(optarg);
break;
case 'v': case 'v':
verbose += 1 ; verbose += 1 ;
break; break;
...@@ -772,7 +780,7 @@ int main (int argc, char **argv) ...@@ -772,7 +780,7 @@ int main (int argc, char **argv)
if (multi_germline) if (multi_germline)
{ {
multigermline->build_default_set(multi_germline_file); multigermline->build_default_set(multi_germline_file, trim_sequences);
} }
else else
{ {
...@@ -780,7 +788,7 @@ int main (int argc, char **argv) ...@@ -780,7 +788,7 @@ int main (int argc, char **argv)
Germline *germline; Germline *germline;
germline = new Germline(germline_system, 'X', germline = new Germline(germline_system, 'X',
f_reps_V, f_reps_D, f_reps_J, f_reps_V, f_reps_D, f_reps_J,
delta_min, delta_max); delta_min, delta_max, trim_sequences);
germline->new_index(seed); germline->new_index(seed);
...@@ -799,7 +807,7 @@ int main (int argc, char **argv) ...@@ -799,7 +807,7 @@ int main (int argc, char **argv)
multigermline->build_with_one_index(seed, false); multigermline->build_with_one_index(seed, false);
} }
Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, 80); Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, 80, trim_sequences);
pseudo->index = multigermline->index ; pseudo->index = multigermline->index ;
multigermline->germlines.push_back(pseudo); multigermline->germlines.push_back(pseudo);
} }
...@@ -807,7 +815,7 @@ int main (int argc, char **argv) ...@@ -807,7 +815,7 @@ int main (int argc, char **argv)
// Should come after the initialization of regular (and possibly pseudo) germlines // Should come after the initialization of regular (and possibly pseudo) germlines
if (multi_germline_incomplete) { if (multi_germline_incomplete) {
multigermline->one_index_per_germline = true; // Starting from now, creates new indexes multigermline->one_index_per_germline = true; // Starting from now, creates new indexes
multigermline->build_incomplete_set(multi_germline_file); multigermline->build_incomplete_set(multi_germline_file, trim_sequences);
} }
if (multi_germline_mark) if (multi_germline_mark)
......
...@@ -161,6 +161,7 @@ Window prediction ...@@ -161,6 +161,7 @@ Window prediction
-M <int> maximal admissible delta between last V and first J k-mer (default: 20) (default with -D: 80) -M <int> maximal admissible delta between last V and first J k-mer (default: 20) (default with -D: 80)
-w <int> w-mer size used for the length of the extracted window (default: 50) -w <int> w-mer size used for the length of the extracted window (default: 50)
-e <float> maximal e-value for determining if a segmentation can be trusted (default: 'all', no limit) -e <float> maximal e-value for determining if a segmentation can be trusted (default: 'all', no limit)
-t <int> trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt (default: 100)
#+END_EXAMPLE #+END_EXAMPLE
The =-s=, =-k=, =-m= and =-M= options are the options of the seed-based heuristic. A detailed The =-s=, =-k=, =-m= and =-M= options are the options of the seed-based heuristic. A detailed
...@@ -189,6 +190,10 @@ The default value is 1.0, but values such as 1000, 1e-3 or even less can be used ...@@ -189,6 +190,10 @@ The default value is 1.0, but values such as 1000, 1e-3 or even less can be used
to have a more or less permissive segmentation. to have a more or less permissive segmentation.
The threshold can be disabled with =-e all=. The threshold can be disabled with =-e all=.
The =-t= option sets the maximal number of nucleotides that will be indexed in
V genes (the 3' end) or in J genes (the 5' end). By default, only 100 nt are
kept. This should be sufficient for the vast majority of
applications.
** Threshold on clone output ** Threshold on clone output
The following options control how many clones are output and analyzed. The following options control how many clones are output and analyzed.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment