Commit 535e4393 authored by Mikael Salson's avatar Mikael Salson

vidjil.cpp: Add -p option and pass it through WindowExtractor to KmerMultiSegmenter.

Doc updated accordingly.
parent 09ed77dd
......@@ -28,7 +28,7 @@
#define JSON_REMEMBER_BEST 4 /* The number of V/D/J predictions to keep */
#define THRESHOLD_NB_EXPECTED 1E-6 /* Threshold of the accepted expected value for number of found k-mers */
#define THRESHOLD_NB_EXPECTED 100 /* Threshold of the accepted expected value for number of found k-mers */
using namespace std;
......
......@@ -10,7 +10,8 @@ WindowExtractor::WindowExtractor(): out_segmented(NULL), out_unsegmented(NULL),
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *multigermline,
size_t w,
map<string, string> &windows_labels,
int stop_after, int only_nth_read, bool keep_unsegmented_as_clone) {
int stop_after, int only_nth_read, bool keep_unsegmented_as_clone,
double nb_expected) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
......@@ -37,7 +38,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
*out_affects << reads->getSequence();
}
KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects);
KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects, nb_expected);
KmerSegmenter *seg = kmseg.the_kseg ;
int read_length = seg->getSequence().sequence.length();
......
......@@ -39,6 +39,7 @@ class WindowExtractor {
* @param multigermline: the multigermline
* @param w: length of the window
* @param windows_labels: Windows that must be kept and registered as such.
* @param nb_expected: maximal e-value of the segmentation
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
......@@ -47,7 +48,8 @@ class WindowExtractor {
WindowsStorage *extract(OnlineFasta *reads, MultiGermline *multigermline,
size_t w,
map<string, string> &windows_labels,
int stop_after=-1, int only_nth_reads=1, bool keep_unsegmented_as_clone=false);
int stop_after=-1, int only_nth_reads=1, bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED);
/**
* @return the average length of sequences whose segmentation has been classified as seg
......
......@@ -179,6 +179,7 @@ void usage(char *progname, bool advanced)
<< " -m <int> minimal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MIN << ") (default with -D: " << DEFAULT_DELTA_MIN_D << ")" << endl
<< " -M <int> maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -D: " << DEFAULT_DELTA_MAX_D << ")" << endl
<< " -w <int> w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ") (default with -D: " << DEFAULT_W_D << ")" << endl
<< " -p <float> maximal e-value for determining if a segmentation can be trusted (default: " << THRESHOLD_NB_EXPECTED <<")" << endl
<< endl
<< "Window annotations" << endl
......@@ -331,13 +332,15 @@ int main (int argc, char **argv)
int options_s_k = 0 ;
double expected_value = THRESHOLD_NB_EXPECTED;
//JsonArray which contains the Levenshtein distances
JsonArray jsonLevenshtein;
//$$ options: getopt
while ((c = getopt(argc, argv, "A!x:X:hHaiI1g:G:V:D:J:k:r:vw:e:C:f:l:c:m:M:N:s:b:Sn:o:L%:y:z:uUK3")) != EOF)
while ((c = getopt(argc, argv, "A!x:X:hHaiI1g:G:V:D:J:k:r:vw:e:C:f:l:c:m:M:N:s:b:Sn:o:L%:y:z:uUK3p:")) != EOF)
switch (c)
{
......@@ -454,6 +457,10 @@ int main (int argc, char **argv)
keep_unsegmented_as_clone = true;
break;
case 'p':
expected_value = atof(optarg);
break;
// Output
case 'o':
......@@ -927,7 +934,7 @@ int main (int argc, char **argv)
we.setAffectsOutput(out_affects);
}
WindowsStorage *windowsStorage = we.extract(reads, multigermline, w, windows_labels, max_reads_processed, only_nth_read, keep_unsegmented_as_clone);
WindowsStorage *windowsStorage = we.extract(reads, multigermline, w, windows_labels, max_reads_processed, only_nth_read, keep_unsegmented_as_clone, expected_value);
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
......
......@@ -159,6 +159,7 @@ Window prediction
-m <int> minimal admissible delta between last V and first J k-mer (default: -10) (default with -D: 0)
-M <int> maximal admissible delta between last V and first J k-mer (default: 20) (default with -D: 80)
-w <int> w-mer size used for the length of the extracted window (default: 40)(default with -D: 60)
-p <float> e-value like for determining if a window must be segmented or no (default: 100)
#+END_EXAMPLE
The =-s=, =-k=, =-m= and =-M= options are the options of the seed-based heuristic. A detailed
......@@ -178,6 +179,13 @@ Setting =-w= to 30 for VJ and 50 for VDJ recombinations may "segment" (analyze)
few more reads, but may in some rare cases falsely cluster reads from
different clones. Setting =-w= to lower values is not recommended.
The =-p= option is used to determine what is the maximal e-value accepted for
segmenting a sequence. If a segmentation has a higher e-value, it will not be
segmented. The default value is 100 but it is *not* the recommended
value. The purpose is to keep Vidjil with the same behaviour. The
recommendation would be to use a value of 1e-6 (or lower with huge datasets).
** Threshold on clone output
The following options control how many clones are output and analyzed.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment