Commit 6c5bb801 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files
parents c2ef87f9 9c7a455e
......@@ -349,7 +349,7 @@ affect_infos KmerAffectAnalyser<T>::getMaximum(const T &before,
results.nb_before_left = results.nb_before_right = results.nb_after_right = results.nb_after_left = 0;
currentValue = 0;
for (int i = 0; i < span - maxOverlap; i++) {
for (int i = 0; i < min(length,span - maxOverlap); i++) {
if (affectations[i] == after) {
currentValue--;
results.nb_after_right++;
......@@ -388,7 +388,7 @@ affect_infos KmerAffectAnalyser<T>::getMaximum(const T &before,
results.nb_before_right = 0;
}
}
for (int i = length - span + maxOverlap; i < length; i++) {
for (int i = length - span + maxOverlap; i < length && i >= 0; i++) {
if (affectations[i] == before)
results.nb_before_right++;
}
......
......@@ -26,6 +26,14 @@ vector<int> WindowsStorage::getStatus(junction window) {
return status_by_window[window];
}
string WindowsStorage::getLabel(junction window) {
if (windows_labels.find(window) == windows_labels.end())
return "" ;
return windows_labels[window];
}
Germline *WindowsStorage::getGermline(junction window) {
return germline_by_window[window];
}
......@@ -108,10 +116,10 @@ pair <int, int> WindowsStorage::keepInterestingWindows(size_t min_reads_window)
{
junction junc = it->first;
// Is it supported by enough reads?
// Is it not supported by enough reads?
if (!(seqs_by_window[junc].size() >= min_reads_window)
// Is it a labelled junction?
&& !(windows_labels.find(junc) == windows_labels.end()))
// Is it not a labelled junction?
&& (windows_labels.find(junc) == windows_labels.end()))
{
map <junction, list<Sequence> >::iterator toBeDeleted = it;
it++;
......@@ -187,7 +195,7 @@ JsonArray WindowsStorage::sortedWindowsToJsonArray(map <junction, JsonList> json
ostream &WindowsStorage::windowToStream(ostream &os, junction window, int num_seq,
size_t size) {
os << ">" << size << "--window--" << num_seq << " " << windows_labels[window] << endl ;
os << ">" << size << "--window--" << num_seq << " " << getLabel(window) << endl ;
os << window << endl;
return os;
}
......@@ -111,6 +111,12 @@ class WindowsStorage {
*/
void add(junction window, Sequence sequence, int status, Germline *germline);
/**
* Return the label of a window, if it exists
*/
string getLabel(junction window);
/**
* Only keep windows that are interesting. Those windows are windows
* supported by at least min_reads_window reads as well as windows that are
......
>lcl|FLN1FA001DNR97.1|
accctcggtcaccgtctc
>lcl|FLN1FA001C3XTY.1|
ccagggaaccctggtcaccg
>lcl|FLN1FA001A4CSH.1|
aatggtcaccgtctcctcag
>lcl|FLN1FA002RILD1.1|
ccctggtcaccgtctcctca
!LAUNCH: valgrind --leak-check=full ../../../vidjil -c clones -d -G ../../../germline/IGH -z 2 -r 1 bug20141024.fa 2>&1
!REQUIRES: which valgrind
$ No invalid read with short sequences
e1:ERROR SUMMARY: 0 errors from 0 contexts
$ No memory leak
e1:no leaks are possible
......@@ -29,6 +29,12 @@ option after the option !OUTPUT_DIR:
By default spaces can be replaced by any whitespaces. You can override this by
specifying !IGNORE_WHITESPACES: 0
* Requirements
Sometimes, to launch a test some requirements must be met. If the requirements
are not met we may want to skip the test. To do so, specify in the file
an option !REQUIRES: which will be followed by a command that is supposed
to exit with the error code 0. If the error code is different from 0 all the
tests in the file will be skipped.
* Environment
** Debug
If the environment variable DEBUG is defined, then some debug information
......@@ -67,6 +73,7 @@ LOG_FILE=${BASE%.*}.log
OUTPUT_FILE=
FILE_TO_GREP=
NO_LAUNCHER=
REQUIRE=
IGNORE_WHITESPACES=1
SEPARATOR_LINE="========================================================================"
......@@ -102,6 +109,10 @@ while read line; do
NO_LAUNCHER=1
elif [ "$type" == "IGNORE_WHITESPACES" ]; then
IGNORE_WHITESPACES=${line#*:}
elif [ "$type" == "REQUIRES" ]; then
REQUIRE=${line#*:}
else
echo "Unknown option $type" >&2
fi
elif [ ${line:0:1} == '$' ]; then
msg=${line:1}
......@@ -160,6 +171,10 @@ while read line; do
nb_hits=${nb_hits:1}
done
if ! eval $REQUIRE > /dev/null 2> /dev/null; then
skip=1
fi
# Replace whitespaces if needed
if [ $IGNORE_WHITESPACES -ne 0 ]; then
pattern=$(sed -r 's/[[:space:]]+/[[:space:]]+/g' <<< $pattern)
......
!LAUNCH: ../../vidjil -G ../../germline/IGH -d -r 5 -l ../../data/Stanford_S22.label ../../data/Stanford_S22.fasta
$ Keep the good number of windows, including one window labeled in Stanford_S22.label
1: keep 3 windows
$ The third clone has only one windows and the good label
1: clone-003.*0001-.* my-clone
......@@ -149,7 +149,7 @@ void usage(char *progname)
<< " -V <file> V germline multi-fasta file" << endl
<< " -D <file> D germline multi-fasta file (automatically implies -d)" << endl
<< " -J <file> J germline multi-fasta file" << endl
<< " -G <prefix> prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
<< " -G <prefix> prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa) (basename gives germline code)" << endl
<< " -g <path> multiple germlines (experimental)" << endl
<< endl
......@@ -163,22 +163,15 @@ void usage(char *progname)
#ifndef NO_SPACED_SEEDS
<< " (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
<< " -w <int> w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl
<< " -m <int> minimal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MIN << ") (default with -d: " << DEFAULT_DELTA_MIN_D << ")" << endl
<< " -M <int> maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -d: " << DEFAULT_DELTA_MAX_D << ")" << endl
<< " -w <int> w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ") (default with -d: " << DEFAULT_W_D << ")" << endl
<< endl
<< "Window annotations" << endl
<< " -l <file> labels for some windows -- these windows will be kept even if some limits are not reached" << endl
<< endl
<< "Additional clustering (not output in .vidjil file and therefore not used in the browser)" << endl
<< " -e <file> manual clustering -- a file used to force some specific edges" << endl
<< " -n <int> maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
<< " -N <int> minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
<< " -S generate and save comparative matrix for clustering" << endl
<< " -L load comparative matrix for clustering" << endl
<< " -C <string> use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
<< endl
<< "Limits to report a clone (or a window)" << endl
<< " -r <nb> minimal number of reads supporting a clone (default: " << MIN_READS_CLONE << ")" << endl
<< " -% <ratio> minimal percentage of reads supporting a clone (default: " << RATIO_READS_CLONE << ")" << endl
......@@ -192,11 +185,18 @@ void usage(char *progname)
<< "Fine segmentation options (second pass, see warning in doc/algo.org)" << endl
<< " -d segment into V(D)J components instead of VJ " << endl
<< " -m <int> minimal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MIN << ") (default when -d is used: " << DEFAULT_DELTA_MIN_D << ")" << endl
<< " -M <int> maximal admissible delta between segmentation points (default: " << DEFAULT_DELTA_MAX << ") (default when -d is used: " << DEFAULT_DELTA_MAX_D << ")" << endl
<< " -f <string> use custom Cost for fine segmenter : format \"match, subst, indels, homo, del_end\" (default "<<VDJ<<" )"<< endl
<< endl
<< "Additional clustering" << endl
<< " -e <file> manual clustering -- a file used to force some specific edges" << endl
<< " -n <int> maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
<< " -N <int> minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
<< " -S generate and save comparative matrix for clustering" << endl
<< " -L load comparative matrix for clustering" << endl
<< " -C <string> use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
<< endl
<< "Debug" << endl
<< " -U output segmented (in " << SEGMENTED_FILENAME << " file) sequences" << endl
<< " -u output unsegmented (in " << UNSEGMENTED_FILENAME << " file) sequences" << endl
......@@ -296,27 +296,12 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "Ahag:G:V:D:J:k:r:vw:e:C:t:l:dc:m:M:N:s:b:Sn:o:L%:y:z:uU")) != EOF)
while ((c = getopt(argc, argv, "Ahag:G:V:D:J:k:r:vw:e:C:f:l:dc:m:M:N:s:b:Sn:o:L%:y:z:uU")) != EOF)
switch (c)
{
case 'h':
usage(argv[0]);
case 'a':
output_sequences_by_cluster = true;
break;
case 'd':
segment_D = 1 ;
delta_min = DEFAULT_DELTA_MIN_D ;
delta_max = DEFAULT_DELTA_MAX_D ;
default_w = DEFAULT_W_D ;
break;
case 'e':
forced_edges = optarg;
break;
case 'l':
windows_labels_file = optarg;
break;
case 'c':
if (!strcmp(COMMAND_CLONES,optarg))
......@@ -332,9 +317,7 @@ int main (int argc, char **argv)
usage(argv[0]);
}
break;
case 'v':
verbose += 1 ;
break;
// Germline
......@@ -363,13 +346,22 @@ int main (int argc, char **argv)
f_rep_V = (germline_system + "V.fa").c_str() ;
f_rep_D = (germline_system + "D.fa").c_str() ;
f_rep_J = (germline_system + "J.fa").c_str() ;
if (germline_system.find_last_of("/\\") != string::npos)
germline_system.erase(0, germline_system.find_last_of("/\\")+1);
germline_system = extract_basename(germline_system);
// TODO: if VDJ, set segment_D // NO, bad idea, depends on naming convention
break;
// Algorithm
case 's':
#ifndef NO_SPACED_SEEDS
seed = string(optarg);
k = seed_weight(seed);
options_s_k++ ;
#else
cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
break;
case 'k':
k = atoi(optarg);
seed = seed_contiguous(k);
......@@ -388,6 +380,8 @@ int main (int argc, char **argv)
delta_max = atoi(optarg);
break;
// Output
case 'o':
out_dir = optarg ;
break;
......@@ -396,6 +390,14 @@ int main (int argc, char **argv)
f_basename = optarg;
break;
case 'a':
output_sequences_by_cluster = true;
break;
case 'v':
verbose += 1 ;
break;
// Limits
case '%':
......@@ -430,20 +432,16 @@ int main (int argc, char **argv)
max_representatives = -1 ;
max_clones = -1 ;
break ;
// Seeds
case 's':
#ifndef NO_SPACED_SEEDS
seed = string(optarg);
k = seed_weight(seed);
options_s_k++ ;
#else
cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl;
#endif
break;
case 'l':
windows_labels_file = optarg;
break;
// Clustering
case 'e':
forced_edges = optarg;
break;
case 'n':
epsilon = atoi(optarg);
......@@ -465,7 +463,16 @@ int main (int argc, char **argv)
cluster_cost=strToCost(optarg, Cluster);
break;
case 't':
// Fine segmentation
case 'd':
segment_D = 1 ;
delta_min = DEFAULT_DELTA_MIN_D ;
delta_max = DEFAULT_DELTA_MAX_D ;
default_w = DEFAULT_W_D ;
break;
case 'f':
segment_cost=strToCost(optarg, VDJ);
break;
......@@ -477,6 +484,9 @@ int main (int argc, char **argv)
break;
}
//$$ options: post-processing+display
// If there was no -w option, then w is either DEFAULT_W or DEFAULT_W_D
if (w == 0)
w = default_w ;
......@@ -507,8 +517,6 @@ int main (int argc, char **argv)
exit(1);
}
//$$ options: post-processing+display
size_t min_cover_representative = (size_t) (MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE * min_reads_clone) ;
if (!segment_D) // TODO: add other constructor to Fasta, and do not load rep_D in this case
......@@ -1030,8 +1038,7 @@ int main (int argc, char **argv)
string clone_id_human = oss_human.str();
// Window label
string window_str = ">" + clone_id + "--window" + " " + windows_labels[it->first] + '\n' + it->first + '\n' ;
string window_str = ">" + clone_id + "--window" + " " + windowsStorage->getLabel(it->first) + '\n' + it->first + '\n' ;
//$$ If max_representatives is reached, we stop here but still outputs the window
......
TGTGCGAGAGATGGACGGGATACGTAAAACGACATATGGTTCGGGGTTTGGTGCTTTTGA my-clone
......@@ -125,10 +125,12 @@ Window prediction
(default: #####-#####, ######-######, #######-#######, depends on germline)
-k <int> k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)
(using -k option is equivalent to set with -s a contiguous seed with only '#' characters)
-m <int> minimal admissible delta between last V and first J k-mer (default: -10) (default with -d: 0)
-M <int> maximal admissible delta between last V and first J k-mer (default: 20) (default with -d: 80)
-w <int> w-mer size used for the length of the extracted window (default: 40)(default with -d: 60)
#+END_EXAMPLE
The =-s= and =-k= options are the options of the heuristic. A detailed
The =-s=, =-k=, =-m= and =-M= options are the options of the seed-based heuristic. A detailed
explanation can be found in the paper. More help on that will be
available in the following months. The defaults values should work.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment