Commit 22ad13cf authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-a/4681-filter-reads' into 'dev'

Preset --filter-reads

Closes #4681

See merge request !906
parents a79706ac 6f2f5f6e
Pipeline #216599 failed with stages
in 15 minutes and 21 seconds
# Random DNA sequences, %GC = 0.50
>random-10
tgtcccgatt
>random-20
aatggatgtaccggaggctg
>random-30
gcaactgcgatgagggtacccccccactct
>random-40
gcaaccgtgtggtggtgcagcatgtaatgtctaacccgtc
>random-50
gcaacgtccggttggtaagccggaaaaagtaaacctgtatgcgctacatg
>random-60
ccaatctttagtgactcctgtctcgtggcaagctgtccgttttgggcgtaggttccagaa
>random-70
aggtagggtttgtaggtatgactggctcaacgtgataggccgagagcaagttcattgacaacaccctctg
>random-80
gaaaaggcccacgccggcagctgtgagatgttaggtccggggggtcgctagtattctcacacctgcataacccgggaata
>random-90
ggcaagactaacctatggttccgtgtaagcaacccttcccaaaccgatctctcgccgagcccaccagtaaccccttaagacaaacctggt
>random-100
atttgtatgtctggccacctaaagctacaataagccggatgatagtcagatcgcaaatagcgtttatccggctttaaacgcggaacgcgtgagtattttg
!NO_LAUNCHER:
!LAUNCH: cat $VIDJIL_DATA/random.fa $VIDJIL_DATA/segment_S22.fa > $VIDJIL_DATA/random+S22.fa
!LAUNCH: $VIDJIL_DIR/$EXEC --filter-reads -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/random+S22.fa
$ Correct information
1: Detecting V.D.J recombinations
$ Correct filtering
1: junction detected in 3 reads .23..%.
1: IGH .* 3
$ Output
1: ==> .*/random.S22.detected.vdj.fa
$ No spurious output (implied -c detect)
3:windows
0:Diversity measures
0:AIRR output
\ No newline at end of file
......@@ -13,7 +13,7 @@
0:Extracting windows
# '-c designations', one time
1:Segmenting V.D.J
1:Designating V.D.J
# '-c germlines'
1:Discovering germlines
......
!NO_LAUNCHER:
cd $VIDJIL_DIR ; ./$EXEC -h | grep '$EXEC -c' | sed 's/X 50/X 5/' | sed 's/demo.LIL-L4/-X 1000 demo\/LIL-L4/' > help-examples.sh
cd $VIDJIL_DIR ; ./$EXEC -h | grep '$EXEC.*\#' | sed 's/X 50/X 5/' | sed 's/demo.LIL-L4/-X 1000 demo\/LIL-L4/' > help-examples.sh
cd $VIDJIL_DIR ; sh help-examples.sh
# Test examples embedded in './vidjil-algo -h'
# Run 6 times (-h + 5 examples)
6:vidjil-algo -- V.D.J recombinations analysis
# Run 7 times (-h + 6 examples)
7:vidjil-algo -- V.D.J recombinations analysis
# '-c clone', three times
# '-c clones', three times
3: end of clones
# '-c windows'
1:Extracting windows
# '--filter-reads'
1:Detecting V.D.J recombinations$
# '-c clones' / '-c windows'
4:Detecting V.D.J recombinations and
# '-c designations'
1:Segmenting V.D.J
1:Designating V.D.J recombinations
# '-c germlines'
1:Discovering germlines
......
......@@ -15,7 +15,7 @@ $ Do not display advanced options
0: custom Cost
$ Correct number of regular options
25:^..-
26:^..-
!NO_LAUNCHER:
......@@ -37,4 +37,4 @@ $ Display advanced options
: custom Cost
$ Correct number of options, including advanced options
62:^..-
63:^..-
......@@ -3,7 +3,7 @@
# Testing '-w all' option
$ The junctions are the reads
1: considering all analyzed reads as windows
1: considering all detected reads as windows
1: found 13141 windows in 13153 reads
$ Only the first 12 clones have 2 reads
......
!LAUNCH: $VIDJIL_DIR/$EXEC --all -w all -g $VIDJIL_DIR/germline/homo-sapiens.g $VIDJIL_DATA/s-somatic.fa ; cat out/s-somatic.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
$ No clustering due to -w all
1: considering all analyzed reads as windows
1: considering all detected reads as windows
$ Warning on similar clones in json ('warn' because this is a clone in the first 10 clones)
1: "code": "W53", "level": "warn", "msg": "Similar to clone #1 .* IGHV3-48.01 .* IGHJ4.02"
......@@ -86,12 +86,13 @@
#define DEFAULT_RATIO_READS_CLONE 0.0
#define NO_LIMIT "all"
#define COMMAND_DETECT "detect"
#define COMMAND_WINDOWS "windows"
#define COMMAND_CLONES "clones"
#define COMMAND_SEGMENT "designations"
#define COMMAND_GERMLINES "germlines"
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
enum { CMD_DETECT, CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_OUT_DIR "./out/"
......@@ -134,6 +135,10 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define MAX_CLONES_FOR_SIMILARITY 20
#define EVALUE_FILTER_READS 1e6
#define STR_(X) #X
#define STR(X) STR_(X)
// warn
#define WARN_MAX_CLONES 5000
#define WARN_PERCENT_SEGMENTED 40
......@@ -166,6 +171,7 @@ string usage_examples(char *progname)
<< " # including unexpected recombinations (-2), assign V(D)J genes and try to detect the CDR3s (-3))" << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g:IGH -3 demo/Stanford_S22.fasta # (restrict to complete recombinations on the IGH locus)" << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g -2 -3 -z 20 demo/LIL-L4.fastq.gz # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
<< " " << progname << " --filter-reads -g germline/homo-sapiens.g demo/LIL-L4.fastq.gz # (pre-filter, extract all reads that may have V(D)J recombinations)" << endl
<< " " << progname << " -c windows -g germline/homo-sapiens.g -y 0 -uu -U demo/LIL-L4.fastq.gz # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
<< " " << progname << " -c designations -g germline/homo-sapiens.g -2 -3 -X 50 demo/Stanford_S22.fasta # (full analysis of each read, here on 50 sampled reads)" << endl
<< " " << progname << " -c germlines -g germline/homo-sapiens.g demo/Stanford_S22.fasta # (statistics on the k-mers)" << endl
......@@ -265,8 +271,10 @@ int main (int argc, char **argv)
string cmd = COMMAND_CLONES;
app.add_option("-c", cmd, "command"
"\n \t\t" COMMAND_CLONES " \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)"
"\n \t\t" COMMAND_WINDOWS " \t locus detection, window extraction"
"\n \t\t" COMMAND_CLONES " \t locus/V(D)J detection, window extraction, clone clustering (default command, most efficient, all outputs)"
"\n \t\t" COMMAND_WINDOWS " \t locus/V(D)J detection, window extraction"
"\n \t\t" COMMAND_DETECT " \t locus/V(D)J detection"
"\n \t\t" COMMAND_SEGMENT " \t detailed V(D)J designation, without prior clustering (not as efficient)"
"\n \t\t" COMMAND_GERMLINES " \t statistics on k-mers in different germlines")
-> group(group) -> type_name("COMMAND");
......@@ -624,6 +632,18 @@ int main (int argc, char **argv)
// ----------------------------------------------------------------------------------------------------------------------
group = "Presets";
app.add_flag_function("--filter-reads", [&](int n) {
UNUSED(n);
cmd = COMMAND_DETECT;
output_segmented = true;
expected_value = EVALUE_FILTER_READS ;
multi_germline_unexpected_recombinations_12 = true;
return true;
},
"filter possibly huge datasets, with a permissive threshold, to extract reads that may have V(D)J recombinations"
PAD_HELP "(equivalent to -c " COMMAND_DETECT " --out-detected --e-value " STR(EVALUE_FILTER_READS) " -2)")
-> group(group);
app.add_option("--grep-reads",
[&only_labeled_windows,&windows_labels_explicit,&output_sequences_by_cluster](CLI::results_t res) {
only_labeled_windows = true;
......@@ -670,6 +690,11 @@ int main (int argc, char **argv)
command = CMD_CLONES;
else if (cmd == COMMAND_SEGMENT)
command = CMD_SEGMENT;
else if (cmd == COMMAND_DETECT) {
command = CMD_DETECT;
no_airr = true;
no_vidjil = true;
}
else if (cmd == COMMAND_WINDOWS)
command = CMD_WINDOWS;
else if (cmd == COMMAND_GERMLINES)
......@@ -788,11 +813,13 @@ int main (int argc, char **argv)
json j_labels = load_into_map_from_json(windows_labels, windows_labels_json);
switch(command) {
case CMD_WINDOWS: cout << "Extracting windows" << endl;
case CMD_DETECT: cout << "Detecting V(D)J recombinations" << endl;
break;
case CMD_WINDOWS: cout << "Detecting V(D)J recombinations and extracting windows" << endl;
break;
case CMD_CLONES: cout << "Analysing clones" << endl;
case CMD_CLONES: cout << "Detecting V(D)J recombinations and analyzing clones" << endl;
break;
case CMD_SEGMENT: cout << "Segmenting V(D)J" << endl;
case CMD_SEGMENT: cout << "Designating V(D)J recombinations" << endl;
break;
case CMD_GERMLINES: cout << "Discovering germlines" << endl;
break;
......@@ -1107,18 +1134,18 @@ int main (int argc, char **argv)
////////////////////////////////////////
// CLONE ANALYSIS //
////////////////////////////////////////
if (command == CMD_CLONES || command == CMD_WINDOWS) {
if (command == CMD_CLONES || command == CMD_WINDOWS || command == CMD_DETECT) {
//////////////////////////////////
//$$ Kmer Segmentation
cout << endl;
cout << "Loop through reads, ";
cout << "Loop through reads, detecting V(D)J recombinations";
if (wmer_size != NO_LIMIT_VALUE)
cout << "looking for windows up to " << wmer_size << "bp" << endl;
cout << " while extracting windows up to " << wmer_size << "bp" << endl;
else
cout << "considering all analyzed reads as windows" << endl;
cout << " while considering all detected reads as windows" << endl;
ofstream *out_segmented = NULL;
ofstream *out_unsegmented = NULL;
......@@ -1217,6 +1244,10 @@ int main (int argc, char **argv)
cout << stream_segmentation_info.str();
// CMD_DETECT stops here
if (command == CMD_CLONES || command == CMD_WINDOWS) {
//////////////////////////////////
//$$ Sort windows
......@@ -1302,6 +1333,7 @@ int main (int argc, char **argv)
cout << "No clustering" << endl ;
}
// CMD_WINDOWS stops here
//$$ Further analyze some clones (-z)
if (command == CMD_CLONES) {
......@@ -1735,6 +1767,7 @@ int main (int argc, char **argv)
if (jsonLevenshteinComputed)
output.set("similarity", jsonLevenshtein);
} // end if (command == CMD_CLONES) || (command == CMD_WINDOWS)
//$$ Clean
......
......@@ -689,31 +689,32 @@ Detailed output per read (generally not recommended, large files, but may be use
-uuu output undetected reads, all reads, including a .undetected.vdj.fa file (use only for debug)
--out-reads output all reads by clones (clone.fa-*), to be used only on small datasets
-K, --out-affects output detailed k-mer affectation for each read (in .affects file) (use only for debug, for example -KX 100)
Presets
--filter-reads filter possibly huge datasets, with a permissive threshold, to extract reads that may have V(D)J recombinations
(equivalent to -c detect --out-detected --e-value 1e6 -2)
```
It is possible to extract all reads with or without detected recombinations,
possibly to give them to other software.
Runing Vidjil-algo with `-U` gives a file `out/basename.detected.vdj.fa`, with all detected reads.
On datasets generated with rather specific V(D)J primers, this is generally not recommended, as it may generate a large file.
However, the `-U` option is very useful for whole RNA-Seq or capture datasets that contain few reads with V(D)J recombinations.
Moreover `-U` only uses the ultra-fast first passs analysis, based on k-mer heuristics.
Similarly, options are available to get the non analyzed reads:
- `-U` gives a file `out/basename.detected.vdj.fa` with all reads having a detected V(D)J recombination
- `-u` gives a set of files `out/basename.UNSEG_*`, with not detected reads gathered by cause.
It outputs only reads sharing significantly sequences with V/J germline genes or with some ambiguity:
it may be interesting to further study RNA-Seq datasets.
- `-uu` gives the same set of files, including **all** not detected reads (including `UNSEG too short` and `UNSEG too few V/J`),
- `-u` gives a set of files `out/basename.UNSEG_*` with reads where /no V(D)J recombination was detected/,
but with nevertheless some significant similarity to some V/J germline genes,
- `-uu` further produce files with all /other/ reads where no V(D)J recombination was detected
(including `UNSEG too short` and `UNSEG too few V/J`),
and `-uuu` further outputs all these reads in a file `out/basename.undetected.vdj.fa`.
Again, as these options may generate large files, they are generally not recommended.
As these options may generate large files, they are generally not recommended.
However, they are very useful in some situations, especially to understand
why some dataset gives low detection rate.
For example `-uu -X 1000` splits the not detected reads from the 1000 first reads.
When processing large datasets, such as RNA-Seq or capture, one may want to pre-filter read by keeping only the ones that potentially harbour a V(D)J recombination.
In such a case, the recommanded option is to use the `--filter-reads` preset, that launches Vidjil-algo without clone clustering and analysis,
while outputing a `out/basename.detected.vdj.fa` file. This file contains reads /that may have V(D)J recombinations/, evaluated with a very permissive threshold.
The resulting file is usually much smaller on such datasets and can then be transferred or analysed in-depth more easily.
## AIRR .tsv output
Since version 2018.10, vidjil-algo supports the [AIRR format](http://docs.airr-community.org/en/latest/datarep/rearrangements.html#fields).
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment