Commit 1f3d1d48 authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/3772-c-designations' into 'dev'

-c designations

Closes #3772

See merge request !429
parents 8a2a1ee6 865b913e
Pipeline #66682 passed with stages
in 51 minutes and 21 seconds
!LAUNCH: $VIDJIL_DIR/$EXEC -c segment -g ../../../germline/homo-sapiens.g:IGH bug20131218.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -g ../../../germline/homo-sapiens.g:IGH bug20131218.fa
# The number of deletions in the D is larger than the D length itself.
$ Bug with the number of deletions in the D.
......
!LAUNCH: $VIDJIL_DIR/$EXEC -E 1 -d -g $VIDJIL_DIR/germline -c segment bug2249.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -E 1 -d -g $VIDJIL_DIR/germline -c designations bug2249.fa
$ Should not have a TRDD2 (length 9) with 8 deletions (first seq)
f0:7 TRDD2.01 1
......
......@@ -250,7 +250,7 @@ def header_igblast_results(ff_fasta, ff_igblast):
### Vidjil
VIDJIL_FINE = '{directory}/vidjil-algo --header-sep "#" -c segment -2 -3 -d -g {directory}/germline/homo-sapiens.g %s >> %s'
VIDJIL_FINE = '{directory}/vidjil-algo --header-sep "#" -c designations -2 -3 -d -g {directory}/germline/homo-sapiens.g %s >> %s'
VIDJIL_KMER = '{directory}/vidjil-algo -w 20 --header-sep "#" -b out -c windows -uuuU -2 -g {directory}/germline/homo-sapiens.g %s > /dev/null ; cat out/out.segmented.vdj.fa out/out.unsegmented.vdj.fa >> %s'
def should_results_from_vidjil_output(f_log):
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -g $VIDJIL_DIR/germline/homo-sapiens.g $VIDJIL_DATA/3344-bad-filtering.fa
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -g $VIDJIL_DIR/germline/homo-sapiens.g $VIDJIL_DATA/3344-bad-filtering.fa
$ Check that proper filtering is used
1: IGHV4-31.02
!REQUIRES: python $VIDJIL_DIR/tools/check_python_version.py
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS --alternative-genes 3 -c segment -x 1 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta > /dev/null ; cat out/Stanford_S22.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS --alternative-genes 3 -c designations -x 1 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta > /dev/null ; cat out/Stanford_S22.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
$ Presence of alternative:
1: "3alt"
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -3 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRG $VIDJIL_DATA/cdr3-stopcodon.fa
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -3 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRG $VIDJIL_DATA/cdr3-stopcodon.fa
!OUTPUT_FILE: out/cdr3-stopcodon.vidjil
$ Two identical junctions in JSON
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -3 -E 1.0 -g $VIDJIL_DIR/germline ../should-vdj-tests/Demo-X5.should-vdj.fa
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -3 -E 1.0 -g $VIDJIL_DIR/germline ../should-vdj-tests/Demo-X5.should-vdj.fa
$ Detects a CDR3 on regular V(D)J recombinations
1: IGH SEG.* [{].*[}]
......
......@@ -12,7 +12,7 @@
# '-c windows'
0:Extracting windows
# '-c segment', one time
# '-c designations', one time
1:Segmenting V.D.J
# '-c germlines'
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment reads 2>&1
!EXIT_CODE: 1
$ Deprecated option
1:is deprecated
$ Advice on usage
1:-c designations
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -3 -c segment -g $VIDJIL_DIR/germline/mus-musculus.g $VIDJIL_DATA/example_mouse.fa
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -3 -c designations -g $VIDJIL_DIR/germline/mus-musculus.g $VIDJIL_DATA/example_mouse.fa
$ Segment sequence on IGK
1:segIGK.*IGKV.*IGKJ
......
!LAUNCH: $VIDJIL_DIR/$EXEC -c segment $VIDJIL_DEFAULT_OPTIONS -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH,IGK,IGL $VIDJIL_DATA/multi-complete.fa ; cat out/multi-complete.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations $VIDJIL_DEFAULT_OPTIONS -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH,IGK,IGL $VIDJIL_DATA/multi-complete.fa ; cat out/multi-complete.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
$ Segment the Ig recombinations
1:IGH SEG
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -g $VIDJIL_DIR/germline $VIDJIL_DATA/multi-tiny.fa
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -g $VIDJIL_DIR/germline $VIDJIL_DATA/multi-tiny.fa
$ Do not segment any of the seven reads
7:UNSEG
......
# The sequences are on TRG, no CDR3 should be found with IGH
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -3 -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -D $VIDJIL_DIR/germline/homo-sapiens/IGHD.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa ../should-vdj-tests/cdr3-indels.should-vdj.fa; cat out/cdr3-indels.should-vdj.vidjil
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -3 -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -D $VIDJIL_DIR/germline/homo-sapiens/IGHD.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa ../should-vdj-tests/cdr3-indels.should-vdj.fa; cat out/cdr3-indels.should-vdj.vidjil
$ No CDR3 should be found
0:TRGV.* [{].*[}]
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -3 -g $VIDJIL_DIR/germline $VIDJIL_DATA/no-cdr3.fa
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -3 -g $VIDJIL_DIR/germline $VIDJIL_DATA/no-cdr3.fa
$ No CDR3 should be found
0:seq1.* IGH SEG.* [{].*[}]
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH -A $VIDJIL_DATA/overlap-d-j.fa | grep -v out | tail -4 | tr -d '\n' | wc -c
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH -A $VIDJIL_DATA/overlap-d-j.fa | grep -v out | tail -4 | tr -d '\n' | wc -c
$ Exported sequence has all the bases
1:116
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH -c segment $VIDJIL_DATA/segment_S22.fa | grep '^>' ; cat out/segment_S22.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH -c designations $VIDJIL_DATA/segment_S22.fa | grep '^>' ; cat out/segment_S22.vidjil | python $VIDJIL_DIR/tools/format_json.py -1
$ First sequence Stanford
# 164 175 195 203
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -v -g $VIDJIL_DIR/germline -c segment $VIDJIL_DATA/segment_simul.fa | grep '^[>#]'
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -v -g $VIDJIL_DIR/germline -c designations $VIDJIL_DATA/segment_simul.fa | grep '^[>#]'
$ First sequence, easy segmentation (no error, few deletions at the windows, small N)
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -x 2 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -x 2 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta
$ Segments the good number of sequences in Stanford S22
2: >lcl
......
......@@ -14,7 +14,7 @@ cd $VIDJIL_DIR
# '-c windows'
1:Extracting windows
# '-c segment'
# '-c designations'
1:Segmenting V.D.J
# '-c germlines'
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c segment -g $VIDJIL_DIR/germline/homo-sapiens.g $VIDJIL_DATA/what-V.fa
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -g $VIDJIL_DIR/germline/homo-sapiens.g $VIDJIL_DATA/what-V.fa
$ V1*01 is recognized three times
3: VJ .* TRGV1.01
......
......@@ -87,7 +87,7 @@
#define COMMAND_WINDOWS "windows"
#define COMMAND_CLONES "clones"
#define COMMAND_SEGMENT "segment"
#define COMMAND_SEGMENT "designations"
#define COMMAND_GERMLINES "germlines"
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
......@@ -150,14 +150,14 @@ string usage_examples(char *progname)
stringstream ss;
ss
<< "Examples (see " DOCUMENTATION ")" << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g -2 -3 -r 1 demo/Demo-X5.fa # (basic usage, detect the locus for each read," << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g -2 -3 -r 1 demo/Demo-X5.fa # (basic usage, detect the locus for each read," << endl
<< " # cluster reads and report clones starting from the first read (-r 1)," << endl
<< " # including unexpected recombinations (-2), assign V(D)J genes and try to detect the CDR3s (-3))" << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g:IGH -3 demo/Stanford_S22.fasta # (restrict to complete recombinations on the IGH locus)" << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g -2 -3 -z 20 demo/LIL-L4.fastq.gz # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
<< " " << progname << " -c windows -g germline/homo-sapiens.g -y 0 -uu -U demo/LIL-L4.fastq.gz # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
<< " " << progname << " -c segment -g germline/homo-sapiens.g -2 -3 -X 50 demo/Stanford_S22.fasta # (full analysis of each read, only for debug/testing, here on 50 sampled reads)" << endl
<< " " << progname << " -c germlines -g germline/homo-sapiens.g demo/Stanford_S22.fasta # (statistics on the k-mers)" << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g:IGH -3 demo/Stanford_S22.fasta # (restrict to complete recombinations on the IGH locus)" << endl
<< " " << progname << " -c clones -g germline/homo-sapiens.g -2 -3 -z 20 demo/LIL-L4.fastq.gz # (basic usage, output detailed V(D)J analysis on the first 20 clones)" << endl
<< " " << progname << " -c windows -g germline/homo-sapiens.g -y 0 -uu -U demo/LIL-L4.fastq.gz # (splits all the reads into (large) files depending on the detection of V(D)J recombinations)" << endl
<< " " << progname << " -c designations -g germline/homo-sapiens.g -2 -3 -X 50 demo/Stanford_S22.fasta # (full analysis of each read, here on 50 sampled reads)" << endl
<< " " << progname << " -c germlines -g germline/homo-sapiens.g demo/Stanford_S22.fasta # (statistics on the k-mers)" << endl
;
return ss.str();
......@@ -252,7 +252,7 @@ int main (int argc, char **argv)
app.add_option("-c", cmd, "command"
"\n \t\t" COMMAND_CLONES " \t locus detection, window extraction, clone clustering (default command, most efficient, all outputs)"
"\n \t\t" COMMAND_WINDOWS " \t locus detection, window extraction"
"\n \t\t" COMMAND_SEGMENT " \t detailed V(D)J designation (not recommended)"
"\n \t\t" COMMAND_SEGMENT " \t detailed V(D)J designation, without prior clustering (not as efficient)"
"\n \t\t" COMMAND_GERMLINES " \t statistics on k-mers in different germlines")
-> group(group) -> set_type_name("COMMAND");
......@@ -602,6 +602,8 @@ int main (int argc, char **argv)
command = CMD_WINDOWS;
else if (cmd == COMMAND_GERMLINES)
command = CMD_GERMLINES;
else if (cmd == "segment")
return app.exit(CLI::ConstructionError("'-c segment' is deprecated, please use '-c designations'", 1));
else {
return app.exit(CLI::ConstructionError("Unknown command " + cmd, 1));
}
......@@ -739,15 +741,15 @@ int main (int argc, char **argv)
if (command == CMD_SEGMENT)
{
cout << endl
<< "* WARNING: " << PROGNAME << " was run with '-c segment' option" << endl ;
<< "* WARNING: " << PROGNAME << " was run with '-c" COMMAND_SEGMENT "' option" << endl ;
}
if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES || command == CMD_SEGMENT)
{
cout << "* " << PROGNAME << " efficientl extracts windows overlapping the CDR3" << endl
cout << "* " << PROGNAME << " efficiently extracts windows overlapping the CDR3" << endl
<< "* to cluster reads into clones ('-c clones')." << endl
<< "* Computing accurate V(D)J designations for many sequences ('-c segment' or large '-z' values)" << endl
<< "* is slow and should be done only on small datasets or for testing purposes." << endl
<< "* Computing accurate V(D)J designations for many sequences ('-c " COMMAND_SEGMENT "' or large '-z' values)" << endl
<< "* is not as efficient as the default '-c " COMMAND_CLONES "' command." << endl
<< "* More information is provided in " DOCUMENTATION "." << endl
<< endl ;
}
......@@ -1567,7 +1569,7 @@ int main (int argc, char **argv)
} else if (command == CMD_SEGMENT) {
//$$ CMD_SEGMENT
////////////////////////////////////////
// V(D)J SEGMENTATION //
// V(D)J DESIGNATION //
////////////////////////////////////////
int nb = 0;
......
......@@ -299,7 +299,7 @@ Recombination detection ("window" prediction, first pass)
(all these options, except -w, are overriden when using -g)
-k INT k-mer size used for the V/J affectation (default: 10, 12, 13, depends on germline)
-w INT w-mer size used for the length of the extracted window ('all': use all the read, no window clustering)
-e FLOAT=1 maximal e-value for determining if a V-J segmentation can be trusted
-e FLOAT=1 maximal e-value for determining if a V-J designation can be trusted
-t INT trim V and J genes (resp. 5' and 3' regions) to keep at most <INT> nt (0: no trim)
-s SEED=10s seed, possibly spaced, used for the V/J affectation (default: depends on germline), given either explicitely or by an alias
10s:#####-##### 12s:######-###### 13s:#######-###### 9c:#########
......@@ -349,7 +349,7 @@ It is an upper bound on the number of exepcted windows found by chance by the se
The e-value computation takes into account both the number of reads in the
input sequence and the number of locus searched for.
The default value is 1.0, but values such as 1000, 1e-3 or even less can be used
to have a more or less permissive segmentation.
to have a more or less permissive designation.
The threshold can be disabled with `-e all`.
The `-t` option sets the maximal number of nucleotides that will be indexed in
......@@ -643,7 +643,7 @@ We export all required fields, some optional fields, as also some custom fields
Note that Vidjil-algo is designed to efficiently gather reads from large datasets into clones.
By default (`-c clones`), we thus report in the AIRR format *clones*.
See also [What is a clone ?](vidjil-format/#what-is-a-clone).
Using `-c segment` trigger a separate analysis for each read, but this is usually not advised for large datasets.
Using `-c designations` trigger a separate analysis for each read, but this is usually not advised for large datasets.
| Name | Type | AIRR 1.2 Description <br /> *vidjil-algo implementation* |
......@@ -669,14 +669,14 @@ Our implementation of .tsv may evolve in future versions.
Contact us if a particular feature does interest you.
## Segmentation and .vdj format
## The .vdj format
Vidjil output includes segmentation of V(D)J recombinations. This happens
Vidjil output includes analysis of V(D)J recombinations. This happens
in the following situations:
- in a first pass, when requested with `-U` option, in a `.segmented.vdj.fa` file.
The goal of this ultra-fast segmentation, based on a seed
The goal of this ultra-fast analysis, based on a seed
heuristics, is only to identify the locus and to locate the w-window overlapping the
CDR3. This should not be taken as a real V(D)J designation, as
the center of the window may be shifted up to 15 bases from the
......@@ -686,7 +686,8 @@ in the following situations:
- at the end of the clones detection (default command `-c clones`,
on a number of clones limited by the `-z` option)
- or directly when explicitly requiring segmentation (`-c segment`)
- or directly when explicitly requiring V(D)J designation for each read
(`-c designations`)
These V(D)J designations are obtained by full comparison (dynamic programming)
with all germline sequences.
......@@ -698,7 +699,7 @@ in the following situations:
To check the quality of these designations, the automated test suite include
sequences with manually curated V(D)J designations (see [should-vdj.org](http://git.vidjil.org/blob/master/doc/should-vdj.org)).
Segmentations of V(D)J recombinations are displayed using a dedicated
Designations of V(D)J recombinations are displayed using a dedicated
`.vdj` format. This format is compatible with FASTA format. A line starting
with a \> is of the following form:
......@@ -707,7 +708,7 @@ with a \> is of the following form:
name sequence name (include the number of occurrences in the read set and possibly other information)
+ strand on which the sequence is mapped
VDJ type of segmentation (can be "VJ", "VDJ", "VDDJ", "53"...
VDJ type of designation (can be "VJ", "VDJ", "VDDJ", "53"...
or shorter tags such as "V" for incomplete sequences).
The following line are for "VDJ" recombinations :
......@@ -804,9 +805,9 @@ This file will be relatively small (a few kB or MB) and can be taken again as an
```
``` bash
./vidjil-algo -c segment -g germline/homo-sapiens.g -2 -3 -d -x 50 demo/Stanford_S22.fasta
./vidjil-algo -c designations -g germline/homo-sapiens.g -2 -3 -d -x 50 demo/Stanford_S22.fasta
# Detailed V(D)J designation, including multiple D, and CDR3 detection on the first 50 reads, without clone clustering
# (this is slow and should only be used for testing, or on a small file)
# (this is not as efficient as '-c clones')
```
``` bash
......
......@@ -55,7 +55,7 @@ def segment_sequences(sequences):
germline_folder = defs.DIR_VIDJIL + '/germline/'
## config de vidjil
config = '-c segment -2 -3 -g germline'
config = '-c designations -2 -3 -g germline'
config = config.replace( ' germline' ,germline_folder)
## commande complete
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment