Attention une mise à jour du serveur va être effectuée le lundi 17 mai entre 13h et 13h30. Cette mise à jour va générer une interruption du service de quelques minutes.

Commit 2d1237b5 authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/4642-stdin' into 'dev'

read data from stdin, check nb_reads_for_evalue

Closes #4642

See merge request !887
parents fe306e43 3183a27e
Pipeline #210989 failed with stages
in 9 minutes and 12 seconds
......@@ -29,6 +29,7 @@
#include <list>
#include <stdexcept>
#define STDIN_FILENAME "-"
#define SAMPLE_APPROX_NB_SEQUENCES 200
using namespace std;
......
......@@ -49,7 +49,10 @@ OnlineFasta::~OnlineFasta() {
}
void OnlineFasta::init() {
if (! filename.empty()) {
if (filename == STDIN_FILENAME) {
input = &cin;
}
else if (! filename.empty()) {
input = new igzstream(filename.c_str());
if (this->input->fail()) {
......
......@@ -3,6 +3,7 @@
#define W09_INTERRUPTED "W09" // Program interrupted, output data may be not complete
#define W20_VERY_FEW_RECOMBINATIONS "W20" // Very few V(D)J recombinations found
#define W21_DOUBTFUL_MULTIPLIER "W21" // Doubtful e-value multiplier
#define W50_WINDOW "W50" // Short or shifted window
#define W51_LOW_COVERAGE "W51" // Low coverage
......
!LAUNCH: cat ../should-vdj-tests/Demo-X5.should-vdj.fa | $LAUNCHER $VIDJIL_DIR/$EXEC --all -g $VIDJIL_DIR/germline/homo-sapiens.g:TRG -
$ Display message with the estimation of read number
1:reading from stdin, estimating 100 reads
$ Detect recombinations in one read
1:junction detected in 1 reads
$ Detect and designate one read on TRG
1:TRG .* -> .* 1
1:TRGV10.* ./AGAC/3 TRGJP1
# When --read-number gives a bad estimation, there is a warning
!LAUNCH: cat ../should-vdj-tests/Demo-X5.should-vdj.fa | $LAUNCHER $VIDJIL_DIR/$EXEC --read-number 1 --all -g $VIDJIL_DIR/germline/homo-sapiens.g:TRG -
$ Display message with the estimation of read number
1:reading from stdin, estimating 1 reads
$ Detect and designate at least one read on TRG
1:TRG .* -> .* [1-9]
1:TRGV10.* ./AGAC/3 TRGJP1
$ Warning
1:.W21. Bad e-value multiplier
......@@ -130,6 +130,7 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_SEGMENT_COST VDJ
#define DEFAULT_TRIM 0
#define DEFAULT_STDIN_READ_NB 100
#define MAX_CLONES_FOR_SIMILARITY 20
......@@ -138,6 +139,7 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define WARN_PERCENT_SEGMENTED 40
#define WARN_COVERAGE 0.6
#define WARN_NUM_CLONES_SIMILAR 10
#define WARN_RATIO_NB_READS 5
// display
#define CLONES_ON_STDOUT 50
......@@ -253,6 +255,7 @@ int main (int argc, char **argv)
- FASTQ (.fq/.fastq, .fq.gz/.fastq.gz)
- BAM (.bam)
Paired-end reads should be merged before given as an input to vidjil-algo.
Uncompressed FASTA/FASTQ reads can be given from standard input with ')Z" STDIN_FILENAME R"Z('.
)Z")
-> required() -> type_name("");
......@@ -289,6 +292,11 @@ int main (int argc, char **argv)
"maximal number of reads to process ('" NO_LIMIT "': no limit, default), sampled reads")
-> group(group) -> transform(string_NO_LIMIT);
long long force_read_number = NO_LIMIT_VALUE;
app.add_option("--read-number", force_read_number,
"estimate for read number used in e-value computation (default: '" NO_LIMIT "', count all reads, but takes "
+ string_of_int(DEFAULT_STDIN_READ_NB) + " for stdin)")
-> group(group) -> level() -> transform(string_NO_LIMIT);
// ----------------------------------------------------------------------------------------------------------------------
group = "Germline/recombination selection (at least one -g or -V/(-D)/-J option must be given)";
......@@ -726,6 +734,15 @@ int main (int argc, char **argv)
cout << "# using default sequence file: " << f_reads << endl ;
}
bool reads_stdin = (f_reads == STDIN_FILENAME);
if (reads_stdin)
{
if (force_read_number == NO_LIMIT_VALUE)
force_read_number = DEFAULT_STDIN_READ_NB ;
cout << "# reading from stdin, estimating " << force_read_number << " reads" << endl ;
}
size_t min_cover_representative = (size_t) min(min_reads_clone, DEFAULT_MIN_COVER_REPRESENTATIVE);
// Check seed buffer
......@@ -960,7 +977,9 @@ int main (int argc, char **argv)
cout << endl ;
// Number of reads for e-value computation
unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
unsigned long long nb_reads_for_evalue = (expected_value == NO_LIMIT_VALUE) ? 1
: (force_read_number > NO_LIMIT_VALUE) ? force_read_number
: nb_sequences_in_file(f_reads, true);
if (expected_value_kmer == NO_LIMIT_VALUE)
{
......@@ -1159,6 +1178,12 @@ int main (int argc, char **argv)
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
if ((float) nb_total_reads / (float) nb_reads_for_evalue > WARN_RATIO_NB_READS)
{
output.add_warning(W21_DOUBTFUL_MULTIPLIER, "Bad e-value multiplier.", LEVEL_WARN);
cout << " ! The estimated number of reads was far below the actual number of reads" << endl ;
cout << " ! There may be false positives, you should run with an higher --read-number" << endl ;
}
//$$ Display statistics on segmentation causes
......
......@@ -150,24 +150,47 @@ make -C src test # run self-tests (can take 5 to 60 minutes)
# Input and parameters
The `-h` and `-H` help options provide the list of parameters that can be
used. We detail here the options of the main `-c clones` command.
The default options are very conservative (large window, no further
automatic clusterization, see below), leaving the user or other
software making detailed analysis and decisions on the final
clustering.
### Input selection
```
Positionals
reads_file reads file, in one of the following formats:
- FASTA (.fa/.fasta, .fa.gz/.fasta.gz)
- FASTQ (.fq/.fastq, .fq.gz/.fastq.gz)
- BAM (.bam)
Paired-end reads should be merged before given as an input to vidjil-algo.
Uncompressed FASTA/FASTQ reads can be given from standard input with '-'.
Input
-x, --first-reads INT maximal number of reads to process ('all': no limit, default), only first reads
-X, --sampled-reads INT maximal number of reads to process ('all': no limit, default), sampled reads
```
The main input file of Vidjil-algo is a *set of reads*, given as a `.fasta`
or `.fastq` file, possibly compressed with gzip (`.gz`).
This set of reads can reach several gigabytes and 2\*10<sup>9</sup> reads. It is
never loaded entirely in the memory, but reads are processed one by
one by Vidjil-algo.
FASTA/FASTQ reads can also be given on the standard input by giving `-` instead of a file.
Vidjil-algo can also process BAM files, but please note that:
1. The reads don't need to be aligned beforehand.
2. In case of paired-end sequencing, the reads must have already been merged
in the BAM file.
The `-h` and `-H` help options provide the list of parameters that can be
used. We detail here the options of the main `-c clones` command.
The default options are very conservative (large window, no further
automatic clusterization, see below), leaving the user or other
software making detailed analysis and decisions on the final
clustering.
The `--first-reads` option restricts the analysis on a few sequences, for example to probe a large file or to test some parameters.
However, read files may be not homogeneous, with biais in the sequences at the start of the file.
The `--sampled-reads` option rather considers *regularly sampled sequences* from the file.
It is thus generally safe to run `--sampled-reads 1000` to have a fast insight of what there is in some data.
## Germline presets: locus and recombination selection
......@@ -358,10 +381,6 @@ The default is `--trim 0`.
The following options control how many clones are output and analyzed.
``` diff
Input
-x, --first-reads INT maximal number of reads to process ('all': no limit, default), only first reads
-X, --sampled-reads INT maximal number of reads to process ('all': no limit, default), sampled reads
Limits to report and to analyze clones (second pass)
-r, --min-reads INT=5 minimal number of reads supporting a clone
--min-ratio FLOAT=0 minimal percentage of reads supporting a clone
......
......@@ -23,6 +23,7 @@ Warnings which were implemented ([x]) have a fixed code that should not be chang
## Output of an analysis, global warnings
- [x] W20 Very few V(D)J recombinations found (0.7%)
- [x] W21 Doubtful e-value multiplier
- [ ] W2x Sequences with known adapters #1669
- [ ] W2x CDR3 detection without gapped germlines #2187 (ou bien par clone ?)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment