Commit 9ad3f80f authored by Mikaël Salson

Merge branch 'feature-a/4386-out-clone-files' into 'dev'

Feature a/4386 --out-clone-files

Closes #4386

See merge request !773
parents c8f26ed9 3dd35e59
Pipeline #160272 passed with stages
in 11 minutes and 2 seconds
......@@ -37,4 +37,4 @@ $ Display advanced options
: custom Cost
$ Correct number of options, including advanced options
58:^..-
59:^..-
......@@ -14,6 +14,9 @@ $ There are no clones in the .vidjil
$ Correct output message
1:only metadata
$ There is no clone output in individual files
0:detail, by clone
!LAUNCH: rm out/Stanford_S22.tsv ; $LAUNCHER $VIDJIL_DIR/$EXEC $EXTRA -z 0 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta > /dev/null ; touch out/Stanford_S22.tsv ; cat out/Stanford_S22.tsv
$ The AIRR .tsv file has four lines
......
!LAUNCH: $VIDJIL_DIR/$EXEC -x 100 -r 1 -y 1 -z 0 -w 10 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta 2>&1
!LAUNCH: $VIDJIL_DIR/$EXEC -x 100 -r 1 -y 1 -z 0 -w 10 --out-clone-files -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta 2>&1
!OUTPUT_FILE: out/seq/clone.fa-1
### !EXIT_CODE: 1
......
......@@ -97,6 +97,7 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
// Fixed filenames/suffixes
// Suffix of the file gathering all clones (presumably appended to the output basename -- confirm).
#define CLONES_FILENAME ".vdj.fa"
// Sub-directory of the output directory holding the per-clone files.
// NOTE: the value already ends with '/', so no extra separator is needed after it.
#define CLONE_DIR "seq/"
// Prefix of each per-clone file; the clone number is appended to it.
#define CLONE_FILENAME "clone.fa-"
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".detected.vdj.fa"
......@@ -566,7 +567,7 @@ int main (int argc, char **argv)
-> group(group);
bool output_sequences_by_cluster = false;
app.add_flag("--out-reads", output_sequences_by_cluster, "output all reads by clones (" CLONE_FILENAME "*), to be used only on small datasets") -> group(group);
// --out-reads: dump all reads of each clone into the per-clone files.
// CLONE_DIR ("seq/") already ends with '/', so it is concatenated directly with
// CLONE_FILENAME; adding another "/" made the help text show "seq//clone.fa-*".
app.add_flag("--out-reads", output_sequences_by_cluster, "output all reads by clones (in " CLONE_DIR CLONE_FILENAME "* files), to be used only on small datasets") -> group(group);
bool output_affects = false;
app.add_flag("--out-affects,-K", output_affects,
......@@ -586,6 +587,11 @@ int main (int argc, char **argv)
bool out_gz = false;
app.add_flag("--gz", out_gz, "output compressed .tsv.gz, .vdj.fa.gz, and .vidjil.gz files") -> group(group) -> level();
// --out-clone-files: write one file per clone under CLONE_DIR (off by default).
bool output_clone_files = false;
// CLONE_DIR ("seq/") already ends with '/', so it is concatenated directly with
// CLONE_FILENAME; adding another "/" made the help text show "seq//clone.fa-*".
app.add_flag("--out-clone-files", output_clone_files,
             "output clones in individual files (in " CLONE_DIR CLONE_FILENAME "* files)")
  -> group(group) -> level();
bool no_airr = false;
bool no_vidjil = false;
app.add_flag("--no-airr", no_airr, "do not output AIRR .tsv") -> group(group) -> level();
......@@ -700,7 +706,7 @@ int main (int argc, char **argv)
for(string lab : windows_labels_explicit)
windows_labels[lab] = string("--label");
string out_seqdir = out_dir + "/seq/" ;
string out_seqdir = out_dir + "/" + CLONE_DIR ;
if (verbose)
cout << "# verbose " << verbose << endl ;
......@@ -731,10 +737,16 @@ int main (int argc, char **argv)
return 2;
}
const char *outseq_cstr = out_seqdir.c_str();
if (mkpath(outseq_cstr, 0755) == -1) {
cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
return 2;
if (output_sequences_by_cluster)
output_clone_files = true;
if (output_clone_files)
{
const char *outseq_cstr = out_seqdir.c_str();
if (mkpath(outseq_cstr, 0755) == -1) {
cerr << ERROR_STRING << "Directory creation: " << out_seqdir << endl; perror("");
return 2;
}
}
// Compute basename if not given as an option
......@@ -1305,8 +1317,11 @@ int main (int argc, char **argv)
cout << " ==> " << f_clones << " \t(for post-processing with other software)" << endl ;
ostream* out_clones = new_ofgzstream(f_clones.c_str(), out_gz) ;
cout << " ==> " << out_seqdir + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ;
cout << endl ;
if (output_clone_files)
{
cout << " ==> " << out_seqdir + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ;
cout << endl ;
}
global_interrupted = false;
signal(SIGINT, sigintHandler);
......@@ -1391,15 +1406,21 @@ int main (int argc, char **argv)
//$$ Open CLONE_FILENAME
ofstream *out_clone = NULL;
string clone_file_name = out_seqdir+ CLONE_FILENAME + string_of_int(num_clone) ;
ofstream out_clone(clone_file_name.c_str());
if (output_clone_files)
{
string clone_file_name = out_seqdir+ CLONE_FILENAME + string_of_int(num_clone) ;
out_clone = new ofstream(clone_file_name.c_str());
*out_clone << window_str ;
}
//$$ Output window
if (clone_on_stdout)
{
cout << window_str ;
out_clone << window_str ;
}
//$$ Compute a representative sequence
// Display statistics on auditionned sequences
......@@ -1465,6 +1486,13 @@ int main (int argc, char **argv)
if (clone_on_stdout)
cout << representative << endl ;
*out_clones << representative << endl ;
if (output_clone_files)
{
out_clone->close();
delete out_clone;
}
continue;
}
......@@ -1490,8 +1518,11 @@ int main (int argc, char **argv)
// to stdout, CLONES_FILENAME, and CLONE_FILENAME-*
if (clone_on_stdout)
cout << seg << endl ;
out_clone << seg << endl ;
*out_clones << seg << endl ;
if (output_clone_files)
*out_clone << seg << endl ;
*out_clones << seg << endl ;
seg.toOutput(clone);
......@@ -1519,13 +1550,16 @@ int main (int argc, char **argv)
}
// Output best V, (D) and J germlines to CLONE_FILENAME-*
if (output_clone_files)
{
if ((segmented_germline->seg_method == SEG_METHOD_53) || (segmented_germline->seg_method == SEG_METHOD_543))
out_clone << ">" << seg.box_V->ref_label << endl << seg.box_V->ref << endl ;
*out_clone << ">" << seg.box_V->ref_label << endl << seg.box_V->ref << endl ;
if ((segmented_germline->seg_method == SEG_METHOD_543) || (segmented_germline->seg_method == SEG_METHOD_ONE))
out_clone << ">" << seg.box_D->ref_label << endl << seg.box_D->ref << endl ;
*out_clone << ">" << seg.box_D->ref_label << endl << seg.box_D->ref << endl ;
if ((segmented_germline->seg_method == SEG_METHOD_53) || (segmented_germline->seg_method == SEG_METHOD_543))
out_clone << ">" << seg.box_J->ref_label << endl << seg.box_J->ref << endl ;
out_clone << endl;
*out_clone << ">" << seg.box_J->ref_label << endl << seg.box_J->ref << endl ;
*out_clone << endl;
}
} // end if (seg.isSegmented())
seg.checkWarnings(clone, clone_on_stdout);
......@@ -1536,13 +1570,19 @@ int main (int argc, char **argv)
for (list<Sequence>::const_iterator itt = sequences.begin(); itt != sequences.end(); ++itt)
{
out_clone << *itt ;
*out_clone << *itt ;
}
}
if (clone_on_stdout)
cout << endl ;
out_clone.close();
if (output_clone_files)
{
out_clone->close();
delete out_clone;
}
} // end for clones
signal(SIGINT, SIG_DFL);
......
......@@ -514,7 +514,8 @@ ATAGTAGTGGTTATTACGGGGTAGGGCAGTACTACTACTACTACATGGAC
Windows of size 50 (modifiable by `-w`) have been extracted.
The first window has 8 occurrences, the second window has 5 occurrences.
The `out/seq/clone.fa-*` contains the detailed analysis by clone, with
With the `--out-clone-files` option, one `out/seq/clone.fa-*` file is created for each clone.
It contains the detailed analysis by clone, with
the window, the consensus sequence, as well as the most similar V, (D) and J germline genes:
``` diff
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment