Commit 756ef3c2 authored by Mikaël Salson's avatar Mikaël Salson

Vidjil: change output filenames

* .data files are renamed to .vidjil
* by default output filenames have the same basename as the input file

Documentation and tests have been updated accordingly
parent 5b1b015d
!LAUNCH: ../../vidjil -G ../../germline/IGH -r 5 -d ../../data/Stanford_S22.fasta ; cat out/vidjil.data | sh format-json.sh
!LAUNCH: ../../vidjil -G ../../germline/IGH -r 5 -d -b data ../../data/Stanford_S22.fasta ; cat out/data.vidjil | sh format-json.sh
$ Number of reads
1:"total" : [ 13153 ] ,
......
!LAUNCH: ../../vidjil -G ../../germline/IGH -d ../../data/Stanford_S22.fasta ; python ../../server/fuse.py out/vidjil.data out/vidjil.data -o out/fused.data ; cat out/fused.data | sh format-json.sh
!LAUNCH: ../../vidjil -G ../../germline/IGH -d ../../data/Stanford_S22.fasta ; python ../../server/fuse.py out/Stanford_S22.vidjil out/Stanford_S22.vidjil -o out/fused.data ; cat out/fused.data | sh format-json.sh
$ Points list
f1:"point": [ "", "" ]
......
......@@ -85,14 +85,14 @@
enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define OUT_DIR "./out/"
#define CLONES_FILENAME "clones.vdj.fa"
#define CLONES_FILENAME ".vdj.fa"
#define CLONE_FILENAME "clone.fa-"
#define WINDOWS_FILENAME "windows.fa"
#define SEGMENTED_FILENAME "segmented.vdj.fa"
#define UNSEGMENTED_FILENAME "unsegmented.fa"
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define JSON_SUFFIX ".data"
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
#define UNSEGMENTED_FILENAME ".unsegmented.fa"
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
// "tests/data/leukemia.fa"
......@@ -170,7 +170,7 @@ void usage(char *progname)
<< " -l <file> labels for some windows -- these windows will be kept even if some limits are not reached" << endl
<< endl
<< "Additional clustering (not output in vidjil.data and therefore not used in the browser)" << endl
<< "Additional clustering (not output in .vidjil file and therefore not used in the browser)" << endl
<< " -e <file> manual clustering -- a file used to force some specific edges" << endl
<< " -n <int> maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
<< " -N <int> minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
......@@ -198,12 +198,12 @@ void usage(char *progname)
<< endl
<< "Debug" << endl
<< " -U output segmented (" << SEGMENTED_FILENAME << ") sequences" << endl
<< " -u output unsegmented (" << UNSEGMENTED_FILENAME << ") sequences" << endl
<< " -U output segmented (in " << SEGMENTED_FILENAME << " file) sequences" << endl
<< " -u output unsegmented (in " << UNSEGMENTED_FILENAME << " file) sequences" << endl
<< " and display detailed k-mer affectation both on segmented and on unsegmented sequences" << endl
<< "Output" << endl
<< " -o <dir> output directory (default: " << OUT_DIR << ")" << endl
<< " -p <string> prefix output filenames by the specified string" << endl
<< " -b <string> output basename (by default basename of the input file)" << endl
<< " -a output all sequences by cluster (" << CLONE_FILENAME << "*), to be used only on small datasets" << endl
<< " -x do not compute representative sequences" << endl
......@@ -242,7 +242,7 @@ int main (int argc, char **argv)
string f_rep_J = DEFAULT_J_REP ;
string f_reads = DEFAULT_READS ;
string seed = DEFAULT_SEED ;
string prefix_filename = "";
string f_basename = "";
string out_dir = OUT_DIR;
......@@ -297,7 +297,7 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "Ahag:G:V:D:J:k:r:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:L%:y:z:uU")) != EOF)
while ((c = getopt(argc, argv, "Ahag:G:V:D:J:k:r:vw:e:C:t:l:dc:m:M:N:s:b:Sn:o:L%:y:z:uU")) != EOF)
switch (c)
{
......@@ -393,8 +393,8 @@ int main (int argc, char **argv)
out_dir = optarg ;
break;
case 'p':
prefix_filename = optarg;
case 'b':
f_basename = optarg;
break;
// Limits
......@@ -562,6 +562,11 @@ int main (int argc, char **argv)
exit(2);
}
// Compute basename if not given as an option
if (f_basename == "") {
f_basename = extract_basename(f_reads);
}
out_dir += "/" ;
/// Load labels ;
......@@ -813,14 +818,14 @@ int main (int argc, char **argv)
WindowExtractor we;
if (output_segmented) {
string f_segmented = out_dir + prefix_filename + SEGMENTED_FILENAME ;
string f_segmented = out_dir + f_basename + SEGMENTED_FILENAME ;
cout << " ==> " << f_segmented << endl ;
out_segmented = new ofstream(f_segmented.c_str());
we.setSegmentedOutput(out_segmented);
}
if (output_unsegmented) {
string f_unsegmented = out_dir + prefix_filename + UNSEGMENTED_FILENAME ;
string f_unsegmented = out_dir + f_basename + UNSEGMENTED_FILENAME ;
cout << " ==> " << f_unsegmented << endl ;
out_unsegmented = new ofstream(f_unsegmented.c_str());
we.setUnsegmentedOutput(out_unsegmented);
......@@ -880,7 +885,7 @@ int main (int argc, char **argv)
//$$ Output windows
//////////////////////////////////
string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME;
string f_all_windows = out_dir + f_basename + WINDOWS_FILENAME;
cout << " ==> " << f_all_windows << endl ;
ofstream out_all_windows(f_all_windows.c_str());
......@@ -923,7 +928,7 @@ int main (int argc, char **argv)
if (load_comp==1)
{
comp.load((out_dir+prefix_filename + comp_filename).c_str());
comp.load((out_dir+f_basename + "." + comp_filename).c_str());
}
else
{
......@@ -932,7 +937,7 @@ int main (int argc, char **argv)
if (save_comp==1)
{
comp.save(( out_dir+prefix_filename + comp_filename).c_str());
comp.save(( out_dir+f_basename + "." + comp_filename).c_str());
}
clones_windows = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
......@@ -983,16 +988,16 @@ int main (int argc, char **argv)
int num_clone = 0 ;
int clones_without_representative = 0 ;
ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
ofstream out_edges((out_dir+f_basename + EDGES_FILENAME).c_str());
int nb_edges = 0 ;
cout << " ==> suggested edges in " << out_dir+ prefix_filename + EDGES_FILENAME
cout << " ==> suggested edges in " << out_dir+ f_basename + EDGES_FILENAME
<< endl ;
string f_clones = out_dir + prefix_filename + CLONES_FILENAME ;
string f_clones = out_dir + f_basename + CLONES_FILENAME ;
cout << " ==> " << f_clones << " \t(main result file)" << endl ;
ofstream out_clones(f_clones.c_str()) ;
cout << " ==> " << out_seqdir + prefix_filename + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ;
cout << " ==> " << out_seqdir + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ;
cout << endl ;
......@@ -1043,7 +1048,7 @@ int main (int argc, char **argv)
//$$ Open CLONE_FILENAME
string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
string clone_file_name = out_seqdir+ CLONE_FILENAME + string_of_int(num_clone) ;
ofstream out_clone(clone_file_name.c_str());
......@@ -1201,7 +1206,7 @@ int main (int argc, char **argv)
} // end if (command == CMD_CLONES)
//$$ .json output: json_data_segment
string f_json = out_dir + prefix_filename + "vidjil" + JSON_SUFFIX ; // TODO: retrieve basename from f_reads instead of "vidjil"
string f_json = out_dir + f_basename + JSON_SUFFIX ;
cout << " ==> " << f_json << "\t(data file for the browser)" << endl ;
ofstream out_json(f_json.c_str()) ;
......
......@@ -138,7 +138,7 @@ than 100 clones is often not useful since they can't be visualized easily
in the browser, and takes large computation time.
Note that even if a clone is not in the top 20 (or 50, or 100) but
still passes the =-r=, =-%= options, it is still reported in the .data
still passes the =-r=, =-%= options, it is still reported in the .vidjil
file. If the clone is at some MRD point in the top 20 (or 50, or 100),
it will be fully analyzed/segmented by this other point (and then
collected by the =fuse.py= script, using representatives computed at this
......@@ -185,9 +185,9 @@ require the =-G germline/IGH= and the =-d= options.
#+BEGIN_SRC sh
./vidjil -G germline/IGH -d data/Stanford_S22.fasta
# Extract (with an ultra-fast heuristic) all windows
# Summary of windows is available in out/vidjil.data
# ('.data' format, see below)
# To have detailed/debug results in out/segmented.vdj.fa
# Summary of windows is available in out/Stanford_S22.vidjil
# (for the '.vidjil' format, see below)
# To have detailed/debug results in out/Stanford_S22.segmented.vdj.fa
# (which is a FASTA file embedding heuristic information
in the headers, '.vdj' format, see warning below)
# run Vidjil with option '-U'
......@@ -212,10 +212,10 @@ CTATGATAGTAGTGGTTATTACGGGGTAGGGCAGTACTACTACTACTACATGGACGTCTG
# For debug purpose, if one wants all the clones, use the option -A.
# Results are both
# - on the standard output
# - in out/clones.vdj.fa (fasta file to be processed by other tools)
# - in out/vidjil.data (for the browser)
# Additional files are in out/seq/windows.fa-* and out/seq/clone.fa-*
# If one adds the '-U' option, an additonal out/segmented.vdj.fa file is produced,
# - in out/clones_simul.vdj.fa (fasta file to be processed by other tools)
# - in out/clones_simul.vidjil (for the browser)
# Additional files are in out/clones_simul.windows.fa and out/seq/clone.fa-*
# If one adds the '-U' option, an additional out/clones_simul.segmented.vdj.fa file is produced,
# listing segmented reads using the .vdj format (see below)
#+END_SRC
......@@ -242,7 +242,7 @@ CTATGATAGTAGTGGTTATTACGGGGTAGGGCAGTACTACTACTACTACATGGACGTCTG
Vidjil output includes segmentation of V(D)J recombinations. This happens
in the following situations:
- in a first pass, when requested with =-U= option, in =segmented.vdj.fa= file.
- in a first pass, when requested with =-U= option, in a =.segmented.vdj.fa= file.
The goal of this ultra-fast segmentation, based on a seed
heuristics, is only to locate the w-window overlapping the
......@@ -251,7 +251,8 @@ in the following situations:
actual center.
- in a second pass, on the standard output
- at the end of the clones detection (=-c clones=, also in in =clones.vdj.fa=)
- at the end of the clones detection (=-c clones=, also in
=basename.vdj.fa=, where =basename= is the basename of the input file)
- or directly when explicitly requiring segmentation (=-c segment=)
This segmentation obtained by full comparison (dynamic
......@@ -293,7 +294,7 @@ with a > is of the following form:
Jgene name of the J gene being rearranged
comments optional comments. In Vidjil, the following comments are now used:
- "seed" when this comes for the first pass (segmented.vdj.fa). See the warning above.
- "seed" when this comes from the first pass (.segmented.vdj.fa). See the warning above.
- "!ov x" when there is an overlap of x bases between last V seed and first J seed
#+END_EXAMPLE
......@@ -306,7 +307,7 @@ applicable being removed:
>name + VJ startV endV startJ endJ Vgene delV/N1/delJ Jgene comments
* vidjil.data .json format and web interface
* .vidjil and .json format and web interface
A summary of extracted windows is also available in a JSON format,
including, for each windows, the number of reads sharing this window.
......
......@@ -18,17 +18,17 @@ The Vidjil browser runs in any modern browser. It has been successfully tested o
- Opera version >= XX
- Safari version >= XX
** The .data files
** The .vidjil files
The vidjil browser displays .data files that summarize the V(D)J
The vidjil browser displays .vidjil files that summarize the V(D)J
rearrangements and the sequences found in a run. Such files can be
obtained:
- by sending us your .fasta/.fastq files, either through
http://www.vidjil.org/submit.html or using any other method
(e.g. your own FTP)
- from the command-line version of Vidjil (starting from
.fasta/.fastq files, see doc/alog.org in the command-line version).
To gather several .data files, you have to use the [[../server/fuse.py][fuse.py]] script
.fasta/.fastq files, see doc/algo.org in the command-line version).
To gather several .vidjil files, you have to use the [[../server/fuse.py][fuse.py]] script
- in a next release (start of 2015), you will be able to upload,
manage and process your runs (.fasta/.fastq files) directly on the browser (with
authentication to ensure that you keep the control on your data).
......@@ -209,13 +209,13 @@ The browser can be opened on a data file specified from a =data= attribute,
and optionally on an analysis file specified from a =analysis= attribute,
as in the following URLs on our test server:
- http://rbx.vidjil.org/browser/?data=test.data
- http://rbx.vidjil.org/browser/?data=test.data&analysis=test.analysis
- http://rbx.vidjil.org/browser/?data=http://rbx.vidjil.org/browser/test.data
- http://rbx.vidjil.org/browser/?data=test.vidjil
- http://rbx.vidjil.org/browser/?data=test.vidjil&analysis=test.analysis
- http://rbx.vidjil.org/browser/?data=http://rbx.vidjil.org/browser/test.vidjil
Both GET and POST requests are accepted.
Note that the =browser/index.html= file and the =.data/.analysis= files should be hosted on the same server.
Otherwise, the server hosting the =.data/.analysis= files must accept cross-domain queries.
Note that the =browser/index.html= file and the =.vidjil/.analysis= files should be hosted on the same server.
Otherwise, the server hosting the =.vidjil/.analysis= files must accept cross-domain queries.
* Reference
......
#+TITLE: .analysis and .data format
#+TITLE: .analysis and .vidjil format
#+AUTHOR: The Vidjil team
The .analysis and the .data files share a common [[http://en.wikipedia.org/wiki/JSON][.json]] format
The .analysis and the .vidjil files share a common [[http://en.wikipedia.org/wiki/JSON][.json]] format
The .data file represents the actual data on clones (and that can
The .vidjil file represents the actual data on clones (and that can
reach megabytes).
The .analysis file describes customizations done by the user
......@@ -12,7 +12,7 @@ can load or save such files (and possibly from/to the server).
It is intended to be very small (a few kilobytes).
All settings in the .analysis file override the settings that could be
present in the .data file.
present in the .vidjil file.
* Analysis file example
......@@ -96,7 +96,7 @@ present in the .data file.
* 'reads' element [.data only, required]
* 'reads' element [.vidjil only, required]
#+BEGIN_SRC js
{
......@@ -118,7 +118,7 @@ present in the .data file.
"number": 2, // number of samples [required]
"original_names": [], // original sample names (with samples.number elements) [required]
// the names in the .data file and in .analysis files must match
// the names in the .vidjil file and in .analysis files must match
"names": [], // custom sample names (with samples.number elements) [optional]
// These names are editable and will be used on the graphs
......@@ -138,7 +138,7 @@ present in the .data file.
Each element in the 'clones' list describes properties of a clone.
In a .data file, this is the main part, describing all clones.
In a .vidjil file, this is the main part, describing all clones.
In the .analysis file, this section is intended to describe some specific clones.
......@@ -149,15 +149,15 @@ In the .analysis file, this section is intended to describe some specific clones
"id": "", // clone identifier, must be unique [required]
// Vidjil/algo output -> the 'window'
// Brno .clntab -> clone sequence
// the clone identifier in the .data file and in .analysis file must match
// the clone identifier in the .vidjil file and in .analysis file must match
"germline": "" // [required for .data]
"germline": "" // [required for .vidjil]
// (should match a germline defined in germline/germline.data)
"name": "", // clone custom name [optional]
// (the default name, in .data, is computed from V/D/J information)
// (the default name, in .vidjil, is computed from V/D/J information)
"sequence": "", // reference nt sequence [required for .data]
"sequence": "", // reference nt sequence [required for .vidjil]
// (for .analysis, not really used now in the browser,
// for special clones/sequences that are known,
// such as standard/spikes or known patient clones)
......@@ -187,11 +187,11 @@ In the .analysis file, this section is intended to describe some specific clones
}
"reads": [], // number of reads in this clones [.data only, required]
"reads": [], // number of reads in this clone [.vidjil only, required]
// (with samples.number elements)
"top": 0,
"stats": [] // (not documented now) [.data only] (with sample.number elements)
"stats": [] // (not documented now) [.vidjil only] (with sample.number elements)
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment