Commit 799e8a40 authored by Vidjil Team's avatar Vidjil Team
Browse files

Merge branch 'master' of git+ssh://scm.gforge.inria.fr//gitroot/vidjil/vidjil into rbx.vidjil.org

parents c85b3e30 1df44a85
......@@ -75,7 +75,7 @@ cleanall: clean
RELEASE_TAG="notag"
RELEASE_H = $(VIDJIL_ALGO_SRC)/release.h
RELEASE_SOURCE = $(wildcard $(VIDJIL_ALGO_SRC)/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.h)
RELEASE_SOURCE = $(wildcard $(VIDJIL_ALGO_SRC)/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/cgi/*.cpp)
RELEASE_MAKE = ./Makefile $(VIDJIL_ALGO_SRC)/Makefile $(VIDJIL_ALGO_SRC)/core/Makefile $(VIDJIL_ALGO_SRC)/tests/Makefile germline/Makefile data/Makefile
RELEASE_TESTS = data/get-sequences $(wildcard data/*.fa) $(wildcard data/*.fq) $(VIDJIL_ALGO_SRC)/tests/should-to-tap.sh $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.should_get) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.fa) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.should_get) $(VIDJIL_ALGO_SRC)/tests/format-json.sh
RELEASE_FILES = $(RELEASE_SOURCE) $(RELEASE_TESTS) $(RELEASE_MAKE) germline/get-germline germline/split-from-imgt.py doc/algo.org doc/LICENSE data/segmentation.fasta
......
......@@ -16,20 +16,24 @@ patient follow-up.
High-throughput sequencing (NGS/HTS) now enables the deep sequencing
of a lymphoid population with dedicated [[http://omictools.com/rep-seq-c424-p1.html][Rep-Seq]] methods and software.
Vidjil processes high-throughput sequencing data to *extract V(D)J
The Vidjil platform contains three components. The Vidjil algorithm
processes high-throughput sequencing data to *extract V(D)J
junctions and gather them into clones*. Vidjil starts
from a set of reads and detects "windows" overlapping the actual CDR3.
This is based on a fast and reliable seed-based heuristic and allows
to output the most abundant clones. Vidjil can also clusterize similar
clones, or leave this to the user after a manual review.
to output all sequenced clones. The analysis is extremely fast
because, in the first phase, no alignment is performed with database
germline sequences.
Vidjil also contains a *dynamic browser* for visualization and
The Vidjil *dynamic browser* is made for the visualization and
analysis of clones and their tracking over time in an MRD setup or
in an immunological study. The browser can visualize data processed by
the algorithmic component of Vidjil or by other V(D)J analysis
pipelines.
the Vidjil algorithm or by other V(D)J analysis pipelines.
The browser enables further exploration of clusterings proposed
by software and/or done manually by the user.
Finally, a *server* is currently developed to link the browser and the
Finally, a *patient database* with a server
is currently being developed to link the browser and the
algorithmic part. The goal is that the clinicians will be able to
upload, manage and process their runs directly on the browser (with
authentication).
......@@ -49,7 +53,7 @@ authentication).
- Development code is under [[browser/]]
- Documentation (in progress): [[doc/browser.org]]
** The server
** The server and the patient database
- Currently under development, code is in [[server/]]
- Planned first release: Q1 2015
......
......@@ -908,7 +908,7 @@ span.logo {
float: left;
height: 100%;
width: 6px;
line-height: calc(99%);
line-height: calc(100% - 1px);
background: #333333;
cursor: pointer;
}
......@@ -923,22 +923,24 @@ span.logo {
cursor: pointer;
}
#db_div {
z-index: 3;
z-index: 5;
border: solid;
position: fixed;
top: 150px;
top: 20px;
left: 50% ;
width: 960px;
height: 450px;
width: 80%;
height: calc(100% - 50px);
font-size: 14px;
background: #000000;
display: none;
left: -moz-calc(50% - 480px);
left: -webkit-calc(50% - 480px);
left: calc(50% - 480px);
left: 10%;
}
/* Message area of the database panel (#db_msg sits inside #db_div in the page
   markup). Keeps a 15px margin on every side; the height is the containing
   block's height minus 30px (presumably the two vertical 15px margins -- confirm). */
#db_msg {
margin: 15px;
height: calc(100% - 30px);
}
/* Content wrapper of the database panel: its height is the containing block's
   height minus a fixed 150px (presumably reserved for the surrounding layout,
   e.g. header/menu -- TODO confirm against db_layout.html). */
#db_content {
height: calc(100% - 150px);
}
.db_table {
width: 100%;
......@@ -947,6 +949,8 @@ span.logo {
}
.db_table td {
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.db_table .column_200 {
width: 200px;
......@@ -1022,7 +1026,7 @@ span.logo {
#db_table_container {
overflow-y: auto;
overflow-x: hidden;
max-height: 275px;
height: 100%;
position: relative;
}
#db_fixed_header {
......
......@@ -908,7 +908,7 @@ span.logo {
float: left;
height: 100%;
width: 6px;
line-height: calc(99%);
line-height: calc(100% - 1px);
background: #cccccc;
cursor: pointer;
}
......@@ -923,22 +923,24 @@ span.logo {
cursor: pointer;
}
#db_div {
z-index: 3;
z-index: 5;
border: solid;
position: fixed;
top: 150px;
top: 20px;
left: 50% ;
width: 960px;
height: 450px;
width: 80%;
height: calc(100% - 50px);
font-size: 14px;
background: #ffffff;
display: none;
left: -moz-calc(50% - 480px);
left: -webkit-calc(50% - 480px);
left: calc(50% - 480px);
left: 10%;
}
/* Message area of the database panel (#db_msg sits inside #db_div in the page
   markup). Keeps a 15px margin on every side; the height is the containing
   block's height minus 30px (presumably the two vertical 15px margins -- confirm). */
#db_msg {
margin: 15px;
height: calc(100% - 30px);
}
/* Content wrapper of the database panel: its height is the containing block's
   height minus a fixed 150px (presumably reserved for the surrounding layout,
   e.g. header/menu -- TODO confirm against db_layout.html). */
#db_content {
height: calc(100% - 150px);
}
.db_table {
width: 100%;
......@@ -947,6 +949,8 @@ span.logo {
}
.db_table td {
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.db_table .column_200 {
width: 200px;
......@@ -1022,7 +1026,7 @@ span.logo {
#db_table_container {
overflow-y: auto;
overflow-x: hidden;
max-height: 275px;
height: 100%;
position: relative;
}
#db_fixed_header {
......
......@@ -1052,7 +1052,7 @@ span.logo
float: left;
height: 100%;
width: @margin;
line-height: calc(100% - 1px);
line-height: calc(~"100% - 1px");
background : @border;
cursor: pointer;
}
......@@ -1068,24 +1068,28 @@ span.logo
}
#db_div{
z-index:3;
z-index:5;
border:solid;
position: fixed;
top: 150px;
top: 20px;
left:50% ;
width:960px;
height :450px;
width:80%;
height: calc(~"100% - 50px");
font-size: 14px;
background: @background;
display:none;
left: -moz-calc(~"50% - 480px");
left: -webkit-calc(~"50% - 480px");
left: calc(~"50% - 480px");
left: 10%;
}
/* Message area of the database panel: 15px margin on every side; height is the
   containing block's height minus 30px (presumably the two vertical margins --
   confirm). The ~"..." escape passes the expression through verbatim so that
   LESS does not evaluate the subtraction itself and the browser's calc() does. */
#db_msg{
margin : 15px;
height : calc(~"100% - 30px")
}
/* Content wrapper of the database panel: height is the containing block's
   height minus a fixed 150px (presumably reserved for the surrounding layout --
   TODO confirm). The ~"..." escape keeps the expression intact for the
   browser's calc() instead of letting LESS compute it. */
#db_content{
height : calc(~"100% - 150px");
}
.db_table{
width: 100%;
border-collapse: collapse;
......@@ -1094,6 +1098,8 @@ span.logo
.db_table td{
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.db_table .column_200{ width: 200px; }
......@@ -1178,9 +1184,9 @@ span.logo
}
#db_table_container{
overflow-y: auto;
overflow-y: scroll;
overflow-x: hidden;
max-height: 275px;
height: 100%;
position: relative;
}
......
......@@ -323,8 +323,7 @@
<div id="db_div">
<span class="closeButton" onclick="db.close()">X</span>
<div id="db_msg">
</div>
<div id="db_msg"></div>
</div>
<div id="flash_container"></div>
......
......@@ -192,7 +192,7 @@ Model.prototype = {
loadAnalysisUrl: function (url) {
var self = this;
var url2 = url.replace(".vidjil",".analysis");
var url2 = url.replace(new RegExp(".vidjil" + '$'), ".analysis")
var url_split = url2.split('/')
......
#+TITLE: Vidjil -- Algo Manual
#+TITLE: Vidjil Algorithm -- Command-line Manual
#+AUTHOR: The Vidjil team (Mathieu, Mikaël and Marc)
# Vidjil -- V(D)J recombinations analysis -- [[http://www.vidjil.org]]
......@@ -14,10 +14,11 @@ Vidjil processes high-throughput sequencing data to extract V(D)J
junctions and gather them into clones. Vidjil starts
from a set of reads and detects "windows" overlapping the actual CDR3.
This is based on a fast and reliable seed-based heuristic and allows
to output the most abundant clones. The analysis is extremely fast
to output all sequenced clones. The analysis is extremely fast
because, in the first phase, no alignment is performed with database
germline sequences. Vidjil can also cluster similar
clones, or leave this to the user after a manual review.
germline sequences. At the end, only the representative sequences
of each clone have to be analyzed. Vidjil can also cluster similar
clones, or leave this to the user after a manual review in the browser.
The method is described in the following paper:
......@@ -40,6 +41,7 @@ Vidjil has been successfully tested on the following platforms :
- Ubuntu 12.04 amd64
- Ubuntu 12.04 i386
Moreover, the continuous integration of Vidjil can be checked on [[https://travis-ci.org/magiraud/vidjil][travis-ci.org]].
* Installation
......@@ -65,10 +67,54 @@ make test # run self-tests
#+END_SRC
* Input and output files
The main input file of Vidjil is a /set of reads/, given as a =.fasta=
or =.fastq= file. This set of reads can reach several gigabytes. It is
never loaded entirely in the memory, but reads are processed one by
one by the Vidjil algorithm.
The main outputs of Vidjil (with the default =-c clones= command) are the following two files:
- The =.vidjil= file is /the file for the Vidjil browser/.
The file is in a =.json= format (detailed in [[file:format-analysis.org][format-analysis.org]])
describing the windows and their count, the representatives (=-y=),
the detailed segmentation (=-z=, see warning below), and possibly
the results of the further clustering.
The browser takes this =.vidjil= file (possibly merged with
=fuse.py=) for the /visualization and analysis/ of clones and their
tracking along different samples (for example time points in an MRD
setup or in an immunological study).
Please see [[file:browser.org][browser.org]] for more information on the browser.
- The =.vdj.fa= file is /a FASTA file for further processing by other bioinformatics tools/.
The sequences are at least the windows (and their count in the headers) or
the representatives (=-y=) when they have been computed.
The headers include the count of each window, and further includes the
detailed segmentation (=-z=, see warning below), given in a '.vdj' format, see below.
The further clustering is not output in this file.
The =.vdj.fa= output enables using Vidjil as a /filtering tool/,
shrinking a large read set into a manageable number of (pre-)clones
that will be deeply analyzed and possibly further clustered by
other software.
The default options are very conservative (large window, no further
automatic clusterization, see below), leaving it to the user or other
software to make detailed analyses and decisions on the final
clustering.
By default, the two output files are named =out/basename.vidjil= and =out/basename.vdj.fa=, where:
- =out= is the directory where all the outputs are stored, including auxiliary output files (can be changed with the =-o= option)
- =basename= is the basename of the input =.fasta/.fastq= file (can be overridden with the =-b= option)
* Vidjil parameters
Launching vidjil with =-h= option provides the list of parameters that can be
used.
used. We detail here the options of the main =-c clones= command.
** Main algorithm parameters
......@@ -119,7 +165,7 @@ The =-r/-%= options are strong thresholds: if a clone does not have
the requested number of reads, the clone is discarded (except when
using =-l=, see below).
The default =-r 10= option is meant to only output clones that
have a significant read support. *You shoud use* =-r 1= *if you
have a significant read support. *You should use* =-r 1= *if you
want to detect all clones starting from the first read* (especially for
MRD detection).
......@@ -135,11 +181,12 @@ to display the clones on the grid (otherwise they are displayed on the
If you want to analyze more clones, you should use =-z 50= or
=-z 100=. It is not recommended to use larger values: outputting more
than 100 clones is often not useful since they can't be visualized easily
in the browser, and takes large computation time.
in the browser, and takes large computation time (full dynamic programming,
see below).
Note that even if a clone is not in the top 20 (or 50, or 100) but
still passes the =-r=, =-%= options, it is still reported in the .vidjil
file. If the clone is at some MRD point in the top 20 (or 50, or 100),
still passes the =-r=, =-%= options, it is still reported in both the =.vidjil=
and =.vdj.fa= files. If the clone is at some MRD point in the top 20 (or 50, or 100),
it will be fully analyzed/segmented by this other point (and then
collected by the =fuse.py= script, using representatives computed at this
other point, and then, on the browser, correctly displayed on the grid).
......@@ -162,10 +209,11 @@ while the remaining columns consist of the window's label.
In Vidjil output, the labels are output alongside their windows.
** Further clustering
** Further clustering (experimental)
These options have no consequences on the visualization through the
browser. They are intented for a command-line use only.
These options have no consequences on the =.vdj.fa= file, but add
additional information in the =.vidjil= file to be visualized in the
browser.
Setting the =-n= option triggers an additional automatic
clustering using the DBSCAN algorithm (Ester et al., 1996).
......@@ -177,6 +225,8 @@ considered as similar. Such a file may be automatically produced by vidjil
two windows that must be clustered.
* Examples of use
All the following examples are on IGH VDJ recombinations: they thus
......@@ -185,12 +235,8 @@ require the =-G germline/IGH= and the =-d= options.
#+BEGIN_SRC sh
./vidjil -G germline/IGH -d data/Stanford_S22.fasta
# Extract (with an ultra-fast heuristic) all windows
# Summary of windows is available in out/Stanford_S22.vidjil
# (for the '.vidjil' format, see below)
# To have detailed/debug results in out/Stanford_S22.vdj.fa
# (which is a FASTA file embedding heuristic information
in the headers, '.vdj' format, see warning below)
# run Vidjil with option '-U'
# Summary of windows is available both in out/Stanford_S22.vdj.fa
# and in out/Stanford_S22.vidjil.
#+END_SRC
#+BEGIN_EXAMPLE
......@@ -234,7 +280,7 @@ CTATGATAGTAGTGGTTATTACGGGGTAGGGCAGTACTACTACTACTACATGGACGTCTG
#+BEGIN_SRC sh
./vidjil -c germlines file.fastq
# Search for all the germlines and output statistics
# on the number of occurrences in each germline
# on the number of occurrences of k-mers in each germline
#+END_SRC
* Segmentation and .vdj format
......@@ -250,16 +296,16 @@ in the following situations:
the center of the window may be shifted up to 15 bases from the
actual center.
- in a second pass, on the standard output
- at the end of the clones detection (=-c clones=, also in in
=basename.vdj.fa=, where =basename= is the basename of the input file)
- or directly when explicitly requiring segmentation (=-c segment=)
- in a second pass, on the standard output and in both =.vidjil= and =.vdj.fa= files
- at the end of the clones detection (default command =-c clones=)
- or directly when explicitly requiring segmentation (=-c segment=)
This segmentation is obtained by full comparison (dynamic
programming) with all germline sequences. Such segmentations are
not at the core of the Vidjil clone gathering method (which
relies only on the 'window', see above). They are provided only
for convenience and should be checked with other softwares such
relies only on the 'window', see above). They are slow to compute
and are provided only for convenience.
They should be checked with other software such
as IgBlast, iHMMune-align or IMGT/V-QUEST.
Segmentations of V(D)J recombinations are displayed using a dedicated
......@@ -269,7 +315,7 @@ with a > is of the following form:
#+BEGIN_EXAMPLE
>name + VDJ startV endV startD endD startJ endJ Vgene delV/N1/delD5' Dgene delD3'/N2/delJ Jgene comments
name sequence name
name sequence name (includes the number of occurrences in the read set and possibly other information)
+ strand on which the sequence is mapped
VDJ type of segmentation (can be "VJ", "VDJ",
or shorter tags such as "V" for incomplete sequences).
......@@ -304,16 +350,7 @@ this case a valid FASTA file.
For VJ recombinations the output is similar, the fields that are not
applicable being removed:
>name + VJ startV endV startJ endJ Vgene delV/N1/delJ Jgene coments
* .vidjil and .json format and web interface
A summary of extracted windows is also available in a JSON format,
including, for each windows, the number of reads sharing this window.
The format of this file may change in future releases.
This file is used by the dynamic browser for visualization
and analysis of clones and their tracking along different samples,
(for example time points in a MRD setup or in a immunological study).
Please see the file [[file:browser.org][browser]].org for more information on the browser.
#+BEGIN_EXAMPLE
>name + VJ startV endV startJ endJ Vgene delV/N1/delJ Jgene comments
#+END_EXAMPLE
......@@ -19,11 +19,11 @@ The Vidjil browser runs in any modern browser. It has been successfully tested o
** The .vidjil files
The vidjil browser displays .vidjil files that summarize the V(D)J
The vidjil browser displays =.vidjil= files that summarize the V(D)J
rearrangements and the sequences found in a run.
If you have an access to the patient database, you are able to upload,
manage, process your runs (.fasta/.fastq or .clntab files) directly on the browser
manage, process your runs (=.fasta=, =.fastq= or =.clntab= files) directly on the browser
(see below 'patient database'), and the server behind the patient
database computes these .vidjil files.
Otherwise, such .vidjil files can be obtained:
......@@ -31,8 +31,8 @@ Otherwise, such .vidjil files can be obtained:
http://www.vidjil.org/submit.html or using any other method
(e.g. your own FTP)
- from the command-line version of Vidjil (starting from
.fasta/.fastq files, see doc/algo.org in the command-line version).
To gather several .vidjil files, you have to use the [[../server/fuse.py][fuse.py]] script
=.fasta= or =.fastq= files, see [[http://git.vidjil.org/blob/master/doc/algo.org][algo.org]])
To gather several .vidjil files, you have to use the [[http://git.vidjil.org/blob/master/server/fuse.py][fuse.py]] script
- or by post-processing of other V(D)J analysis pipelines (contact us
if you are interested)
......
......@@ -796,7 +796,7 @@ def main():
group_options.add_argument('--compress', '-c', action='store_true', help='compress point names, removing common substrings')
group_options.add_argument('--pipeline', '-p', action='store_true', help='compress point names (internal Bonsai pipeline)')
group_options.add_argument('--output', '-o', type=str, default='fused.data', help='output file (%(default)s)')
group_options.add_argument('--output', '-o', type=str, default='fused.vidjil', help='output file (%(default)s)')
group_options.add_argument('--top', '-t', type=int, default=50, help='keep only clones in the top TOP of some point (%(default)s)')
parser.add_argument('file', nargs='+', help='''input files (.vidjil/.cnltab)''')
......
......@@ -10,3 +10,7 @@ if request.env.http_origin:
def index():
if auth.has_membership("admin"):
return dict(message=T(''))
def worker():
if auth.has_membership("admin"):
return dict(message=T(''))
......@@ -100,7 +100,7 @@ def get_data():
& ( db.results_file.sequence_file_id == db.sequence_file.id )
& ( db.patient.id == request.vars["patient_id"] )
& ( db.results_file.config_id == request.vars["config_id"] )
).select( orderby=db.sequence_file.sampling_date )
).select( orderby=db.sequence_file.id|db.results_file.run_date, groupby=db.sequence_file.id )
data["samples"]["original_names"] = []
data["samples"]["info"] = []
......@@ -164,7 +164,7 @@ def get_analysis():
res["clusters"] = analysis["clusters"]
res["clones"] = analysis["clones"]
res["tags"] = analysis["tags"]
res["samples"]["order"] = analysis["samples"]["order"]
res["samples"]= analysis["samples"]
res["info_patient"] = db.patient[request.vars["patient_id"]].info
res["patient"] = db.patient[request.vars["patient_id"]].first_name + " " + db.patient[request.vars["patient_id"]].last_name + " (" + db.config[request.vars["config_id"]].name + ")"
......
......@@ -26,7 +26,23 @@ def index():
res = {"redirect" : "default/user/login"}
return gluon.contrib.simplejson.dumps(res, separators=(',',':'))
log.debug('patient list')
return dict(message=T(''))
count = db.sequence_file.id.count()
isAdmin = auth.has_membership("admin")
query = db(
(auth.accessible_query('read', db.patient) | auth.accessible_query('admin', db.patient) )
).select(
db.patient.ALL,
count,
left=db.sequence_file.on(db.patient.id == db.sequence_file.patient_id),
groupby=db.patient.id
)
return dict(query = query,
count = count,
isAdmin = isAdmin)
......
......@@ -22,7 +22,7 @@
</div>
<div>
<div id="db_content">
{{include}}
</div>
......
{{extend 'db_layout.html'}}
{{('message' in globals())}}
{{if 'message' in globals():}}
<h3>Patient list</h3>
{{
count = db.sequence_file.id.count()
<div id="db_table_container">
<table class="db_table" id="table">
query = db(
(auth.accessible_query('read', db.patient) | auth.accessible_query('admin', db.patient) )
).select(
db.patient.ALL,
count,
left=db.sequence_file.on(db.patient.id == db.sequence_file.patient_id),
groupby=db.patient.id
)
}}
<thead>
<tr><td class="column_200"> name </td>
<td class="column_100"> birth </td>
<td> info </td>
<td class="column_100"> configs </td>
{{if isAdmin:}} <td class="column_100"> users </td> {{pass}}
<td class="column_100"> files </td>
<td class="column5"> </td>
<td class="column5"> </td>
<td class="column5"> </td>
</tr>
</thead>
<div>
<h3>{{=message}}</h3>
<div id="db_table_container">
<table class="db_table" id="table">
<thead>
<tr><td class="column_200"> name </td>
<td class="column_100"> birth </td>
<td> info </td>
<td class="column_100"> configs </td>
{{if auth.has_membership("admin"):}}
<td class="column_100"> users </td>
{{pass}}
<td class="column_100"> files </td>
<td class="column5"> </td>
<td class="column5"> </td>
<td class="column5"> </td>
</tr>
</thead>
{{for row in query :}}
<tr onclick="db.call('patient/info', {'id' :'{{=row.patient.id}}'} )" >
<td> {{=row.patient.last_name + " " + row.patient.first_name }} </td>
<td> {{=row.patient.birth }} </td>
<td> {{=row.patient.info[:50] }}
{{if len(row.patient.info) > 50: }}...{{pass}}
</td>
<td> {{for row2 in db( db.fused_file.patient_id == row.patient.id ).select(db.fused_file.config_id, distinct=True) :}}
{{for row in query :}}
<tr onclick="db.call('patient/info', {'id' :'{{=row.patient.id}}'} )" >
<td> {{=row.patient.last_name + " " + row.patient.first_name }} </td>
<td> {{=row.patient.birth }} </td>
<td> {{=row.patient.info }} </td>
<td> {{for row2 in db( db.fused_file.patient_id == row.patient.id ).select(db.fused_file.config_id, distinct=True) :}}
{{=db.config[row2.config_id].name}}
{{pass}}</td>