Commit 1abb1e3f authored by marc's avatar marc
Browse files
parents cf29b5ca 1fbd45fa
......@@ -50,25 +50,23 @@ string AlignBox::getSequence(string sequence) {
void AlignBox::addToJson(json &seg) {
seg[key] = ref_label;
json j;
if (key == "5")
{
seg[key+"end"] = end;
seg[key+"del"] = del_right;
}
else if (key == "3")
j["name"] = ref_label;
if (key != "3") // no end information for J
{
seg[key+"del"] = del_left;
seg[key+"start"] = start;
j["end"] = end;
j["delRight"] = del_right;
}
else
if (key != "5") // no start information for V
{
seg[key+"delLeft"] = del_left;
seg[key+"start"] = start;
seg[key+"end"] = end;
seg[key+"delRight"] = del_right;
j["start"] = start;
j["delLeft"] = del_left;
}
seg[key] = j ;
}
ostream &operator<<(ostream &out, const AlignBox &box)
......
../../tools/ansi.py
\ No newline at end of file
......@@ -4,4 +4,4 @@ $ Skip the good number of reads
1:Processing every 131th read
$ Analyze the good number of reads
1: found 100 ..-windows in 100 reads .100. of 100 reads
1: found .* of 100 reads
......@@ -14,21 +14,13 @@ $ Segmentation
1:"name": "IGHV3-23.05 6/ACCCGGGAGGAACAATAT/9 IGHD6-13.01 0//5 IGHJ4.02"
$ Segmentation details - V
1:"5": "IGHV3-23.05",
1:"5end": 54,
1:"5del": 6,
1:"5": ."delRight": 6, "end": 54, "name": "IGHV3-23.05".
$ Segmentation details - D
1:"4delLeft": 9,
1:"4start": 73,
1:"4": "IGHD6-13.01",
1:"4end": 84,
1:"4delRight": 0,
1:"4": ."delLeft": 9, "delRight": 0, "end": 84, "name": "IGHD6-13.01", "start": 73.
$ Segmentation details - J
1:"3del": 5,
1:"3start": 85,
1:"3": "IGHJ4.02",
1:"3": ."delLeft": 5, "name": "IGHJ4.02", "start": 85.
$ Segmentation details - N1, N2
1:"N1": 18,
......
!LAUNCH: $VIDJIL_DIR/vidjil -y 0 -x 100 -G $VIDJIL_DIR/germline/IGH $VIDJIL_DIR/data/Stanford_S22.fasta
$ Analyze the good number of sequences in Stanford S22
1: found 99 ..-windows in 100 reads .100. of 100 reads
1: found .* of 100 reads
......@@ -7,6 +7,6 @@ $ The FineSegmenter outputs a segmentation without any D
1: clone-001--TRB--.*--TRBV7-2--TRBJ2-3
$ There is a segmentation in the .vidjil, but without any D
1: "3": "TRBJ2-3
0: "4": "TRBD
1: "5": "TRBV7-2
1:"name": "TRBJ2-3.01"
0:"name": "TRBD
1:"name": "TRBV7-2.01"
......@@ -16,6 +16,7 @@ The 'should_pattern' is then checked against the 'result' part, and a .tap file
import sys
import re
import ansi
PY_REQUIRED = (2, 7)
if sys.version_info < PY_REQUIRED:
......@@ -231,6 +232,9 @@ def id_line_to_tap(l, tap_id):
if 'TODO' in should:
tap += '# TODO '
else:
if not found:
tap += ansi.Style.BRIGHT + ansi.Fore.RED + '# not ok ' + ansi.Style.RESET_ALL
tap += '- ' + should_pattern
......
......@@ -64,7 +64,7 @@
// GIT_VERSION should be defined in "git-version.h", created by "create-git-version-h.sh", to be used outside of releases
#include "git-version.h"
#define VIDJIL_JSON_VERSION "2014.10"
#define VIDJIL_JSON_VERSION "2016a"
//$$ #define (mainly default options)
......
......@@ -38,4 +38,7 @@ var config = {
/* 3) Static file autoload, possibly with an .analysis file */
"autoload" : "data/Stanford-S22.vidjil"
// "autoload_analysis" : "data/Stanford-S22.analysis"
// Proxy config for IMGT querying
"proxy": "http://test.vidjil.org/proxy"
}
var PROXY_ADDRESS = "https://dev.vidjil.org/proxy/imgt"
//parametre IMGT par defaut
function initImgtInput() {
var imgtInput = {};
......@@ -169,7 +171,11 @@ function imgtPostForSegmenter(data, system) {
//disabled due to security concerns
//form.action = "http://www.imgt.org/IMGT_vquest/vquest";
//using proxy on server to allow requests on other site than vidjil one's in JS.
form.action = "https://test.vidjil.org/vidjil/proxy/imgt";
if (typeof config != 'undefined') {
form.action = config.proxy
} else {
form.action = PROXY_ADDRESS;
}
form.method = "POST";
for (var k in imgtInput) {
......
......@@ -1122,9 +1122,9 @@ Uploader.prototype = {
if (this.is_uploading()){
$("#upload_summary").css("display","block")
$("#upload_summary_label").html("<span class='loading_seq'>upload list</span>")
$("#upload_summary_label").html("<span class='loading_seq'>uploading</span>")
}else{
$("#upload_summary_label").html("<span class='loading_status'>upload list</span>")
$("#upload_summary_label").html("<span class='loading_status'>uploads</span>")
}
for (var key in this.queue){
......
......@@ -722,6 +722,7 @@ List.prototype = {
var c = this.m.clone(i)
if (c.getName().toUpperCase().indexOf(str.toUpperCase())!=-1 ) c.isFiltered = false
if (c.getSequence().toUpperCase().indexOf(str.toUpperCase())!=-1 ) c.isFiltered = false
if (c.getSegAASequence('cdr3').toUpperCase().indexOf(str.toUpperCase())!=-1 ) c.isFiltered = false
if (c.getRevCompSequence().toUpperCase().indexOf(str.toUpperCase())!=-1 ) c.isFiltered = false
if (c.getSequenceName().toUpperCase().indexOf(str.toUpperCase())!=-1 ) c.isFiltered = false
}
......
#+TITLE: .analysis and .vidjil format
#+TITLE: .analysis and .vidjil format (2016a)
#+AUTHOR: The Vidjil team
The =.analysis= and the =.vidjil= files share a common [[http://en.wikipedia.org/wiki/JSON][.json]] format.
They are produced and used by several components of the Vidjil platform,
but you can also use these formats to use the Vidjil browser within
your own analysis pipeline.
The following [[http://en.wikipedia.org/wiki/JSON][.json]] format makes it possible to
encode a set of clones with V(D)J immune recombinations,
possibly with user annotations.
In Vidjil, this format is used by both the =.analysis= and the =.vidjil= files.
The =.vidjil= file represents the actual data on clones (and that can
reach megabytes). It should be automatically produced.
reach megabytes, or even more), usually produced by processing reads by some RepSeq software.
(for example with detailed information on the 100 or 1000 top clones).
The =.analysis= file describes customizations done by the user
(or by some automatic pre-processing) on the Vidjil browser. The browser
can load or save such files (and possibly from/to the patient database).
......@@ -16,21 +16,37 @@ It is intended to be very small (a few kilobytes).
All settings in the =.analysis= file override the settings that could be
present in the =.vidjil= file.
* What is a clone ?
There are several definitions of what may be a clonotype,
depending on different RepSeq software or studies.
This format and the Vidjil browser both accept any kind of definition:
Clones are identified by a =id= string that may be an arbitrary identifier such as =clone-072a=.
Software computing clones may choose some relevant identifiers:
- =CGAGAGGTTACTATGATAGTAGTGGTTATTACGGGGTAGGGCAGTACTAC=, Vidjil algorithm, 50 nt window centered on the CDR3
- =CARPRDWNTYYYYGMDVW=, a CDR3 AA sequence
- =CARPRDWNTYYYYGMDVW IGHV3-11*00 IGHJ6*00=, a CDR3 AA sequence with additional V/J gene information (MiXCR)
- the 'clone sequence' as computed by the ARReST in =.clntab= files (processed by =fuse.py=)
- see also 'IMGT clonotype (AA) or (nt)'
* Examples
** =.vidjil= file -- one sample
This is an almost minimal =.vidjil= file, describing clones in one sample.
The =seg= element is optional: clones without =seg= elements will be shown on the grid with '?/?'.
All other elemnts are required. The =reads.germlines= list can have only one element the case of data on a unique locus.
There is here one clone with a segmentation =TRGV5*01 5/CC/0 TRGJ1*02=.
Note that other elements could be added by some program (such as =tag= or =clusters=).
All other elements are required. The =reads.germlines= list can have only one element in the case of data on a unique locus.
There is here one clone on the =TRG= locus with a designation =TRGV5*01 5/CC/0 TRGJ1*02=.
Note that other elements could be added by some program (such as =tag=, to identify some clones,
or =clusters=, to further cluster some clones, see below).
#+BEGIN_SRC js :tangle analysis-example1.vidjil
{
"producer": "program xyz version xyz",
"timestamp": "2014-10-01 12:00:11",
"vidjil_json_version": "2014.10",
"vidjil_json_version": "2016a",
"samples": {
"number": 1,
......@@ -65,19 +81,19 @@ Note that other elements could be added by some program (such as =tag= or =clust
}
#+END_SRC
** =.vidjil= file -- several samples
** =.vidjil= file -- several related samples
This is a =.vidjil= file obtained by merging, with =fuse.py=, two =.vidjil= files corresponding to two samples.
Clones that have a same =id= are gathered.
Clones that have a same =id= are gathered (see 'What is a clone?', above).
It is the responsibility of the program generating the initial =.vidjil= files to choose these =id= to
do a correct gathering ('windows' is used by Vidjil, 'clone sequence' is used by EC-NGS/Brno pipeline,
and 'IMGT clonotype (AA) or (nt)' could also be used by some programs).
do a correct gathering.
#+BEGIN_SRC js :tangle analysis-example2.vidjil
{
"producer": "program xyz version xyz / fuse.py version xyz",
"timestamp": "2014-10-01 14:00:11",
"vidjil_json_version": "2014.10",
"vidjil_json_version": "2016a",
"samples": {
"number": 2,
......@@ -132,15 +148,15 @@ and 'IMGT clonotype (AA) or (nt)' could also be used by some programs).
** =.analysis= file
This file reflects what an user could have done with the browser (or with some other tool).
She has manually set sample names (=names=), tagged (=tag=, =tags=) and clustered (=clusters=)
This file reflects the annotations a user could have done within the Vidjil browser or some other tool.
She has manually set sample names (=names=), tagged (=tag=, =tags=), named (=name=) and clustered (=clusters=)
some clones, and added external data (=data=).
#+BEGIN_SRC js :tangle analysis-example2.analysis
{
"producer": "user Bob, via browser",
"timestamp": "2014-10-01 12:00:11",
"vidjil_json_version": "2014.10",
"vidjil_json_version": "2016a",
"samples": {
"number": 2,
......@@ -191,40 +207,43 @@ considered. In that case we should first consider the second point (whose =name=
is /fu1)/ and the point to be considered in second should be the first one in
the file (whose =name= is /diag/).
As exemplified in the =clusters= field, this proceeds to the clustering of
clones defined in the =.vidjil= file (here /clone2/ and /clone3/ are defined in the
vidjil file in previous section). If clones do not exist, the clusters are
The =clusters= field indicates clones (by their =id=) that have been further clustered.
Usually, these clones were defined in a related =.vidjil= file (as /clone2/ and /clone3/,
see the =.vidjil= file in the previous section). If these clones do not exist, the clusters are
just ignored. The first item of the cluster is considered as the
representative clone of the cluster.
* The different elements
* Detailed specification
** Generic information for traceability [required]
#+BEGIN_SRC js
"producer": "", // arbitrary string, user/software/options producing this file [required]
"timestamp": "", // last modification date [required]
"vidjil_json_version": "2014.10", // version of the format [required]
"producer": "my-repseq-software -z -k (v. 123)", // arbitrary string, user/software/version/options producing this file [required]
"timestamp": "2014-10-01 12:00:11", // last modification date [required]
"vidjil_json_version": "2016a", // version of the .json format [required]
#+END_SRC
** 'reads' element [.vidjil only, required]
** Statistics: the =reads= element [.vidjil only, required]
The number of analyzed reads (=segmented=) may be higher than the sum of the read number of all clones,
when one chooses to report only the 'top' clones (=-t= option for fuse).
#+BEGIN_SRC js
{
"total" : // total number of reads per sample (with samples.number elements)
"segmented" : // number of segmented reads per sample (with samples.number elements)
"germline" : { // number of segmented reads per sample/germline (with samples.number elements)
"TRG" :
"IGH" :
"total" : [], // total number of reads per sample (with samples.number elements)
"segmented" : [], // number of analyzed/segmented reads per sample (with samples.number elements)
"germline" : { // number of analyzed/segmented reads per sample/germline (with samples.number elements)
"TRG" : [],
"IGH" : []
}
}
#+END_SRC js
#+END_SRC
** 'Samples' element [required]
** =samples= element [required]
#+BEGIN_SRC js
{
......@@ -238,29 +257,26 @@ representative clone of the cluster.
"order": [], // custom sample order (lexicographic order by default) [optional]
// traceability on each sample (with sample.number elements)
"producer": [],
"timestamp": [],
"log": [],
"log": []
}
#+END_SRC
** 'Clones' list
** =clones= list, with read count, tags, V(D)J designation and other sequence features
Each element in the 'clones' list describes properties of a clone.
In a .vidjil file, this is the main part, describing all clones.
In the .analysis file, this section is intended to describe some specific clones.
Each element in the =clones= list describes properties of a clone.
In a =.vidjil= file, this is the main part, describing all clones.
In the =.analysis= file, this section is intended to describe some specific clones.
#+BEGIN_SRC js
{
"id": "", // clone identifier, must be unique [required]
// Vidjil/algo output -> the 'window'
// Brno .clntab -> clone sequence
"id": "", // clone identifier, must be unique [required] [see above, 'What is a clone ?']
// the clone identifier in the .vidjil file and in .analysis file must match
"germline": "" // [required for .vidjil]
......@@ -280,46 +296,37 @@ In the .analysis file, this section is intended to describe some specific clones
// this will create a normalization option in the
// settings browser menu
"seg": // segmentation information [optional]
"seg": // detailed V(D)J designation/segmentation and other sequence features [optional]
// in the browser clones, that are not segmented will be shown on the grid with '?/?'
// positions are related to the 'sequence'
// names of V/D/J genes should match the ones in files referenced in germline/germline.data
// Positions must start at 1.
// Positions on the sequence start at 1.
{
"5": {"name": "IGHV5*01",
"start": 0,
"stop": 0},
"4": {"name": "IGHD1*01",
"start": 0,
"stop": 0},
"3": {"name": "IGHJ3*02",
"start": 0,
"stop": 0},
"5": {"name": "IGHV5*01", "start": 1, "stop": 120}, // V (or 5') segment
"4": {"name": "IGHD1*01", "start": 124, "stop": 135}, // D (or middle) segment
// Recombination with several D may use "4a", "4b"...
"3": {"name": "IGHJ3*02", "start": 136, "stop": 171}, // J (or 3') segment
// any feature to be highligthed in the sequenc
// any feature to be highlighted in the sequence
// the optional "seq" element gives a sequence that corresponds to this feature
// CDR3 should be stored that way (in a field called "cdr3"), this is similar
// for the other region of interest.
// The junction is also stored in that way (in a "junction" field),
// JUNCTION/CDR3 should be stored that way (in fields called "junction" or "cdr3"),
// its productivity must be stored in a boolean field called "productive".
// Positions must also start at 1.
"somefeature": { "start": 1, "stop": 100, "seq": "" }
// Positions also start at 1.
"somefeature": { "start": 56, "stop": 61, "seq": "ACTGTA" }
}
"reads": [], // number of reads in this clone [.vidjil only, required]
// (with samples.number elements)
"top": 0, // required so that the browser displays the clone
"top": 0, // (not documented now) [required] threshold to display/hide the clone
"stats": [] // (not documented now) [.vidjil only] (with sample.number elements)
}
#+END_SRC
** 'Germlines' list [optional][work in progress, to be documented]
** =germlines= list [optional][work in progress, to be documented]
extend the =germline.data= default file with a custom germline
......@@ -334,45 +341,29 @@ extend the =germline.data= default file with a custom germline
}
#+END_SRC
** 'Clusters' list [optional]
** Further clustering of clones: the =clusters= list [optional]
Each element in the 'clusters' list describes a list of clones that are 'merged'.
In the browser, it will be still possible to see them or to unmerge them.
The first clone of each line is used as a representative for the cluster.
** 'Data' list [optional][work in progress, to be documented]
** =data= list [optional][work in progress, to be documented]
Each element in the 'data' list is a list of values (of size samples.number)
Each element in the =data= list is a list of values (of size samples.number)
showing additional data for each sample, as for example qPCR levels or spike information.
In the browser, it will be possible to display these data and to normalize
against them (not implemented now).
** 'Tags' list [optional]
** Tagging some clones: =tags= list [optional]
The 'tags' list describe the custom tag names as well as tags that should be hidden by default.
The =tags= list describes the custom tag names as well as tags that should be hidden by default.
The default tag names are defined in [[../browser/js/vidjil-style.js]].
#+BEGIN_SRC js
"key" : "value" // "key" is the tag id from 0 to 7 and "value" is the custom tag name attributed
#+END_SRC
* Differences between programs
Due to specificities between programs, some elements may differ depending
on which program has been run.
** MiXCR
The output when using MiXCR differs from Vidjil on the id of each clone.
Where Vidjil provides the representative sequence of the clone, MiXCR
provides the representative sequence in =Amino Acids= followed by the name
of the =V gene= and the name of the =J gene=.
#+BEGIN_SRC js
{
"germline": ...
"id": CARPRDWNTYYYYGMDVW IGHV3-11*00 IGHJ6*00
...
}
#+END_SRC
......@@ -7,21 +7,17 @@
* A quality control of bioinformatics analysis
The =.should-vdj.fa= tests are sequences with manually curated V(D)J designations.
These designations were checked by hand, possibly with the help of other bioinformatics tools.
These designations were checked by hand, possibly with the help of some bioinformatics tools.
Tests may range from very easy cases with unambiguous V(D)J designations
to borderline or difficult cases, including incomplete or unusual recombinations or translocations.
This collection of sequences helps the robustness of Vidjil (and potentially of other programs,
as this data is open-source). The automated test suite launches Vidjil on all these
sequences and compares the computed designations with the curated designations.
Tests where Vidjil currently fails will be marked as TODO.
Having a correct behavior on these tests may be a goal for future releases.
This collection of sequences, distributed as open-source data, may help the robustness
of any software doing immune repertoire sequencing (RepSeq) analysis.
* Contributing to the tests
Users and developers are strongly encouraged to send us (=contact@vidjil.org=)
Users and developers of RepSeq software are encouraged to send us (=contact@vidjil.org=)
their manually curated sequences, ideally in the format described below, or by
directly proposing pull requests on GitHub with new tests in the [[https://github.com/vidjil/vidjil/tree/master/algo/tests/should-vdj-tests][=algo/tests/should-vdj=]] directory.
We can also help to encode sequences in this format.
......@@ -107,6 +103,15 @@ Mixed TRA/TRD recombinations can be encoded with =[TRA+D]=.
Other special cases, such as translocations involving BCL1 or BCL2, should be written now as comments after a =#= character.
* Encoding the JUNCTION/CDR3 information
JUNCTION or CDR3 information can be optionally encoded, using curly braces:
#+BEGIN_EXAMPLE
>TRGV10*02 5/AGAC/3 TRGJP1*01 [TRG] {CAAWRPTGWFKIF}
AAGTCCGTAGAGAAAGAAGACATGGCCGTTTACTACTGTGCTGCGTGGAGACCCACTGGTTGGTTCAAGATATTTGCTGAAGGGACTAAGC
#+END_EXAMPLE
* Ambiguous or alternate designations
On some sequences, several V(D)J designations may be equally acceptable.
......@@ -126,7 +131,14 @@ TGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGTT
#+END_EXAMPLE
* Running the tests
* Program-specific information
** Vidjil
In Vidjil, the automated test suite launches the analysis on all these
sequences and compares the computed designations with the curated designations.
Cases with current failure will be marked as TODO.
Having a correct behavior on these tests may be a goal for future releases.
Vidjil can be tested on =.should-vdj.fa= files. The tests can be launched from within the =algo/tests= directory:
- =python should-vdj-to-tap.py= runs one or several tests, given as parameters on the command line,
......
VIDJIL_JSON_VERSION = "2014.10"
VIDJIL_JSON_VERSION_REQUIRED = "2014.10"
VIDJIL_JSON_VERSION = "2016a"
import collections
import defs
import sys
#### Utilities on dictionaries
......@@ -203,7 +204,9 @@ class VidjilJson():
def check_version(self, filepath):
'''Check vidjil_json_version'''
if "vidjil_json_version" in self.d:
if self.d["vidjil_json_version"] <= defs.VIDJIL_JSON_VERSION:
if self.d["vidjil_json_version"] < defs.VIDJIL_JSON_VERSION:
sys.stderr.write("! Reading file with old .json version %s\n" % self.d["vidjil_json_version"])
if self.d["vidjil_json_version"] >= defs.VIDJIL_JSON_VERSION_REQUIRED:
return
raise IOError ("File '%s' is too old -- please regenerate it with a newer version of Vidjil" % filepath)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment