Commit 2cbd3725 authored by Mikaël Salson's avatar Mikaël Salson Committed by Mathieu Giraud

Vidjil: release 2013.10

	* Better heuristic, segments more reads (core/segment.cpp)
	* Better and faster selection of representative read (vidjil.cpp, core/read_chooser.cpp)
	* Better display of reason of non-segmenting reads
	* New normalization against a standard (-Z) (core/labels.cpp)
	* New experimental lazy_msa multiple aligner
	* New .json output
	* New unit tests
	* Bugs closed
parent de4c48e0
......@@ -9,13 +9,7 @@ test: all
should: all
@echo
@echo "*** Launching .should_get tests..."
src/tests/should-to-tap.sh src/tests/bugs/bug20130617.should_get
src/tests/should-to-tap.sh src/tests/stanford.should_get
src/tests/should-to-tap.sh src/tests/clones_simul.should_get
src/tests/should-to-tap.sh src/tests/clones_simul_cluster.should_get
src/tests/should-to-tap.sh src/tests/segment_S22.should_get
src/tests/should-to-tap.sh src/tests/segment_lec.should_get
src/tests/should-to-tap.sh src/tests/segment_simul.should_get
make -C src/tests should
@echo "*** All .should_get tests passed"
data germline: %:
......@@ -34,7 +28,7 @@ RELEASE_TAG="notag"
RELEASE_H = src/release.h
RELEASE_SOURCE = $(wildcard src/*.cpp) $(wildcard src/*.h) $(wildcard src/core/*.cpp) $(wildcard src/tests/*.cpp) $(wildcard src/core/*.h) $(wildcard src/tests/*.h)
RELEASE_MAKE = ./Makefile src/Makefile src/tests/Makefile germline/Makefile data/Makefile
RELEASE_TESTS = data/get-sequences $(wildcard data/*.fa) $(wildcard data/*.fq) src/tests/should-to-tap.sh $(wildcard src/tests/*.should_get) $(wildcard src/tests/bugs/*.fa)
RELEASE_TESTS = data/get-sequences $(wildcard data/*.fa) $(wildcard data/*.fq) src/tests/should-to-tap.sh $(wildcard src/tests/*.should_get) $(wildcard src/tests/bugs/*.fa) $(wildcard src/tests/bugs/*.should_get)
RELEASE_FILES = $(RELEASE_SOURCE) $(RELEASE_TESTS) $(RELEASE_MAKE) germline/get-germline germline/split-from-imgt.py doc/README doc/LICENSE
RELEASE_ARCHIVE = vidjil-$(RELEASE_TAG).tgz
......
DATA=Stanford_S22.fasta
DATA=Stanford_S22.fasta Stanford_S22_rc.fasta
all: $(DATA)
$(DATA):
sh get-sequences
clean:
$(RM) -f $(DATA)
\ No newline at end of file
$(RM) -f $(DATA)
>test
CCAGGCGAAGTTACTATGAGCTTAGTCCCTTTTGCAAACGTCTTGATCCAATCACTACTCCAGGTGGCACAGTAATAGACCCCAGAGTCACGTTCAATTAGATTTTCCAGTATAAATTTAAGGCTCTTCCCTGTGCTTGCATAAGTATGATACTTTTCTCGACTGATCCTGATTCCAACACAACCTGGAGTTGTAGAGTCCA
......@@ -4,3 +4,4 @@ wget http://www.emi.unsw.edu.au/~ihmmune/Stanford_S22.zip
unzip -d $(dirname $0) -o Stanford_S22.zip
rm -f Stanford_S22.zip
wget http://bioinfo.lifl.fr/vidjil/seq/Stanford_S22.rc.fasta
>lcl|FLN1FA002QB069.1|
ggactggagtggattgggtatatctattacagtgggagcaccaactacaacccctccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA001DWH5L.1|
cagtgggagcaccctttacaacccctccctcaagagtcgagtcaccatatcagaagacacgtccgagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccatatattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA001CU46B.1|
gcctggagtggattgggtacatctattacagtgggagcacctactacaacccgtccctcaagagtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA002Q5881.1|
cagtgggagcaccctttacaacccctccctcaagagtcgagtcaccatatcagaagacacgtccgagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccatatattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA001CR02J.1|
ggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
2013-10-07 The Vidjil Team
* Better heuristic, segments more reads (core/segment.cpp)
* Better and faster selection of representative read (vidjil.cpp, core/read_chooser.cpp)
* Better display of reason of non-segmenting reads
* New normalization against a standard (-Z) (core/labels.cpp)
* New experimental lazy_msa multiple aligner
* New .json output
* New unit tests
* Bugs closed
2013-07-03 The Vidjil Team
* New selection of representative read (core/read_chooser.cpp)
* Faster spaced seed computation (core/tools.cpp)
......@@ -6,7 +16,6 @@
* Bugs closed
2013-04-18 The Vidjil Team
* First public release
......@@ -2,23 +2,26 @@
# Copyright (C) 2011, 2012, 2013 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
# Contact: mathieu.giraud@lifl.fr, mikael.salson@lifl.fr
V(D)J recombinations in lymphocytes are a key for immunologic diversity
as they have an influence on the production of antibodies and antigen
receptors. They are also useful markers for pathologies: in many
cases, such a clonality marker is used for patient follow-up to
quantify the minimal residual disease in leukemias.
V(D)J recombinations in lymphocytes are essential for immunological
diversity. They are also useful markers of pathologies, and in
leukemia, are used to quantify the minimal residual disease during
patient follow-up.
Vidjil process high-througput sequencing data to extract V(D)J
junctions and gather them into clones. This analysis is based on a
seed heuristics and is fast and scalable, as, in the first phase, no
alignment is done with database germline sequences. Starting from a
set of reads, Vidjil detects the junctions in each read. This is based
on an ultra-fast seed-based heuristic, which has be prooven to be
reliable output the most abundant clones, based on their junctions
seed heuristics and is fast and scalable because in the first phase, no
alignment is performed with database germline sequences. Vidjil starts
from a set of reads and detects "windows" overlapping the actual CDR3.
This is based on an fast and reliable seed-based heuristic and allows
to output the most abundant clones. Vidjil can also clusterize similar
clones, or leave this to the user after a manual review.
The method is described in the following paper:
Vidjil can also clusterize similar clones, or leave this to the user
after a manual review. The method is described in the paper referenced
below.
Mathieu Giraud, Mikaël Salson, et al.,
"Fast multiclonal clusterization of V(D)J recombinations from high-throughput sequencing",
(submitted)
Vidjil is open-source, released under GNU GPLv3 license.
......@@ -59,7 +62,7 @@ make test # run self-tests
### Optional dependencies
clustalw (to compute alignments between junctions from a same clone)
clustalw (to compute alignments between windows from a same clone)
neato (to display graph of neighbors for the automatic clusterisation)
### Vidjil parameters
......@@ -67,24 +70,24 @@ neato (to display graph of neighbors for the automatic clusterisation)
Launching vidjil with -h option provides the list of parameters that can be
used.
### List of junctions
### List of windows
Vidjil allows to specify a list of junctions that must be followed
(even if those junctions are 'rare', below the -r/-R/-% thresholds).
Vidjil allows to specify a list of windows that must be followed
(even if those windows are 'rare', below the -r/-R/-% thresholds).
The parameter -l is made for providing such a list in a file having
the following format: junction label (separed by one space)
the following format: window label (separed by one space)
The first column of the file is the junction to be followed
while the remaining columns consist of the junction's label.
In Vidjil output, the labels are output alongside their junctions.
The first column of the file is the window to be followed
while the remaining columns consist of the window's label.
In Vidjil output, the labels are output alongside their windows.
### Manual clustering
The -e option allows to specify a file for manually clustering two junctions
The -e option allows to specify a file for manually clustering two windows
considered as similar. Such a file may be automatically produced by vidjil
(out/edges), depending on the option provided. Only the two first columns
(separed by one space) are important to vidjil, they only consist of the
two junctions that must be clustered.
two windows that must be clustered.
### Examples of use
......@@ -94,31 +97,33 @@ require the "-G germline/IGH" and the "-d" options.
./vidjil -G germline/IGH -d data/Stanford_S22.fasta
# Extract (with an ultra-fast heuristic) all junctions
# Extract (with an ultra-fast heuristic) all windows
# Results are in out/segmented.vdj.fa, which is a FASTA file
# embedding segmentation information in the headers
# ('.vdj' format, see below)
# Summary of windows is also available in out/data.json
# ('.json' format, see below)
>5--junction--1
TTGTAGTGGTGGTAGCTGCTACTCCTTTGACTACTGGGGC
>5--junction--2
TGTAGTGGTGGTAGCTGTTACTCCCACGTCTGGGGCCAAG
>8--window--1
CACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCCA
>5--window--2
CTATGATAGTAGTGGTTATTACGGGGTAGGGCAGTACTACTACTACTACATGGACGTCTG
(...)
Junctions of size 40 (modifiable by -w) have been extracted.
These two junctions have 5 occurrences in the set of reads.
Windows of size 60 (modifiable by -w) have been extracted.
The first window has 8 occurrences, the second window has 5 occurrences.
./vidjil -c clones -G germline/IGH -x -r 1 -R 1 -d ./data/clones_simul.fa
# Extracts the junctions (-r 1, with at least 1 read each),
# Extracts the windows (-r 1, with at least 1 read each),
# then gather them into clones (-R 1, with at least 1 read each:
# there are many 1-read clones due to sequencing errors.)
# A more natural option could be -R 5.
# No representative selection / clustalw postprocessing (-x)
# Results are in out/segmented.fa, out/junctions.fa-* and out/clones*
# Results are in out/segmented.fa, out/windows.fa-* and out/clones*
# out/segmented.fa list segmented reads using the .vdj format (see below)
./vidjil -c clones -G germline/IGH -x -r 1 -R 5 -n 5 -d ./data/clones_simul.fa
# Junction extraction + clone gathering,
# Window extraction + clone gathering,
# with automatic clusterisation, distance five (-n 5)
./vidjil -c segment -G germline/IGH -d data/segment_S22.fa
......@@ -167,3 +172,10 @@ For VJ recombinations the output is similar, the fields that are not
applicable being removed:
>name + VJ startV endV startJ endJ Vgene delV/N1/delJ Jgene
### .json format
A summary of extracted windows is also available in a .json format,
including, for each windows, the number of reads sharing this window.
This file is currently used for development purposes, its format may
change in future releases of Vidjil.
CC=g++
OPTIM=-O2
CFLAGS=-W -Wall $(OPTIM) $(DEBUG)
LDFLAGS=
CFLAGS=-W -Wall $(OPTIM) $(DEBUG) $(CONFIG)
LDFLAGS=-lm
EXEC=vidjil kmer kmer_count cut count detailed_affect
SRCCORE=$(wildcard core/*.cpp) $(wildcard core/*.h)
MAINCORE=$(wildcard *.cpp)
......@@ -17,12 +17,16 @@ spaced: cleanspaced
make
nospaced: cleanspaced
make OPTIM="-O2 -DNO_SPACED_SEEDS"
make CONFIG="-DNO_SPACED_SEEDS"
cleanspaced:
rm vidjil.o core/tools.o
rm -f vidjil.o core/tools.o
unsegmented:
rm -f vidjil.o core/segment.o
make CONFIG="-DOUT_UNSEGMENTED"
all: $(EXEC)
......
#include <fstream>
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <cstdlib>
#include <stdlib.h>
#include "../core/dynprog.h"
#include "../core/fasta.h"
#include "../core/msa.h"
#include "../core/lazy_msa.h"
using namespace std;
int main(int argc, char* argv[])
{
const char* fdata;
DynProg::DynProgMode dpMode = DynProg::Global;
Cost dpCost = VDJ;
ostringstream ost;
ostream * p;
p=&ost;
char filename[L_tmpnam];
tmpnam(filename);
bool cgi_mode;
if (argc <= 1){
cgi_mode=true;
cout <<"Content-type: text/html"<<endl<<endl;
cout<< "{"<<endl;
char * requestMethod = getenv("REQUEST_METHOD");
cout<<"\"requestMethod\" : \""<<requestMethod<<"\""<<endl;
if(!requestMethod) {
cout<<",\"Error\": \"requestMethod\"}"<<endl;
return 0;
}
if(strcmp(requestMethod, "POST") != 0) {
cout<<",\"Error\": \"requestMethod =/= post\"}"<<endl;
return 0;
}
char * contentType = getenv("CONTENT_TYPE");
if(!contentType) {
cout<<",\"Error\": \"content_type\"}"<<endl;
return 0;
}
cout<<",\"contentType\": \""<<contentType<<"\""<<endl;
char temp[1024];
FILE *f;
f = fopen(filename,"w");
if(f == NULL){
cout<<",\"Error\": \"save\""<<filename<<"}"<<endl;
return 0;
}else{
while(cin) {
cin.getline(temp, 1024);
fputs(temp, f);
fputs("\n", f);
}
}
fclose(f);
fdata = filename;
}else{
cgi_mode=false;
fdata = argv[1];
p=&cout;
}
if (!cgi_mode) cout <<ost<<endl;
Fasta fa(fdata, 1, " ", *p);
string seq0 = fa.sequence(0);
LazyMsa lm = LazyMsa(50, seq0);
for (int i=1; i < fa.size(); i++){
string seq1 = fa.sequence(i);
lm.add(seq1);
}
string *result;
result =new string[lm.sizeUsed+2];
lm.align(result);
if (cgi_mode){
cout<<",\"seq\" : [\""<<result[0]<<"\""<<endl;
for (int i=1; i<lm.sizeUsed+2; i++){
cout<<",\""<<result[i]<<"\""<<endl;
}
cout<<"]}";
remove(filename);
}else{
int length=60;
int n=result[0].size();
int j=0;
while (n > 0){
for (int i=0; i<lm.sizeUsed+2; i++){
cout<<" > "<<result[i].substr(j*length,length)<<" < "<<fa.label(i)<<endl;
}
cout<<endl;
n=n-length;
j++;
}
}
}
align: align.o
mkdir -p ../../interface/cgi
g++ -o ../../interface/cgi/align.cgi align.o ../core.a
align.o: align.cpp
g++ -c align.cpp
......@@ -30,7 +30,7 @@
#define HOMO2X 3
#define HOMO2Y 4
#define FIN 5
#define BACKSIZE 80
#define BACKSIZE 120
using namespace std;
......@@ -109,7 +109,7 @@ DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost&
B[i][j] = new int[3];
}
}
this -> mode = mode;
this -> cost = cost;
......@@ -117,6 +117,10 @@ DynProg::DynProg(const string &x, const string &y, DynProgMode mode, const Cost&
this -> best_j = -1 ;
this -> first_i = -1 ;
this -> first_j = -1 ;
gap1=NULL;
gap2=NULL;
linkgap=NULL;
init();
}
......@@ -136,6 +140,10 @@ DynProg::~DynProg() {
delete [] B[i];
}
delete [] B;
delete [] gap1;
delete [] gap2;
delete [] linkgap;
}
void DynProg::init()
......@@ -218,14 +226,17 @@ int DynProg::compute()
}
if (best==homo2x){
B[i][j][0] = i-2 ;
if (B[i][j][0] < 0) B[i][j][0] = 0 ;
B[i][j][1] = j-1 ;
B[i][j][2] = HOMO2X ;
}
if (best==homo2y){
B[i][j][0] = i-1 ;
B[i][j][1] = j-2 ;
if (B[i][j][1] < 0) B[i][j][1] = 0 ;
B[i][j][2] = HOMO2Y ;
}
if (mode == Local || mode == LocalEndWithSomeDeletions)
{
......@@ -322,8 +333,24 @@ int DynProg::compute()
void DynProg::backtrack()
{
int ti=best_i;
int tj=best_j;
gap1 = new int[x.size()+1];
linkgap = new int[x.size()+1];
gap2 = new int[y.size()+1];
for (int i = 0; i <=x.size(); i++) {
gap1[i] = 0;
linkgap[i] = 0;
}
for (int i = 0; i <= y.size(); i++) {
gap2[i] = 0;
}
int g1=x.size();
int g2=y.size();
int ti=best_i+1;
int tj=best_j+1;
int tmpi, tmpj;
tmpi=ti;
tmpj=tj;
......@@ -336,14 +363,6 @@ void DynProg::backtrack()
ostringstream back_tr;
ostringstream back;
if (x[ti]==y[tj]){
back_tr << "|";
}else{
back_tr << ":";
}
back_s1 << x[ti];
back_s2 << y[tj];
while ( B[ti][tj][2] != FIN ){
......@@ -351,42 +370,64 @@ void DynProg::backtrack()
tmpj=B[ti][tj][1];
if (B[ti][tj][2] == SUBST ){
linkgap[g1]=g2;
back_s1 << x[ti-1];
g1--;
back_s2 << y[tj-1];
g2--;
if(x[ti-1]==y[tj-1]) {
back_tr << "|";
}else{
back_tr << ":";
}
}
if (B[ti][tj][2] == INSER ){
linkgap[g1]=g2;
back_s1 << x[ti-1];
g1--;
back_s2 << " ";
gap2[g2]++;
back_tr << ".";
}
if (B[ti][tj][2] == DELET){
linkgap[g1]=g2;
back_s1 << " ";
gap1[g1]++;
back_s2 << y[tj-1];
g2--;
back_tr << ".";
}
if (B[ti][tj][2] == HOMO2X ){
linkgap[g1]=g2;
back_s1 << x[ti-1] << x[ti-2];
g1--;
linkgap[g1]=g2;
g1--;
back_s2 << " " << y[tj-1];
back_tr << " h";
gap2[g2]++;
g2--;
back_tr << " h";
}
if (B[ti][tj][2] == HOMO2Y ){
linkgap[g1]=g2;
back_s1 << " " << x[ti-1] ;
gap1[g1]++;
g1--;
back_s2 << y[tj-1] << y[tj-2] ;
back_tr << " h";
g2--;
g2--;
back_tr << " h";
}
ti=tmpi;
tj=tmpj;
}
str1=back_s1.str();
str1 =string (str1.rbegin(), str1.rend());
......@@ -409,7 +450,6 @@ void DynProg::backtrack()
first_j=tj;
str_back=back.str();
}
string DynProg::SemiGlobal_extract_best()
......
......@@ -8,7 +8,7 @@
#include <iomanip>
#define MINUS_INF -99
#define MINUS_INF -9999
using namespace std;
......@@ -96,13 +96,16 @@ class DynProg
int compute();
void backtrack();
void SemiGlobal_hits_threshold(vector<int> &hits, int threshold, int shift_pos, int verbose);
string SemiGlobal_extract_best();
friend ostream& operator<<(ostream& out, const DynProg& dp);
int **S;
private:
int ***B;
int *gap1 ;
int *linkgap ;
int *gap2 ;
};
......
......@@ -18,6 +18,9 @@
*/
#include "labels.h"
#include <cmath>
#include <cstdlib>
#include "tools.h"
map <string, string> load_map(string map_file)
{
......@@ -59,3 +62,149 @@ map <string, string> load_map(string map_file)
return the_map ;
}
map <string, pair <string, float> > load_map_norm(string map_file)
{
// Loads a simple file with key, values, normalization into a map
map <string, pair <string, float> > the_map ;
if (!map_file.size())
return the_map ;
cout << " <== " << map_file ;
ifstream f(map_file.c_str());
if (!f.is_open())
{
cout << " [failed] " << endl ;
return the_map ;
}
int nb_keys = 0 ;
while (f.good())
{
string line ;
getline (f, line);
int i = line.find(" ");
int j = line.find(" ", i+1);
if (i != (int) string::npos && j != (int) string::npos)
{
string key = line.substr(0, i);
string value = line.substr(i+1, j);
float norm = atof(line.substr(j+1, string::npos).c_str());
nb_keys++ ;
the_map[key] = make_pair(value + " " + the_map[key].first, norm + the_map[key].second);
cout << key << " " << value << " " << norm << endl ;
}
}
cout << ": " << nb_keys << " elements" << endl ;
return the_map ;
}
list< pair <float, int> > compute_normalization_list(map<string, list<Sequence> > &seqs_by_window,
map <string, pair <string, float> > normalization,
int total
)
{
list< pair <float, int> > result;
for (map <string, pair <string, float> >::iterator it = normalization.begin();
it != normalization.end();
it++) {
int nb_occs = seqs_by_window[it->first].size();
// If a normalized window is not found
if (!nb_occs)
continue ;
float norm = it->second.second / (nb_occs*1. / total);
// PRINT_VAR(nb_occs);
// PRINT_VAR(norm);
result.push_back(make_pair(norm, nb_occs));
}
result.sort(pair_occurrence_sort<float>);
return result;
}
float compute_normalization_one(list< pair <float, int> > norm_list, int nb_reads)
// Normalization with the largest standard
{
if (norm_list.empty()) {
return 1.;
}
float highest_norm = norm_list.begin()->first;
return highest_norm;
}
float compute_normalization(list< pair <float, int> > norm_list, int nb_reads)
// Full normalization
{
list<pair <float, int> >::const_iterator it;
float higher_norm;
int higher_value;
if (norm_list.empty()) {
return 1.;
}
// Traverse the list until we find the interesting position (in-between
// someone bigger and smaller)
for (it = norm_list.begin();
it != norm_list.end() && nb_reads <= it->second ; it++) {
higher_norm = it->first;
higher_value = it->second;
// cout << it->second << endl;
}
// At the end the iterator is on the highest value that is lower than nb_reads
if (it == norm_list.begin()) {
// We are above the higher standard
return it->first;
} else {
if (it != norm_list.end()) {
float lower_norm = it->first;
int lower_value = it->second;
float ratio = (higher_value == lower_value) ? 0.5 :
(log(nb_reads) - log(lower_value)) / (log(higher_value) - log(lower_value)) ;
// PRINT_VAR(ratio);
// PRINT_VAR(lower_value);
// PRINT_VAR(higher_value);
// PRINT_VAR(nb_reads);
return lower_norm + (higher_norm - lower_norm) * ratio;
} else {
// We are below the lowest standard
return higher_norm;
}
}
}
void out_json_normalization_names(ostream &out)
{
out << "\"normalizations\": [\"none\", \"highest standard\", \"all standards\"]" ;
}
void out_json_normalization(ostream &out, list< pair <float, int> > norm_list, int nb_reads, int nb_segmented)
{
out << "[ "
<< (float) nb_reads / nb_segmented << ", " // Raw ratio
<< (float) nb_reads * compute_normalization_one(norm_list, nb_reads) / nb_segmented << ", " // normalization against highest standard
<< (float) nb_reads * compute_normalization(norm_list, nb_reads) / nb_segmented << "]" // full normalization
;
}