Commit f6e9c1fe authored by Marc Duez's avatar Marc Duez
parents 4b8f6fc9 10453486
......@@ -57,7 +57,10 @@ authentication).
* Code and license
Vidjil is open-source, released under GNU GPLv3 license.
The development code is available on [[http://www.github.com/magiraud/vidjil]].
You are welcome to redistribute it under [[http://git.vidjil.org/blob/master/doc/LICENSE][certain conditions]].
This software is for research use only and comes with no warranty.
The development code is available on [[http://git.vidjil.org/]].
Bug reports, issues and patches are welcome.
* The Vidjil team
......
......@@ -321,16 +321,8 @@ list<list<junction> > comp_matrix::nocluster()
return cluster;
}
void comp_matrix::stat_cluster( list<list<junction> > cluster, string neato_file, ostream &out )
void comp_matrix::stat_cluster( list<list<junction> > cluster, ostream &out )
{
ofstream f_graph;
f_graph.open((neato_file + ".dot").c_str());
f_graph << "graph pairs {" << endl ;
f_graph << " ratio = 1.618 " << endl ;
f_graph << " node [shape=box,colorscheme=pastel19,style=filled]" << endl ;
f_graph << " graph [overlap=true]" << endl ;
int n_cluster=0; //nombre de cluster
int n_100cluster=0; //nombre de cluster de taille superieur a 100
int max_size=0; //taille du plus gros cluster
......@@ -340,7 +332,6 @@ void comp_matrix::stat_cluster( list<list<junction> > cluster, string neato_file
float jc2=0; //nombre de junction clusterisé(%)
float moy=0; //taille moyenne des clusters
int color=0;
//recherche du plus gros cluster
for (list <list <string> >::iterator it = cluster.begin();
it != cluster.end(); ++it )
......@@ -361,42 +352,11 @@ void comp_matrix::stat_cluster( list<list<junction> > cluster, string neato_file
if (size> max_size) max_size=size;
if (size>=100) {
color++;
n_100cluster++;
//boucle neato
string j1,j2;
int n=0;
for (list<string>::iterator it2 = li.begin();
it2 != li.end() ; ++it2 )
{
n++;
if (n<=16){
j1=*it2;
if (it2 != li.begin())
{
f_graph << " " <<"n"<<count[j2]<<"_"<<j2<< " -- " <<"n"<<count[j1]<<"_"<<j1<< " [style=dashed,color=red]" << endl ;
}
f_graph << " " <<"n"<<count[j1]<<"_"<<j1<< " [fillcolor=" << 1+(color%9) << "]" << endl ;
j2=j1;
}
}
}
}
f_graph << "}" << endl ;
f_graph.close();
string com = "neato "+neato_file+".dot -Tpdf -o "+neato_file+".pdf" ; // TODO: use out_dir + GRAPH_FILE
out << " " << com << endl ;
if (system(com.c_str()) == -1) {
perror("Launching neato");
exit(4);
}
if(n_j2c!=0)
moy=n_j2c/n_cluster;
......
......@@ -66,7 +66,7 @@ class comp_matrix {
*/
void del();
void stat_cluster( list<list<junction> > cluster, string neato_file, ostream &out=cout);
void stat_cluster( list<list<junction> > cluster, ostream &out=cout);
private:
......
#include "msa.h"
#include <cstdlib>
#include <algorithm>
list<string> multiple_seq_align(string file)
{
// Extremement non generique...
list<string> out;
string alig_file = file + ".aln" ;
string com = CLUSTALW + file + " -outfile="+alig_file+" -outorder=INPUT -align -type=dna -pairgap=3 -pwmatrix=gonnet -pwdnamatrix=iub -pwgapopen=10 -pwgapext=2.0 -matrix=gonnet -dnamatrix=iub -gapopen=10 -gapext=2.0 -gapdist=1 -clustering=NJ > /dev/null 2> /dev/null";
// cout << com << endl ;
cout << "[clustalw... " ;
cout.flush() ;
int ret_code = system(com.c_str());
ifstream alig(alig_file.c_str());
if (ret_code == -1 || !alig.is_open())
{
cout << "FAILED] " << endl ;
cout.flush();
return out;
}
cout << "ok]" << endl ;
cout.flush() ;
while (alig.good())
{
string line ;
getline (alig, line);
replace (line.begin(), line.end(), 'N', ' ');
if (line.find("junction") != (size_t) string::npos)
{
if (line.size() > 22)
out.push_front(line.substr(22));
else
out.push_front(line);
}
}
alig.close();
return out ;
}
#include <iostream>
#include <fstream>
#include <string>
#include <list>
#define CLUSTALW "clustalw2 "
using namespace std ;
list<string> multiple_seq_align(string file);
......@@ -163,7 +163,7 @@ void KmerRepresentativeComputer::compute() {
is_computed = true;
representative = sequence_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
representative.label += "-[" + string_of_int(pos_longest_run) + ","
representative.label = "representative--" + representative.label + "-[" + string_of_int(pos_longest_run) + ","
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]";
}
delete index;
......
......@@ -6,7 +6,6 @@
#include <cstdlib>
#include "core/dynprog.h"
#include "core/fasta.h"
#include "core/msa.h"
#include "core/lazy_msa.h"
......
......@@ -5,7 +5,6 @@
#include "core/cluster-junctions.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
......
......@@ -15,26 +15,26 @@ void testRepresentative() {
Sequence representative = krc.getRepresentative();
// Seq3 is the longest it should be taken when performing 0 extra iteration
TAP_TEST(representative.label.find("seq3-[37,73]") == 0, TEST_KMER_REPRESENTATIVE,
TAP_TEST(representative.label.find("representative--seq3-[37,73]") == 0, TEST_KMER_REPRESENTATIVE,
"If we take the first representative we should have seq3, and not at the beginning (" << representative.label << " instead)");
krc.setStabilityLimit(1);
krc.compute();
representative = krc.getRepresentative();
TAP_TEST(representative.label.find("seq3-[37,73]") == 0, TEST_KMER_REPRESENTATIVE,
TAP_TEST(representative.label.find("representative--seq3-[37,73]") == 0, TEST_KMER_REPRESENTATIVE,
"When allowing one step before stability, we should still have seq3 (" << representative.label << " instead)");
krc.setStabilityLimit(2);
krc.compute();
representative = krc.getRepresentative();
TAP_TEST(representative.label.find("seq1-[0,41]") == 0, TEST_KMER_REPRESENTATIVE,
TAP_TEST(representative.label.find("representative--seq1-[0,41]") == 0, TEST_KMER_REPRESENTATIVE,
"When allowing two steps before stability, we should reach seq1 (" << representative.label << " instead)");
krc.setRevcomp(true);
krc.setRequiredSequence("ATCGCGCCCT"); // revcomp
krc.compute();
representative = krc.getRepresentative();
TAP_TEST(representative.label.find("seq2-[33,52]") == 0, TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ,
TAP_TEST(representative.label.find("representative--seq2-[33,52]") == 0, TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ,
"When requiring sequence ATCGCGCCCT, we should have seq2 (" << representative.label << " instead)");
krc.setRevcomp(false);
......
......@@ -41,7 +41,6 @@
#include "core/dynprog.h"
#include "core/read_score.h"
#include "core/read_chooser.h"
#include "core/msa.h"
#include "core/compare-all.h"
#include "core/teestream.h"
#include "core/mkdir.h"
......@@ -90,7 +89,6 @@ enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT, CMD_GERMLINES } ;
#define UNSEGMENTED_FILENAME "unsegmented.fa"
#define EDGES_FILENAME "edges"
#define COMP_FILENAME "comp.data"
#define GRAPH_FILENAME "graph"
#define JSON_SUFFIX ".data"
// "tests/data/leukemia.fa"
......@@ -171,10 +169,10 @@ void usage(char *progname)
<< " -r <nb> minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl
<< endl
<< "Clusterisation" << endl
<< " -e <file> manual clusterisation -- a file used to force some specific edges" << endl
<< " -n <int> maximum distance between neighbors for automatic clusterisation (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
<< " -N <int> minimum required neighbors for automatic clusterisation (default " << DEFAULT_MINPTS << ")" << endl
<< "Additional clustering (not output in vidjil.data and therefore not used in the browser)" << endl
<< " -e <file> manual clustering -- a file used to force some specific edges" << endl
<< " -n <int> maximum distance between neighbors for automatic clustering (default " << DEFAULT_EPSILON << "). No automatic clusterisation if =0." << endl
<< " -N <int> minimum required neighbors for automatic clustering (default " << DEFAULT_MINPTS << ")" << endl
<< " -S generate and save comparative matrix for clustering" << endl
<< " -L load comparative matrix for clustering" << endl
<< " -C <string> use custom Cost for automatic clustering : format \"match, subst, indels, homo, del_end\" (default "<<Cluster<<" )"<< endl
......@@ -205,7 +203,7 @@ void usage(char *progname)
<< " -o <dir> output directory (default: " << OUT_DIR << ")" << endl
<< " -p <string> prefix output filenames by the specified string" << endl
<< " -a output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
<< " -a output all sequences by cluster (" << SEQUENCES_FILENAME << "), to be used only on small datasets" << endl
<< " -x do not compute representative sequences" << endl
<< " -v verbose mode" << endl
<< endl
......@@ -228,6 +226,11 @@ int main (int argc, char **argv)
cout << "# Vidjil -- V(D)J recombinations analysis <http://www.vidjil.org/>" << endl
<< "# Copyright (C) 2011, 2012, 2013, 2014 by the Vidjil team" << endl
<< "# Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille" << endl
<< endl
<< "# Vidjil is free software, and you are welcome to redistribute it" << endl
<< "# under certain conditions -- see http://git.vidjil.org/blob/master/doc/LICENSE" << endl
<< "# No lymphocyte was harmed in the making of this software," << endl
<< "# however this software is for research use only and comes with no warranty." << endl
<< endl ;
//$$ options: defaults
......@@ -276,7 +279,6 @@ int main (int argc, char **argv)
bool output_sequences_by_cluster = false;
bool detailed_cluster_analysis = true ;
bool very_detailed_cluster_analysis = false ;
bool output_segmented = false;
bool output_unsegmented = false;
......@@ -429,7 +431,7 @@ int main (int argc, char **argv)
#endif
break;
// Clusterisation
// Clustering
case 'n':
epsilon = atoi(optarg);
......@@ -924,7 +926,7 @@ int main (int argc, char **argv)
}
clones_windows = comp.cluster(forced_edges, w, cout, epsilon, minPts) ;
comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, cout );
comp.stat_cluster(clones_windows, cout );
comp.del();
}
else
......@@ -1112,6 +1114,24 @@ int main (int argc, char **argv)
ratio_representative,
max_auditionned);
representative.label = string_of_int(it->second) + "--" + representative.label;
if (output_sequences_by_cluster) // -a option, output all sequences
{
out_sequences << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
out_sequences << it->first << endl;
if (representative != NULL_SEQUENCE)
out_sequences << representative ;
list<Sequence> &sequences = windowsStorage->getReads(it->first);
for (list<Sequence>::const_iterator itt = sequences.begin(); itt != sequences.end(); ++itt)
{
out_sequences << *itt ;
}
}
if (representative == NULL_SEQUENCE) {
clones_without_representative++ ;
......@@ -1122,7 +1142,7 @@ int main (int argc, char **argv)
//$$ There is one representative, FineSegmenter
representative.label = string_of_int(it->second) + "-"
representative.label = string_of_int(it->second) + "--"
+ representative.label;
FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
......@@ -1254,154 +1274,8 @@ int main (int argc, char **argv)
cout << endl ;
out_windows.close();
if (!very_detailed_cluster_analysis)
{
continue ;
}
//$$ Very detailed cluster analysis (with sequences) // NOT USED NOW
list<string> msa;
bool good_msa = false ;
// TODO: do something if no sequences have been segmented !
if (!more_windows)
{
cout << "!! No segmented sequence, deleting clone" << endl ;
// continue ;
} else
{
msa = multiple_seq_align(windows_file_name);
// Alignment of windows
if (!msa.empty())
{
if (msa.size() == sort_windows.size() + more_windows)
{
// cout << "clustalw parse: success" << endl ;
good_msa = true ;
}
else
{
cout << "! clustalw parse: failed" << endl ;
}
}
}
//$$ Second pass: output clone, all representatives
num_seq = 0 ;
list <Sequence> representatives_this_clone ;
string code_representative = "";
for (list <pair<junction, int> >::const_iterator it = sort_windows.begin();
it != sort_windows.end(); ++it) {
num_seq++ ;
string junc ;
if (!good_msa)
{
junc = it->first ;
}
else
{
junc = msa.back();
msa.pop_back();
}
// Output all sequences
if (output_sequences_by_cluster)
{
out_sequences << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ;
out_sequences << it->first << endl;
list<Sequence> &sequences = windowsStorage->getReads(it->first);
for (list<Sequence>::const_iterator itt = sequences.begin(); itt != sequences.end(); ++itt)
{
out_sequences << *itt ;
}
}
Sequence representative
= windowsStorage->getRepresentative(it->first, seed,
min_cover_representative,
ratio_representative,
max_auditionned);
if (representative != NULL_SEQUENCE) {
representative.label = string_of_int(it->second) + "-"
+ representative.label;
FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
if (segment_D)
seg.FineSegmentD(rep_V, rep_D, rep_J);
//cout << seg.toJson();
json_data_segment[it->first]=seg.toJsonList(rep_V, rep_D, rep_J);
if (seg.isSegmented())
{
representatives_this_clone.push_back(seg.getSequence());
}
/// TODO: et si pas isSegmented ?
bool warning = false;
if (num_seq <= 20) /////
{
cout << setw(20) << representative.label << " " ;
cout << " " << junc ;
cout << " " << setw(WIDTH_NB_READS) << it->second << " " ;
cout << (warning ? "§ " : " ") ;
cout << seg.info ;
cout << endl ;
}
}
}
//$$ Display msa
if (good_msa)
{
cout << setw(20) << best_V << " " << msa.back() << endl ;
msa.pop_back();
if (segment_D)
{
cout << setw(20) << best_D << " " << msa.back() << endl ;
msa.pop_back();
}
cout << setw(20) << best_J << " " << msa.back() << endl ;
msa.pop_back();
}
out_clone.close();
cout << endl;
//$$ Compare representatives of this clone
cout << "Comparing representatives of this clone 2 by 2" << endl ;
// compare_all(representatives_this_clone);
SimilarityMatrix matrix = compare_all(representatives_this_clone, true);
cout << RawOutputSimilarityMatrix(matrix, 90);
//Sort all windows
windowsStorage->sort();
//Compute all the edges
SimilarityMatrix matrixLevenshtein = compare_windows(*windowsStorage, Levenshtein, max_clones);
//Added distances matrix in the JsonTab
jsonLevenshtein << JsonOutputWindowsMatrix(matrixLevenshtein);
}
//$$ End of very_detailed_cluster_analysis
}
out_edges.close() ;
out_clones.close();
......
......@@ -2246,7 +2246,7 @@ var msg = {
"version_error": "Error &ndash; data file too old (version " + VIDJIL_JSON_VERSION + " required)" + "</br> This data file was generated by a too old version of Vidjil. " + "</br> Please regenerate a newer data file. " + "</br></br> <div class='center' > <button onclick='closePopupMsg()'>ok</button></div>",
"welcome": " <h2>Vidjil <span class='logo'>(beta)</span></h2>" + "(c) 2011-2014, the Vidjil team" + "<br />Marc Duez, Mathieu Giraud and Mikaël Salson" + " &ndash; <a href='http://www.vidjil.org'>http://www.vidjil.org/</a>" + "</br>" + "</br>Vidjil is developed by the <a href='http://www.lifl.fr/bonsai'>Bonsai bioinformatics team</a> (LIFL, CNRS, U. Lille 1, Inria Lille), in collaboration with the <a href='http://biologiepathologie.chru-lille.fr/organisation-fbp/91210.html'>department of Hematology</a> of CHRU Lille"
+ " the <a href='http://www.ircl.org/plate-forme-genomique.html'>Functional and Structural Genomic Platform</a> (U. Lille 2, IFR-114, IRCL)" + " and the <a href='http://www.euroclonality.org/'>EuroClonality-NGS</a> working group." + "</br>" + "</br>This is a beta version, please use it only for test purposes." + " " + "Please cite <a href='http://www.biomedcentral.com/1471-2164/15/409'>BMC Genomics 2014, 15:409</a> if you use Vidjil for your research." +"</br></br> <div class='center' > <button onclick='closePopupMsg()'>start</button></div>",
+ " the <a href='http://www.ircl.org/plate-forme-genomique.html'>Functional and Structural Genomic Platform</a> (U. Lille 2, IFR-114, IRCL)" + " and the <a href='http://www.euroclonality.org/'>EuroClonality-NGS</a> working group." + "<br/>" + "<br>Vidjil is free software, and you are welcome to redistribute it under <a href='http://git.vidjil.org/blob/master/doc/LICENSE'>certain conditions</a>. This software is for research use only and comes with no warranty." + "<br>" + "Please cite <a href='http://www.biomedcentral.com/1471-2164/15/409'>BMC Genomics 2014, 15:409</a> if you use Vidjil for your research." +"<br><br> <div class='center' > <button onclick='closePopupMsg()'>start</button></div>",
"browser_error": "The web browser you are using has not been tested with Vidjil." + "</br>Note in particular that Vidjil is <b>not compatible</b> with Internet Explorer 9.0 or below." +"</br>For a better experience, we recommend to install one of those browsers : " + "</br> <a href='http://www.mozilla.org/'> Firefox </a> " + "</br> <a href='www.google.com/chrome/'> Chrome </a> " + "</br> <a href='http://www.chromium.org/getting-involved/download-chromium'> Chromium </a> " + "</br></br> <div class='center' > <button onclick='popupMsg(msg.welcome)'>I want to try anyway</button></div>",
}
......
......@@ -16,7 +16,7 @@ from a set of reads and detects "windows" overlapping the actual CDR3.
This is based on an fast and reliable seed-based heuristic and allows
to output the most abundant clones. The analysis is extremely fast
because, in the first phase, no alignment is performed with database
germline sequences. Vidjil can also clusterize similar
germline sequences. Vidjil can also cluster similar
clones, or leave this to the user after a manual review.
The method is described in the following paper:
......@@ -64,14 +64,6 @@ make test # run self-tests
./vidjil -h # display help/usage
#+END_SRC
* Optional dependencies
These dependencies have no consequences on the visualization through the
browser. They are intented for a command-line use only.
- clustalw :: to compute alignments between windows from a same clone, by setting
=very_detailed_cluster_analysis= in vidjil.cpp
- neato :: to display graph of neighbors for the automatic clusterisation
* Vidjil parameters
......@@ -128,7 +120,7 @@ Limits to segment a clone
The =-r/-R/-%= options are strong thresholds: if a clone does not have
the requested number of reads, the clone is discarded (except when
using =-l=, see below).
The =-r= option is applied before the additional clusterization, the
The =-r= option is applied before the additional clustering, the
=-R/-%= options after it.
The default =-r 10 -R 10= options are meant to only output clones that
have a significant read support. You can safely put =-r 1 -R 1= if you
......@@ -165,7 +157,14 @@ The first column of the file is the window to be followed
while the remaining columns consist of the window's label.
In Vidjil output, the labels are output alongside their windows.
** Manual clustering
** Further clustering
These options have no consequences on the visualization through the
browser. They are intented for a command-line use only.
Setting the =-n= option triggers an additional automatic
clustering using DBSCAN algorithm (Ester and al., 1996).
The =-e= option allows to specify a file for manually clustering two windows
considered as similar. Such a file may be automatically produced by vidjil
......@@ -221,7 +220,7 @@ CTATGATAGTAGTGGTTATTACGGGGTAGGGCAGTACTACTACTACTACATGGACGTCTG
#+BEGIN_SRC sh
./vidjil -c clones -G germline/IGH -x -r 1 -R 5 -n 5 -d ./data/clones_simul.fa
# Window extraction + clone gathering,
# with automatic clusterisation, distance five (-n 5)
# with automatic clustering, distance five (-n 5)
#+END_SRC
#+BEGIN_SRC sh
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment