Commit 113ee8ee authored by Mathieu Giraud's avatar Mathieu Giraud

Vidjil: release 2014.02

	* Refactored main vidjil.cpp (core/windows.cpp, core/windowExtractor.cpp)
	* Removed unused html output
	* Better json output (core/json.cpp)
	* Updated main stdout, with representative sequence for each clone
	* Updated parameters for FineSegmenter (delta_max) and dynprog (substition cost)
	* Bugs closed
parent 2cbd3725
VIDJIL_ALGO_SRC = algo/
VIDJIL_SERVER_SRC = server/
all:
make -C src
make -C $(VIDJIL_ALGO_SRC)
test: all
make -C src/tests
# make -C $(VIDJIL_SERVER_SRC) tests
make -C $(VIDJIL_ALGO_SRC)/tests
make should
should: all
@echo
@echo "*** Launching .should_get tests..."
make -C src/tests should
make -C $(VIDJIL_ALGO_SRC)/tests should
@echo "*** All .should_get tests passed"
data germline: %:
make -C $@
clean:
make -C src clean
make -C $(VIDJIL_ALGO_SRC) clean
cleanall: clean
make -C data $^
......@@ -25,11 +31,11 @@ cleanall: clean
.PHONY: all test should clean cleanall distrib data germline
RELEASE_TAG="notag"
RELEASE_H = src/release.h
RELEASE_SOURCE = $(wildcard src/*.cpp) $(wildcard src/*.h) $(wildcard src/core/*.cpp) $(wildcard src/tests/*.cpp) $(wildcard src/core/*.h) $(wildcard src/tests/*.h)
RELEASE_MAKE = ./Makefile src/Makefile src/tests/Makefile germline/Makefile data/Makefile
RELEASE_TESTS = data/get-sequences $(wildcard data/*.fa) $(wildcard data/*.fq) src/tests/should-to-tap.sh $(wildcard src/tests/*.should_get) $(wildcard src/tests/bugs/*.fa) $(wildcard src/tests/bugs/*.should_get)
RELEASE_FILES = $(RELEASE_SOURCE) $(RELEASE_TESTS) $(RELEASE_MAKE) germline/get-germline germline/split-from-imgt.py doc/README doc/LICENSE
RELEASE_H = $(VIDJIL_ALGO_SRC)/release.h
RELEASE_SOURCE = $(wildcard $(VIDJIL_ALGO_SRC)/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.h)
RELEASE_MAKE = ./Makefile $(VIDJIL_ALGO_SRC)/Makefile $(VIDJIL_ALGO_SRC)/tests/Makefile germline/Makefile data/Makefile
RELEASE_TESTS = data/get-sequences $(wildcard data/*.fa) $(wildcard data/*.fq) $(VIDJIL_ALGO_SRC)/tests/should-to-tap.sh $(wildcard $(VIDJIL_ALGO_SRC)/tests/*.should_get) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.fa) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.should_get)
RELEASE_FILES = $(RELEASE_SOURCE) $(RELEASE_TESTS) $(RELEASE_MAKE) germline/get-germline germline/split-from-imgt.py doc/README doc/LICENSE data/segmentation.fasta
RELEASE_ARCHIVE = vidjil-$(RELEASE_TAG).tgz
CURRENT_DIR = vidjil
......
......@@ -43,7 +43,7 @@ $(LIBCORE): $(OBJCORE)
rm -f $(LIBCORE)
ar rc $@ $^
%.o: %.cpp %.h
%.o: %.cpp %.h core/dynprog.h
$(CC) -o $@ -c $< $(CFLAGS)
%.o: %.cpp
......
......@@ -16,8 +16,6 @@ int main(int argc, char* argv[])
{
const char* fdata;
DynProg::DynProgMode dpMode = DynProg::Global;
Cost dpCost = VDJ;
ostringstream ost;
ostream * p;
p=&ost;
......
align: align.o
align: core align.o
mkdir -p ../../interface/cgi
g++ -o ../../interface/cgi/align.cgi align.o ../core.a
align.o: align.cpp
g++ -c align.cpp
core:
cd .. ; make
......@@ -20,55 +20,37 @@
#include "cluster-junctions.h"
#include<cstdlib>
int total_nb_reads (list<junction> clone, map <string, list<Sequence> > seqs_by_junction)
{
int total = 0 ;
for (list <junction>::const_iterator it = clone.begin(); it != clone.end(); ++it)
total += seqs_by_junction[*it].size() ;
return total ;
}
bool MySort(const pair<int, string>& lh, const pair<int, string>& rh){
return lh.second>rh.second;
}
comp_matrix::comp_matrix(MapKmerStore<Kmer> *junc){
junctions=junc;
}
comp_matrix::comp_matrix(WindowsStorage &ws):windows(ws){}
void comp_matrix::compare(ostream &out, Cost cluster_cost)
{
// DEBUG // out << " DEBUT COMPARE JUNCTION" << endl ;
// clock_t start = clock();
typedef map<string,Kmer> msK ;
typedef map<junction,list<Sequence> > mjs ;
Cost compareCost = cluster_cost;
out << " Using cost " << compareCost << endl ;
map<string,Kmer> z = junctions->store;
string j1, j2;
size=z.size();
m=(char**)malloc(z.size()*sizeof(char*));
for (int i=0;i<size;i++){
m[i]=(char *)malloc(z.size()*sizeof(char));
}
m = alloc_matrix(windows.size());
int c=0;
int c1=0;
int c2=0;
for (msK::const_iterator it0 = z.begin();
it0 != z.end(); ++it0 )
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
j1=it0->first;
for (msK::const_iterator it1 = it0;
it1 != z.end(); ++it1 )
for (mjs::const_iterator it1 = it0;
it1 != windows.getMap().end(); ++it1 )
{
j2=it1->first;
DynProg dp = DynProg(j1, j2, DynProg::Local, compareCost);
......@@ -88,20 +70,13 @@ void comp_matrix::compare(ostream &out, Cost cluster_cost)
void comp_matrix::load(string file){
typedef map<string,Kmer> msK ;
map<string,Kmer> z = junctions->store;
size=z.size();
m=(char**)malloc(z.size()*sizeof(char*));
for (int i=0;i<size;i++){
m[i]=(char *)malloc(z.size()*sizeof(char));
}
m = alloc_matrix(windows.size());
char* tampon=(char*)malloc(size*sizeof(char));
char* tampon=(char*)malloc(windows.size()*sizeof(char));
ifstream in_comp(file.c_str());
for(int i=0; i<size;i++){
in_comp.read(tampon, size*sizeof(char));
for(int j=0;j<size; j++){
for(int i=0; i<windows.size();i++){
in_comp.read(tampon, windows.size()*sizeof(char));
for(int j=0;j<windows.size(); j++){
m[i][j]=tampon[j];
}
}
......@@ -115,8 +90,8 @@ void comp_matrix::save(string file){
ofstream out_comp(file.c_str());
for(int i=0; i<size;i++){
out_comp.write((char *)m[i],size*sizeof(char));
for(int i=0; i<windows.size();i++){
out_comp.write((char *)m[i],windows.size()*sizeof(char));
}
out_comp.close();
......@@ -124,7 +99,7 @@ void comp_matrix::save(string file){
void comp_matrix::del(){
for (int i=0;i<size;i++){
for (int i=0;i<windows.size();i++){
free(m[i]);
}
free(m);
......@@ -136,13 +111,11 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
// out << " eps: " << epsilon << " / minPts: " << minPts << endl ;
typedef map<string,Kmer> msK ;
typedef map<junction,list<Sequence> > mjs ;
typedef list<string> li ;
map <string, map <string, bool> > graph ;
map<string,Kmer> z = junctions->store;
////////////////////////
//indexation des voisins
map <string, list <string> > neighbor;
......@@ -153,8 +126,8 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int c1=0;
int c2=0;
for (msK::const_iterator ite = z.begin();
ite != z.end(); ++ite )
for (mjs::const_iterator ite = windows.getMap().begin();
ite != windows.getMap().end(); ++ite )
{
n_j++;
list <string> voisins ;
......@@ -162,21 +135,21 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
neighbor[j1]=voisins;
}
for (msK::const_iterator it0 = z.begin();
it0 != z.end(); ++it0 )
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
j1=it0->first;
int k=it0->second.count;
int k=it0->second.size();
count[j1]=k;
n_j2+=k;
for (msK::const_iterator it1 = z.begin();
it1 != z.end(); ++it1 )
for (mjs::const_iterator it1 = windows.getMap().begin();
it1 != windows.getMap().end(); ++it1 )
{
j2=it1->first;
int distance = (int)m[c2][c1];
if (distance <= epsilon){
neighbor[j1].push_back(j2);
......@@ -241,8 +214,8 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int noise = 0;
int nb_comp = 0 ;
for (msK::const_iterator it0 = z.begin();
it0 != z.end(); ++it0 )
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
j1=it0->first;
if(visit[j1]==false && clust[j1]==false){
......@@ -337,11 +310,9 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
list<list<junction> > comp_matrix::nocluster()
{
list <list < string > > cluster ;
typedef map<string,Kmer> msK ;
map<string,Kmer> z = junctions->store;
for (msK::const_iterator it0 = z.begin();
it0 != z.end(); ++it0 )
for (map<junction, list<Sequence> >::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
list< string > c1;
c1.push_back(it0->first);
......@@ -462,3 +433,11 @@ void comp_matrix::stat_cluster( list<list<junction> > cluster, string neato_file
out << " clusterised occurrences : "<< n_j2c << " (" << jc2 << "%)" << endl ;
}
char **comp_matrix::alloc_matrix(size_t s) {
char **matrix = (char**)malloc(s*sizeof(char*));
for (size_t i=0;i<s;i++){
matrix[i]=(char *)malloc(s*sizeof(char));
}
return matrix;
}
#ifndef CLUSTER_JUNCTIONS_H
#define CLUSTER_JUNCTIONS_H
#include <fstream>
#include <iostream>
......@@ -6,21 +8,17 @@
#include <list>
#include <ctime>
#include "dynprog.h"
#include "kmerstore.h"
#include "windows.h"
using namespace std ;
typedef string junction ;
#define SIMILAR_JUNCTIONS_THRESHOLD 1
class comp_matrix {
public:
char ** m;
int size;
MapKmerStore<Kmer> *junctions;
WindowsStorage &windows;
map <string, int> count;
int n_j;
int n_j2;
......@@ -28,7 +26,7 @@ class comp_matrix {
/**
* create new distance matrix
*/
comp_matrix(MapKmerStore<Kmer> *junctions);
comp_matrix(WindowsStorage &windows);
/**
* init matrix with a KmerStore and compute distance value between sequences
......@@ -71,8 +69,13 @@ class comp_matrix {
void del();
void stat_cluster( list<list<junction> > cluster, string neato_file, ostream &out=cout);
};
int total_nb_reads (list<junction> clone, map <string, list<Sequence> > seqs_by_junction);
private:
/**
* Allocates a matrix of size s * s bytes
*/
char **alloc_matrix(size_t s);
};
#endif
......@@ -151,13 +151,19 @@ void DynProg::init()
if (mode == Local || mode == LocalEndWithSomeDeletions)
for (int i=0; i<=m; i++)
S[i][0] = 0 ;
else
for (int i=0; i<=m; i++)
else if (mode == GlobalButMostlyLocal)
for (int i=0; i<=m; i++)
S[i][0] = i * cost.insertion / 2 ;
else // Global, SemiGlobal
for (int i=0; i<=m; i++)
S[i][0] = i * cost.insertion ;
if (mode == SemiGlobal || mode == SemiGlobalTrans || mode == Local || mode == LocalEndWithSomeDeletions)
for (int j=0; j<=n; j++)
S[0][j] = 0 ;
else if (mode == GlobalButMostlyLocal)
for (int j=0; j<=n; j++)
S[0][j] = j * cost.deletion / 2 ;
else
for (int j=0; j<=n; j++)
S[0][j] = j * cost.deletion ;
......@@ -174,7 +180,7 @@ int DynProg::compute()
int inser = S[i-1][j] + cost.insertion;
int delet = S[i][j-1] + cost.deletion;
// Also dealing with homopolymers (in a dirty way!)
// int inser = S[i-1][j] + (i > 1 ? cost.ins(x[i-2], x[i-1]) : cost.ins(0,1));
// int delet = S[i][j-1] + (j > 1 ? cost.del(y[j-2], y[j-1]) : cost.del(0,1));
......@@ -299,7 +305,7 @@ int DynProg::compute()
}
if (mode == Global)
if ((mode == Global) || (mode == GlobalButMostlyLocal))
{
best_i = m ;
best_j = n ;
......@@ -450,6 +456,8 @@ void DynProg::backtrack()
first_j=tj;
str_back=back.str();
// cout << str_back << endl ;
}
string DynProg::SemiGlobal_extract_best()
......
......@@ -47,7 +47,7 @@ ostream& operator<<(ostream& out, const Cost& cost);
/* const Cost VDJ = Cost(+5, -8, -8, -1); */
const Cost DNA = Cost(+5, -4, -10, 0, 0);
const Cost VDJ = Cost(+4, -10, -10, -1, -2);
const Cost VDJ = Cost(+4, -6, -10, -1, -2);
const Cost Identity = Cost(+1, -1, -1, 0, 0);
const Cost Homopolymers = Cost(+1, MINUS_INF, -1); // TODO: true homopolymer
......@@ -68,6 +68,7 @@ class DynProg
LocalEndWithSomeDeletions, // local + some deletions on __
SemiGlobalTrans, // start-to-partial x / partial-to-end y
SemiGlobal, // complete x / partial y
GlobalButMostlyLocal, // complete x / complete y, but deletions at begin (DONE) and end (TODO) are cheaper
Global // complete x / complete y
} ;
......
......@@ -47,14 +47,14 @@ Fasta::Fasta(const string &input,
exit(1);
}
out << " <== " << input << endl ;
out << " <== " << input ;
while (is.good()) {
is >> *this;
}
is.close();
out << " " << total_size << " bp in " << size() << " sequences" << endl ;
out << "\t" << setw(6) << total_size << " bp in " << setw(3) << size() << " sequences" << endl ;
}
int Fasta::size() const{ return (int)reads.size(); }
......@@ -86,6 +86,8 @@ OnlineFasta::OnlineFasta(const string &input,
if (this->input->fail()) {
throw ios_base::failure("!! Error in opening file "+input);
}
cout << " <== " << input << endl ;
input_allocated = true;
init();
}
......
......@@ -5,7 +5,6 @@
#include <string>
#include <vector>
#include <list>
#include "tools.h"
using namespace std;
......@@ -23,6 +22,8 @@ typedef enum {
FASTX_FASTQ_SEP, FASTX_FASTQ_QUAL
} fasta_state;
#include "tools.h"
class Fasta
{
int total_size;
......
#include <string>
#include "json.h"
#include <sstream>
#include "labels.h"
using namespace std ;
JsonData::JsonData(){
}
string JsonData::toString(){
ostringstream stream;
stream << "\"" << name << "\" : " << data << endl;
return stream.str();;
}
JsonList::JsonList(){
}
void JsonList::add(string n, string d){
JsonData elem;
elem.name=n;
elem.data="\""+d+"\"";
l.push_back(elem);
}
void JsonList::add(string n, float d){
JsonData elem;
ostringstream stream;
stream << d;
elem.name=n;
elem.data=stream.str();
l.push_back(elem);
}
void JsonList::add(string n, JsonList &d){
JsonData elem;
elem.name=n;
elem.data=d.toString();
l.push_back(elem);
}
void JsonList::add(string n, JsonArray &d){
JsonData elem;
elem.name=n;
elem.data=d.toString();
l.push_back(elem);
}
void JsonList::concat(JsonList &d){
l.insert(l.end(), d.l.begin(), d.l.end());
}
string JsonList::toString(){
ostringstream stream;
stream << " { ";
for ( list<JsonData>::iterator i=l.begin(); i!= l.end(); ++i){
if (i!=l.begin()) stream << ",";
stream << endl << "\"" << (*i).name << "\" : " << (*i).data ;
}
stream << endl << " } ";
return stream.str();;
}
JsonArray::JsonArray(){
}
void JsonArray::add(string d){
l.push_back("\""+d+"\"");
}
void JsonArray::add(float d){
ostringstream stream;
stream << d;
l.push_back(stream.str());
}
void JsonArray::add(JsonArray &d){
l.push_back(d.toString());
}
void JsonArray::add(JsonList &d){
l.push_back(d.toString());
}
string JsonArray::toString(){
ostringstream stream;
stream << " [ ";
for ( list<string>::iterator i=l.begin(); i!= l.end(); ++i){
if (i!=l.begin()) stream << ", ";
stream << (*i);
}
stream << " ] ";
return stream.str();
}
JsonArray json_normalization_names()
{
JsonArray result;
result.add("none");
result.add("highest standard");
result.add("all standards");
return result;
}
JsonArray json_normalization( list< pair <float, int> > norm_list, int nb_reads, int nb_segmented)
{
JsonArray result;
result.add( (float) nb_reads / nb_segmented );
result.add( (float) nb_reads * compute_normalization_one(norm_list, nb_reads) / nb_segmented );
result.add( (float) nb_reads * compute_normalization(norm_list, nb_reads) / nb_segmented );
return result;
}
#ifndef JSON_H
#define JSON_H
#include <string>
#include <list>
#include "labels.h"
using namespace std ;
class JsonData
{
public:
string name;
string data;
JsonData();
string toString();
};
class JsonList;
class JsonArray
{
public:
list<string> l;
JsonArray();
void add(string d);
void add(float d);
void add(JsonList &d);
void add(JsonArray &d);
string toString();
};
class JsonList
{
public:
list<JsonData> l;
JsonList();
void add(string n, string d);
void add(string n, float d);
void add(string n, JsonList &d);
void add(string n, JsonArray &d);
void concat(JsonList &d);
string toString();
};
JsonArray json_normalization_names();
JsonArray json_normalization( list< pair <float, int> > norm_list, int nb_reads, int nb_segmented);
#endif
......@@ -22,6 +22,7 @@
#include <cstdlib>
#include "tools.h"
map <string, string> load_map(string map_file)
{
// Loads a simple file with key, values into a map
......@@ -195,16 +196,4 @@ float compute_normalization(list< pair <float, int> > norm_list, int nb_reads)
}
}
void out_json_normalization_names(ostream &out)
{
out << "\"normalizations\": [\"none\", \"highest standard\", \"all standards\"]" ;
}
void out_json_normalization(ostream &out, list< pair <float, int> > norm_list, int nb_reads, int nb_segmented)
{
out << "[ "
<< (float) nb_reads / nb_segmented << ", " // Raw ratio
<< (float) nb_reads * compute_normalization_one(norm_list, nb_reads) / nb_segmented << ", " // normalization against highest standard
<< (float) nb_reads * compute_normalization(norm_list, nb_reads) / nb_segmented << "]" // full normalization
;
}
......@@ -17,6 +17,3 @@ list< pair <float, int> > compute_normalization_list(map<string, list<Sequence>
float compute_normalization_one(list< pair <float, int> > norm_list, int nb_reads);
float compute_normalization(list< pair <float, int> > norm_list, int nb_reads);
void out_json_normalization_names(ostream &out);