Commit a519cbc4 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge - merge branch 'rework_data', new 2014.10 .data format

@duez worked a lot to implement the new .data format, described in doc/format-analysis.org
Behind, the access to the objects in the .js code have been streamlined.
parents 7ca20a8d 6935e2ae
......@@ -24,7 +24,12 @@
return lh.second>rh.second;
}
comp_matrix::comp_matrix(WindowsStorage &ws):windows(ws){}
comp_matrix::comp_matrix(list<pair <junction, int> > sc)
{
sort_clones = sc;
matrix_size = sort_clones.size();
if (matrix_size > JUNCTION_LIMIT) matrix_size = JUNCTION_LIMIT;
}
void comp_matrix::compare(ostream &out, Cost cluster_cost)
{
......@@ -37,30 +42,33 @@ void comp_matrix::compare(ostream &out, Cost cluster_cost)
out << " Using cost " << compareCost << endl ;
string j1, j2;
m = alloc_matrix(windows.size());
m = alloc_matrix(sort_clones.size());
int c=0;
int c1=0;
int c2=0;
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
size_t i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
for (mjs::const_iterator it1 = it0;
it1 != windows.getMap().end(); ++it1 )
{
j2=it1->first;
DynProg dp = DynProg(j1, j2, DynProg::Local, compareCost);
int score=dp.compute();
int distance = max(j1.size(), j2.size())-score;
m[c2][c1]=distance;
m[c1][c2]=distance;
c1++;
c++;
}//fin it1
size_t k = 0;
for (list <pair<junction,int> >::const_iterator it1 = it0;
(it1 != sort_clones.end()) & (k<matrix_size); ++it1)
{
k++;
j2=it1->first;
DynProg dp = DynProg(j1, j2, DynProg::Local, compareCost);
int score=dp.compute();
int distance = max(j1.size(), j2.size())-score;
m[c2][c1]=distance;
m[c1][c2]=distance;
c1++;
c++;
}//fin it1
c2++;
c1=c2;
}//fin it0
......@@ -69,14 +77,15 @@ void comp_matrix::compare(ostream &out, Cost cluster_cost)
}
void comp_matrix::load(string file){
m = alloc_matrix(windows.size());
char* tampon=(char*)malloc(windows.size()*sizeof(char));
m = alloc_matrix(sort_clones.size());
char* tampon=(char*)malloc(sort_clones.size()*sizeof(char));
ifstream in_comp(file.c_str());
for(unsigned int i=0; i<windows.size();i++){
in_comp.read(tampon, windows.size()*sizeof(char));
for(unsigned int j=0;j<windows.size(); j++){
for(unsigned int i=0; i<sort_clones.size();i++){
in_comp.read(tampon, sort_clones.size()*sizeof(char));
for(unsigned int j=0;j<sort_clones.size(); j++){
m[i][j]=tampon[j];
}
}
......@@ -90,8 +99,8 @@ void comp_matrix::save(string file){
ofstream out_comp(file.c_str());
for(unsigned int i=0; i<windows.size();i++){
out_comp.write((char *)m[i],windows.size()*sizeof(char));
for(unsigned int i=0; i<sort_clones.size();i++){
out_comp.write((char *)m[i],sort_clones.size()*sizeof(char));
}
out_comp.close();
......@@ -99,7 +108,7 @@ void comp_matrix::save(string file){
void comp_matrix::del(){
for (unsigned int i=0;i<windows.size();i++){
for (unsigned int i=0;i<sort_clones.size();i++){
free(m[i]);
}
free(m);
......@@ -125,40 +134,44 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int c1=0;
int c2=0;
for (mjs::const_iterator ite = windows.getMap().begin();
ite != windows.getMap().end(); ++ite )
size_t i = 0;
for (list <pair<junction,int> >::const_iterator ite = sort_clones.begin();
(ite != sort_clones.end()) & (i<matrix_size); ++ite)
{
i++;
n_j++;
list <string> voisins ;
j1=ite->first;
neighbor[j1]=voisins;
}
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
int k=it0->second.size();
int k=it0->second;
count[j1]=k;
n_j2+=k;
for (mjs::const_iterator it1 = windows.getMap().begin();
it1 != windows.getMap().end(); ++it1 )
size_t j = 0;
for (list <pair<junction,int> >::const_iterator it1 = sort_clones.begin();
(it1 != sort_clones.end()) & (j<matrix_size); ++it1)
{
j2=it1->first;
int distance = (int)m[c2][c1];
if (distance <= epsilon){
neighbor[j1].push_back(j2);
}
c1++;
c++;
}//fin it1
c2++;
c1=0;
}//fin it0
j++;
j2=it1->first;
int distance = (int)m[c2][c1];
if (distance <= epsilon){
neighbor[j1].push_back(j2);
}
c1++;
c++;
}//fin it1
c2++;
c1=0;
}//fin it0
/////////////////////////
//Forced - edges
......@@ -213,9 +226,11 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int noise = 0;
int nb_comp = 0 ;
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
if(visit[j1]==false && clust[j1]==false){
......@@ -382,6 +397,27 @@ void comp_matrix::stat_cluster( list<list<junction> > cluster, ostream &out )
}
JsonArray comp_matrix::toJson(list<list<junction> > clusterList)
{
JsonArray result;
for (list <list <string> >::iterator it = clusterList.begin();
it != clusterList.end(); ++it )
{
JsonArray cluster;
list<string> li=*it;
for (list<string>::iterator it2 = li.begin();
it2 != li.end(); ++it2 )
{
cluster.add(*it2);
}
result.add(cluster);
}
return result;
}
char **comp_matrix::alloc_matrix(size_t s) {
char **matrix = (char**)malloc(s*sizeof(char*));
for (size_t i=0;i<s;i++){
......
......@@ -9,24 +9,27 @@
#include <ctime>
#include "dynprog.h"
#include "windows.h"
#include "json.h"
using namespace std ;
#define SIMILAR_JUNCTIONS_THRESHOLD 1
#define JUNCTION_LIMIT 200
class comp_matrix {
public:
char ** m;
WindowsStorage &windows;
list<pair <junction, int> > sort_clones;
map <string, int> count;
int n_j;
int n_j2;
size_t matrix_size;
/**
* create new distance matrix
*/
comp_matrix(WindowsStorage &windows);
comp_matrix(list<pair <junction, int> > sc);
/**
* init matrix with a KmerStore and compute distance value between sequences
......@@ -65,6 +68,9 @@ class comp_matrix {
void del();
void stat_cluster( list<list<junction> > cluster, ostream &out=cout);
JsonArray toJson( list<list<junction> > cluster);
private:
......
......@@ -22,12 +22,13 @@ Germline::Germline(string _code, char _shortcut,
}
Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
Germline::Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max)
{
code = "" ;
shortcut = 'X' ;
code = _code ;
shortcut = _shortcut ;
// affect_5 = KmerAffect("", "V", 0) ;
// affect_3 = KmerAffect("", "J", 0) ;
......@@ -88,7 +89,8 @@ MultiGermline::MultiGermline(string f_germlines_json)
Fasta rep_3(f_rep_3, 2, "|", cout);
Germline *germline;
germline = new Germline(rep_5, rep_4, rep_3,
germline = new Germline("TRG", 'G',
rep_5, rep_4, rep_3,
seed,
delta_min, delta_max);
......
......@@ -29,7 +29,8 @@ class Germline {
string seed,
int _delta_min, int _delta_max);
Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max);
......
......@@ -164,7 +164,7 @@ string JsonArray::toString(){
stream << " [ ";
for ( list<string>::iterator i=l.begin(); i!= l.end(); ++i){
if (i!=l.begin()) stream << ", ";
if (i!=l.begin()) stream << ", " << endl;
stream << (*i);
}
......
......@@ -230,7 +230,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
// removeChevauchement is called once info was already computed: it is only to output info_extra
info_extra += removeChevauchement();
finishSegmentation();
system = germline->code;
return ;
}
} // end for (Germlines)
......@@ -722,35 +722,25 @@ void FineSegmenter::FineSegmentD(Germline *germline){
JsonList FineSegmenter::toJsonList(Germline *germline){
JsonList result;
result.add("status", because);
result.add("sequence", revcomp(sequence, reversed) );
if (isSegmented()) {
result.add("name", code_short);
result.add("Jstart", Jstart);
result.add("Nlength", (del_V+del_J+seg_N.size()) );
result.add("Vend", Vend);
JsonArray jsonV;
JsonArray jsonJ;
// TODO: what is going on if some list is smaller than JSON_REMEMBER_BEST ?
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonV.add( germline->rep_5.label(score_V[i].second) ) ;
result.add("V", jsonV);
JsonList seg;
seg.add("5", germline->rep_5.label(best_V));
seg.add("5start", 0);
seg.add("5end", Vend);
if (score_D.size()>0){
result.add("Dstart", Dstart);
result.add("Dend", Dend);
JsonArray jsonD;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonD.add( germline->rep_4.label(score_D[i].second) ) ;
result.add("D", jsonD);
seg.add("4", germline->rep_4.label(best_D));
result.add("4start", Dstart);
result.add("4end", Dend);
}
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonJ.add( germline->rep_3.label(score_J[i].second) ) ;
result.add("J", jsonJ);
seg.add("3", germline->rep_3.label(best_J));
seg.add("3start", Jstart);
result.add("seg", seg);
}
return result;
}
......
......@@ -54,7 +54,7 @@ protected:
string info_extra; // .vdj.fa header, other information, at the end of the header
int best_V, best_J ;
int del_V, del_D_left, del_D_right, del_J ;
string seg_V, seg_N, seg_J;
string seg_V, seg_N, seg_J, system;
int best_D;
string seg_N1, seg_D, seg_N2;
......
......@@ -11,6 +11,12 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
nb_reads_germline[germline->code] = 0;
}
while (reads->hasNext()) {
reads->next();
nb_reads++;
......@@ -36,6 +42,8 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
*out_segmented << seg.getKmerAffectAnalyser()->toString() << endl;
}
nb_reads_germline[seg.system]++;
} else if (out_unsegmented) {
*out_unsegmented << reads->getSequence();
*out_unsegmented << "#" << segmented_mesg[seg.getSegmentationStatus()] << endl;
......@@ -59,6 +67,10 @@ size_t WindowExtractor::getNbSegmented(SEGMENTED seg) {
return stats[seg].nb;
}
size_t WindowExtractor::getNbReadsGermline(string germline) {
return nb_reads_germline[germline];
}
void WindowExtractor::setSegmentedOutput(ostream *out) {
out_segmented = out;
}
......
......@@ -2,6 +2,8 @@
#define WINDOW_EXTRACTOR_H
#include <iostream>
#include <string>
#include <map>
#include "segment.h"
#include "germline.h"
#include "kmerstore.h"
......@@ -17,6 +19,7 @@ using namespace std;
class WindowExtractor {
private:
size_t nb_reads;
map<string, size_t> nb_reads_germline;
ostream *out_segmented;
ostream *out_unsegmented;
......@@ -63,6 +66,13 @@ class WindowExtractor {
*/
size_t getNbSegmented(SEGMENTED seg);
/**
* @return the number of reads segmented from germline
* @param germline_code: one of the germline code in multigermline
* @pre extract() must have been launched.
*/
size_t getNbReadsGermline(string germline_code);
/**
* Defines the output stream where the segmented sequences will be output.
* Otherwise no output will be given.
......
......@@ -163,21 +163,22 @@ JsonArray WindowsStorage::sortedWindowsToJsonArray(map <junction, JsonList> json
{
JsonList windowsList;
JsonArray json_size;
JsonArray json_reads;
JsonArray json_seg;
json_size.add(it->second);
json_reads.add(it->second);
if (json_data_segment.find(it->first) != json_data_segment.end()){
windowsList.concat(json_data_segment[it->first]);
}else{
windowsList.add("sequence", 0); //TODO need to compute representative sequence for this case
}
windowsList.add("window", it->first);
windowsList.add("size", json_size);
windowsList.add("id", it->first);
windowsList.add("reads", json_reads);
windowsList.add("top", top++);
windowsList.add("id", this->getId(it->first));
//windowsList.add("id", this->getId(it->first));
JsonList seg_stat = this->statusToJson(it->first);
json_seg.add(seg_stat);
windowsList.add("germline", germline_by_window[it->first]->code);
windowsList.add("seg_stat", json_seg);
windowsArray.add(windowsList);
}
......
!LAUNCH: ../../vidjil -G ../../germline/IGH -r 5 -d ../../data/Stanford_S22.fasta ; cat out/vidjil.data | sh format-json.sh
$ Number of reads
1:"reads_total" : [ 13153 ] ,
1:"total" : [ 13153 ] ,
$ Number of segmented reads
1:"reads_segmented" : [ 13139 ] ,
1:"segmented" : [ 13139 ]
$ Most abundant window
1:"window" : "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", "size" : [ 8 ]
1:"id" : "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", "reads" : [ 8 ]
......@@ -4,7 +4,7 @@ $ Points list
f1:"point": [ "", "" ]
$ Most abundant window, twice, fused
f1:"window": "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", "id": .*, "size": [ 8, 8 ]
f1:"id": "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", .*"reads": [ 8, 8 ]
# Fails since 49046ca6b97, no more 'others'
$ Windows that are not in the top 50
......
......@@ -15,7 +15,7 @@ void testSegmentationBug1(int delta_min, int delta_max) {
Fasta seqJ("../../germline/TRGJ.fa");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##############", delta_min, delta_max);
germline = new Germline("custom", 'x', seqV, seqV, seqJ, "##############", delta_min, delta_max);
MultiGermline *multi ;
multi = new MultiGermline();
......
......@@ -47,8 +47,10 @@ void testCluster() {
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGTCTAGGGGG", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCATGCCCCCC", seq, 0, 0);
windows.sort();
list<pair <junction, int> > sort_clones = windows.getSortedList();
comp_matrix comp=comp_matrix(windows);
comp_matrix comp=comp_matrix(sort_clones);
//create matrix using junctions
comp.compare( cout, Cluster);
......
......@@ -20,7 +20,7 @@ void testFineSegment()
Fasta data("../../data/Stanford_S22.fasta", 1, " ");
Germline *germline ;
germline = new Germline(seqV, seqD, seqJ, "####", 0, 50);
germline = new Germline("IGH", 'G', seqV, seqD, seqJ, "####", 0, 50);
Sequence seq = data.read(2);
......@@ -70,9 +70,9 @@ void testSegmentOverlap()
Fasta data("../../data/bug-segment-overlap.fa", 1, " ");
Germline *germline1 ;
germline1 = new Germline(seqV, seqV, seqJ, "##########", -50, 50);
germline1 = new Germline("TRG", 'G', seqV, seqV, seqJ, "##########", -50, 50);
Germline *germline2 ;
germline2 = new Germline(seqV, seqV, seqJ, "##########", -50, 50);
germline2 = new Germline("TRG2", 'G', seqV, seqV, seqJ, "##########", -50, 50);
MultiGermline *multi1 ;
multi1 = new MultiGermline();
......@@ -103,7 +103,7 @@ void testSegmentationCause() {
Fasta data("../../data/segmentation.fasta", 1, " ");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##########", 0, 10);
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, "##########", 0, 10);
MultiGermline *multi ;
multi = new MultiGermline();
......@@ -197,7 +197,7 @@ void testExtractor() {
OnlineFasta data("../../data/segmentation.fasta", 1, " ");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##########", 0, 10);
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, "##########", 0, 10);
MultiGermline *multi ;
multi = new MultiGermline();
......
......@@ -419,7 +419,7 @@ int main (int argc, char **argv)
min_reads_clone = 0 ;
max_clones = 0 ;
break ;
// Seeds
case 's':
......@@ -793,7 +793,8 @@ int main (int argc, char **argv)
Fasta rep_J(f_rep_J, 2, "|", cout);
Germline *germline;
germline = new Germline(rep_V, rep_D, rep_J, seed,
germline = new Germline(germline_system, 'X',
rep_V, rep_D, rep_J, seed,
delta_min, delta_max);
multigermline->insert(germline);
......@@ -908,13 +909,16 @@ int main (int argc, char **argv)
cout << " ! No windows with current parameters." << endl;
}
if (command == CMD_CLONES) {
//////////////////////////////////
//$$ Clustering
windowsStorage->sort();
list<pair <junction, int> > sort_clones = windowsStorage->getSortedList();
cout << " ==> " << sort_clones.size() << " clones" << endl ;
list <list <junction> > clones_windows;
comp_matrix comp=comp_matrix(*windowsStorage);
comp_matrix comp=comp_matrix(sort_clones);
if (command == CMD_CLONES) {
if (epsilon || forced_edges.size())
{
......@@ -952,10 +956,6 @@ int main (int argc, char **argv)
// || ((clone_nb_reads >= min_reads_clone)
// && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
windowsStorage->sort();
list<pair <junction, int> > sort_clones = windowsStorage->getSortedList();
cout << " ==> " << sort_clones.size() << " clones" << endl ;
if (sort_clones.size() == 0)
{
cout << " ! No clones with current parameters." << endl;
......@@ -1223,33 +1223,65 @@ int main (int argc, char **argv)
JsonArray json_nb_segmented;
json_nb_segmented.add(nb_segmented);
JsonArray json_original_names;
json_original_names.add(f_reads);
JsonArray json_timestamp;
json_timestamp.add(time_buffer);