Attention une mise à jour du service Gitlab va être effectuée le mardi 14 décembre entre 13h30 et 14h00. Cette mise à jour va générer une interruption du service dont nous ne maîtrisons pas complètement la durée mais qui ne devrait pas excéder quelques minutes.

Commit a519cbc4 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge - merge branch 'rework_data', new 2014.10 .data format

@duez worked a lot to implement the new .data format, described in doc/format-analysis.org
Behind, the access to the objects in the .js code have been streamlined.
parents 7ca20a8d 6935e2ae
......@@ -24,7 +24,12 @@
return lh.second>rh.second;
}
comp_matrix::comp_matrix(WindowsStorage &ws):windows(ws){}
comp_matrix::comp_matrix(list<pair <junction, int> > sc)
{
sort_clones = sc;
matrix_size = sort_clones.size();
if (matrix_size > JUNCTION_LIMIT) matrix_size = JUNCTION_LIMIT;
}
void comp_matrix::compare(ostream &out, Cost cluster_cost)
{
......@@ -37,30 +42,33 @@ void comp_matrix::compare(ostream &out, Cost cluster_cost)
out << " Using cost " << compareCost << endl ;
string j1, j2;
m = alloc_matrix(windows.size());
m = alloc_matrix(sort_clones.size());
int c=0;
int c1=0;
int c2=0;
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
size_t i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
for (mjs::const_iterator it1 = it0;
it1 != windows.getMap().end(); ++it1 )
{
j2=it1->first;
DynProg dp = DynProg(j1, j2, DynProg::Local, compareCost);
int score=dp.compute();
int distance = max(j1.size(), j2.size())-score;
m[c2][c1]=distance;
m[c1][c2]=distance;
c1++;
c++;
}//fin it1
size_t k = 0;
for (list <pair<junction,int> >::const_iterator it1 = it0;
(it1 != sort_clones.end()) & (k<matrix_size); ++it1)
{
k++;
j2=it1->first;
DynProg dp = DynProg(j1, j2, DynProg::Local, compareCost);
int score=dp.compute();
int distance = max(j1.size(), j2.size())-score;
m[c2][c1]=distance;
m[c1][c2]=distance;
c1++;
c++;
}//fin it1
c2++;
c1=c2;
}//fin it0
......@@ -69,14 +77,15 @@ void comp_matrix::compare(ostream &out, Cost cluster_cost)
}
void comp_matrix::load(string file){
m = alloc_matrix(windows.size());
char* tampon=(char*)malloc(windows.size()*sizeof(char));
m = alloc_matrix(sort_clones.size());
char* tampon=(char*)malloc(sort_clones.size()*sizeof(char));
ifstream in_comp(file.c_str());
for(unsigned int i=0; i<windows.size();i++){
in_comp.read(tampon, windows.size()*sizeof(char));
for(unsigned int j=0;j<windows.size(); j++){
for(unsigned int i=0; i<sort_clones.size();i++){
in_comp.read(tampon, sort_clones.size()*sizeof(char));
for(unsigned int j=0;j<sort_clones.size(); j++){
m[i][j]=tampon[j];
}
}
......@@ -90,8 +99,8 @@ void comp_matrix::save(string file){
ofstream out_comp(file.c_str());
for(unsigned int i=0; i<windows.size();i++){
out_comp.write((char *)m[i],windows.size()*sizeof(char));
for(unsigned int i=0; i<sort_clones.size();i++){
out_comp.write((char *)m[i],sort_clones.size()*sizeof(char));
}
out_comp.close();
......@@ -99,7 +108,7 @@ void comp_matrix::save(string file){
void comp_matrix::del(){
for (unsigned int i=0;i<windows.size();i++){
for (unsigned int i=0;i<sort_clones.size();i++){
free(m[i]);
}
free(m);
......@@ -125,40 +134,44 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int c1=0;
int c2=0;
for (mjs::const_iterator ite = windows.getMap().begin();
ite != windows.getMap().end(); ++ite )
size_t i = 0;
for (list <pair<junction,int> >::const_iterator ite = sort_clones.begin();
(ite != sort_clones.end()) & (i<matrix_size); ++ite)
{
i++;
n_j++;
list <string> voisins ;
j1=ite->first;
neighbor[j1]=voisins;
}
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
int k=it0->second.size();
int k=it0->second;
count[j1]=k;
n_j2+=k;
for (mjs::const_iterator it1 = windows.getMap().begin();
it1 != windows.getMap().end(); ++it1 )
size_t j = 0;
for (list <pair<junction,int> >::const_iterator it1 = sort_clones.begin();
(it1 != sort_clones.end()) & (j<matrix_size); ++it1)
{
j2=it1->first;
int distance = (int)m[c2][c1];
if (distance <= epsilon){
neighbor[j1].push_back(j2);
}
c1++;
c++;
}//fin it1
c2++;
c1=0;
}//fin it0
j++;
j2=it1->first;
int distance = (int)m[c2][c1];
if (distance <= epsilon){
neighbor[j1].push_back(j2);
}
c1++;
c++;
}//fin it1
c2++;
c1=0;
}//fin it0
/////////////////////////
//Forced - edges
......@@ -213,9 +226,11 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int noise = 0;
int nb_comp = 0 ;
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
if(visit[j1]==false && clust[j1]==false){
......@@ -382,6 +397,27 @@ void comp_matrix::stat_cluster( list<list<junction> > cluster, ostream &out )
}
JsonArray comp_matrix::toJson(list<list<junction> > clusterList)
{
JsonArray result;
for (list <list <string> >::iterator it = clusterList.begin();
it != clusterList.end(); ++it )
{
JsonArray cluster;
list<string> li=*it;
for (list<string>::iterator it2 = li.begin();
it2 != li.end(); ++it2 )
{
cluster.add(*it2);
}
result.add(cluster);
}
return result;
}
char **comp_matrix::alloc_matrix(size_t s) {
char **matrix = (char**)malloc(s*sizeof(char*));
for (size_t i=0;i<s;i++){
......
......@@ -9,24 +9,27 @@
#include <ctime>
#include "dynprog.h"
#include "windows.h"
#include "json.h"
using namespace std ;
#define SIMILAR_JUNCTIONS_THRESHOLD 1
#define JUNCTION_LIMIT 200
class comp_matrix {
public:
char ** m;
WindowsStorage &windows;
list<pair <junction, int> > sort_clones;
map <string, int> count;
int n_j;
int n_j2;
size_t matrix_size;
/**
* create new distance matrix
*/
comp_matrix(WindowsStorage &windows);
comp_matrix(list<pair <junction, int> > sc);
/**
* init matrix with a KmerStore and compute distance value between sequences
......@@ -65,6 +68,9 @@ class comp_matrix {
void del();
void stat_cluster( list<list<junction> > cluster, ostream &out=cout);
JsonArray toJson( list<list<junction> > cluster);
private:
......
......@@ -22,12 +22,13 @@ Germline::Germline(string _code, char _shortcut,
}
Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
Germline::Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max)
{
code = "" ;
shortcut = 'X' ;
code = _code ;
shortcut = _shortcut ;
// affect_5 = KmerAffect("", "V", 0) ;
// affect_3 = KmerAffect("", "J", 0) ;
......@@ -88,7 +89,8 @@ MultiGermline::MultiGermline(string f_germlines_json)
Fasta rep_3(f_rep_3, 2, "|", cout);
Germline *germline;
germline = new Germline(rep_5, rep_4, rep_3,
germline = new Germline("TRG", 'G',
rep_5, rep_4, rep_3,
seed,
delta_min, delta_max);
......
......@@ -29,7 +29,8 @@ class Germline {
string seed,
int _delta_min, int _delta_max);
Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max);
......
......@@ -164,7 +164,7 @@ string JsonArray::toString(){
stream << " [ ";
for ( list<string>::iterator i=l.begin(); i!= l.end(); ++i){
if (i!=l.begin()) stream << ", ";
if (i!=l.begin()) stream << ", " << endl;
stream << (*i);
}
......
......@@ -230,7 +230,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
// removeChevauchement is called once info was already computed: it is only to output info_extra
info_extra += removeChevauchement();
finishSegmentation();
system = germline->code;
return ;
}
} // end for (Germlines)
......@@ -722,35 +722,25 @@ void FineSegmenter::FineSegmentD(Germline *germline){
JsonList FineSegmenter::toJsonList(Germline *germline){
JsonList result;
result.add("status", because);
result.add("sequence", revcomp(sequence, reversed) );
if (isSegmented()) {
result.add("name", code_short);
result.add("Jstart", Jstart);
result.add("Nlength", (del_V+del_J+seg_N.size()) );
result.add("Vend", Vend);
JsonArray jsonV;
JsonArray jsonJ;
// TODO: what is going on if some list is smaller than JSON_REMEMBER_BEST ?
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonV.add( germline->rep_5.label(score_V[i].second) ) ;
result.add("V", jsonV);
JsonList seg;
seg.add("5", germline->rep_5.label(best_V));
seg.add("5start", 0);
seg.add("5end", Vend);
if (score_D.size()>0){
result.add("Dstart", Dstart);
result.add("Dend", Dend);
JsonArray jsonD;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonD.add( germline->rep_4.label(score_D[i].second) ) ;
result.add("D", jsonD);
seg.add("4", germline->rep_4.label(best_D));
result.add("4start", Dstart);
result.add("4end", Dend);
}
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonJ.add( germline->rep_3.label(score_J[i].second) ) ;
result.add("J", jsonJ);
seg.add("3", germline->rep_3.label(best_J));
seg.add("3start", Jstart);
result.add("seg", seg);
}
return result;
}
......
......@@ -54,7 +54,7 @@ protected:
string info_extra; // .vdj.fa header, other information, at the end of the header
int best_V, best_J ;
int del_V, del_D_left, del_D_right, del_J ;
string seg_V, seg_N, seg_J;
string seg_V, seg_N, seg_J, system;
int best_D;
string seg_N1, seg_D, seg_N2;
......
......@@ -11,6 +11,12 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
nb_reads_germline[germline->code] = 0;
}
while (reads->hasNext()) {
reads->next();
nb_reads++;
......@@ -36,6 +42,8 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
*out_segmented << seg.getKmerAffectAnalyser()->toString() << endl;
}
nb_reads_germline[seg.system]++;
} else if (out_unsegmented) {
*out_unsegmented << reads->getSequence();
*out_unsegmented << "#" << segmented_mesg[seg.getSegmentationStatus()] << endl;
......@@ -59,6 +67,10 @@ size_t WindowExtractor::getNbSegmented(SEGMENTED seg) {
return stats[seg].nb;
}
size_t WindowExtractor::getNbReadsGermline(string germline) {
return nb_reads_germline[germline];
}
void WindowExtractor::setSegmentedOutput(ostream *out) {
out_segmented = out;
}
......
......@@ -2,6 +2,8 @@
#define WINDOW_EXTRACTOR_H
#include <iostream>
#include <string>
#include <map>
#include "segment.h"
#include "germline.h"
#include "kmerstore.h"
......@@ -17,6 +19,7 @@ using namespace std;
class WindowExtractor {
private:
size_t nb_reads;
map<string, size_t> nb_reads_germline;
ostream *out_segmented;
ostream *out_unsegmented;
......@@ -63,6 +66,13 @@ class WindowExtractor {
*/
size_t getNbSegmented(SEGMENTED seg);
/**
* @return the number of reads segmented from germline
* @param germline_code: one of the germline code in multigermline
* @pre extract() must have been launched.
*/
size_t getNbReadsGermline(string germline_code);
/**
* Defines the output stream where the segmented sequences will be output.
* Otherwise no output will be given.
......
......@@ -163,21 +163,22 @@ JsonArray WindowsStorage::sortedWindowsToJsonArray(map <junction, JsonList> json
{
JsonList windowsList;
JsonArray json_size;
JsonArray json_reads;
JsonArray json_seg;
json_size.add(it->second);
json_reads.add(it->second);
if (json_data_segment.find(it->first) != json_data_segment.end()){
windowsList.concat(json_data_segment[it->first]);
}else{
windowsList.add("sequence", 0); //TODO need to compute representative sequence for this case
}
windowsList.add("window", it->first);
windowsList.add("size", json_size);
windowsList.add("id", it->first);
windowsList.add("reads", json_reads);
windowsList.add("top", top++);
windowsList.add("id", this->getId(it->first));
//windowsList.add("id", this->getId(it->first));
JsonList seg_stat = this->statusToJson(it->first);
json_seg.add(seg_stat);
windowsList.add("germline", germline_by_window[it->first]->code);
windowsList.add("seg_stat", json_seg);
windowsArray.add(windowsList);
}
......
!LAUNCH: ../../vidjil -G ../../germline/IGH -r 5 -d ../../data/Stanford_S22.fasta ; cat out/vidjil.data | sh format-json.sh
$ Number of reads
1:"reads_total" : [ 13153 ] ,
1:"total" : [ 13153 ] ,
$ Number of segmented reads
1:"reads_segmented" : [ 13139 ] ,
1:"segmented" : [ 13139 ]
$ Most abundant window
1:"window" : "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", "size" : [ 8 ]
1:"id" : "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", "reads" : [ 8 ]
......@@ -4,7 +4,7 @@ $ Points list
f1:"point": [ "", "" ]
$ Most abundant window, twice, fused
f1:"window": "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", "id": .*, "size": [ 8, 8 ]
f1:"id": "CCACCTATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTGGGGCC", .*"reads": [ 8, 8 ]
# Fails since 49046ca6b97, no more 'others'
$ Windows that are not in the top 50
......
......@@ -15,7 +15,7 @@ void testSegmentationBug1(int delta_min, int delta_max) {
Fasta seqJ("../../germline/TRGJ.fa");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##############", delta_min, delta_max);
germline = new Germline("custom", 'x', seqV, seqV, seqJ, "##############", delta_min, delta_max);
MultiGermline *multi ;
multi = new MultiGermline();
......
......@@ -47,8 +47,10 @@ void testCluster() {
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGTCTAGGGGG", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCATGCCCCCC", seq, 0, 0);
windows.sort();
list<pair <junction, int> > sort_clones = windows.getSortedList();
comp_matrix comp=comp_matrix(windows);
comp_matrix comp=comp_matrix(sort_clones);
//create matrix using junctions
comp.compare( cout, Cluster);
......
......@@ -20,7 +20,7 @@ void testFineSegment()
Fasta data("../../data/Stanford_S22.fasta", 1, " ");
Germline *germline ;
germline = new Germline(seqV, seqD, seqJ, "####", 0, 50);
germline = new Germline("IGH", 'G', seqV, seqD, seqJ, "####", 0, 50);
Sequence seq = data.read(2);
......@@ -70,9 +70,9 @@ void testSegmentOverlap()
Fasta data("../../data/bug-segment-overlap.fa", 1, " ");
Germline *germline1 ;
germline1 = new Germline(seqV, seqV, seqJ, "##########", -50, 50);
germline1 = new Germline("TRG", 'G', seqV, seqV, seqJ, "##########", -50, 50);
Germline *germline2 ;
germline2 = new Germline(seqV, seqV, seqJ, "##########", -50, 50);
germline2 = new Germline("TRG2", 'G', seqV, seqV, seqJ, "##########", -50, 50);
MultiGermline *multi1 ;
multi1 = new MultiGermline();
......@@ -103,7 +103,7 @@ void testSegmentationCause() {
Fasta data("../../data/segmentation.fasta", 1, " ");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##########", 0, 10);
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, "##########", 0, 10);
MultiGermline *multi ;
multi = new MultiGermline();
......@@ -197,7 +197,7 @@ void testExtractor() {
OnlineFasta data("../../data/segmentation.fasta", 1, " ");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##########", 0, 10);
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, "##########", 0, 10);
MultiGermline *multi ;
multi = new MultiGermline();
......
......@@ -419,7 +419,7 @@ int main (int argc, char **argv)
min_reads_clone = 0 ;
max_clones = 0 ;
break ;
// Seeds
case 's':
......@@ -793,7 +793,8 @@ int main (int argc, char **argv)
Fasta rep_J(f_rep_J, 2, "|", cout);
Germline *germline;
germline = new Germline(rep_V, rep_D, rep_J, seed,
germline = new Germline(germline_system, 'X',
rep_V, rep_D, rep_J, seed,
delta_min, delta_max);
multigermline->insert(germline);
......@@ -908,13 +909,16 @@ int main (int argc, char **argv)
cout << " ! No windows with current parameters." << endl;
}
if (command == CMD_CLONES) {
//////////////////////////////////
//$$ Clustering
windowsStorage->sort();
list<pair <junction, int> > sort_clones = windowsStorage->getSortedList();
cout << " ==> " << sort_clones.size() << " clones" << endl ;
list <list <junction> > clones_windows;
comp_matrix comp=comp_matrix(*windowsStorage);
comp_matrix comp=comp_matrix(sort_clones);
if (command == CMD_CLONES) {
if (epsilon || forced_edges.size())
{
......@@ -952,10 +956,6 @@ int main (int argc, char **argv)
// || ((clone_nb_reads >= min_reads_clone)
// && (clone_nb_reads * 100.0 / nb_segmented >= ratio_reads_clone)))
windowsStorage->sort();
list<pair <junction, int> > sort_clones = windowsStorage->getSortedList();
cout << " ==> " << sort_clones.size() << " clones" << endl ;
if (sort_clones.size() == 0)
{