Commit ea7f0f1b authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge - simplification of vidjil.cpp, new Germline/MultiGermline objects

parents cfaa76d2 d410df5b
......@@ -306,20 +306,6 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
return cluster;
}
list<list<junction> > comp_matrix::nocluster()
{
list <list < string > > cluster ;
for (map<junction, list<Sequence> >::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
list< string > c1;
c1.push_back(it0->first);
cluster.push_back(c1);
}
return cluster;
}
void comp_matrix::stat_cluster( list<list<junction> > cluster, ostream &out )
{
......
......@@ -59,8 +59,6 @@ class comp_matrix {
int w=0,ostream &out=cout,
int epsilon=1, int minPts=10);
list<list<junction> > nocluster();
/**
* reset state
*/
......
#include "germline.h"
Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max)
{
// code = 'TRG' ;
// shortcut = 'G' ;
// description = "" ;
// affect_5 = KmerAffect("", "V", 0) ;
// affect_3 = KmerAffect("", "J", 0) ;
rep_5 = _rep_5 ;
rep_4 = _rep_4 ;
rep_3 = _rep_3 ;
delta_min = _delta_min ;
delta_max = _delta_max ;
bool rc = true ;
index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
index->insert(rep_5, "V"); // affect_5);
index->insert(rep_3, "J"); // affect_3);
}
Germline::~Germline()
{
delete index;
}
ostream &operator<<(ostream &out, const Germline &germline)
{
out << germline.shortcut << " (" << germline.description << ") "
<< germline.delta_min << "/" << germline.delta_max << endl ;
return out;
}
MultiGermline::MultiGermline(Germline *germline)
{
germlines.push_back(germline);
}
MultiGermline::MultiGermline(string f_germlines_json)
{
// Should parse 'data/germlines.data'
string f_rep_5 = "germline/TRGV.fa";
string f_rep_4 = "";
string f_rep_3 = "germline/TRGJ.fa";
string seed = "#####-#####";
int delta_min = -10 ;
int delta_max = 20 ;
Fasta rep_5(f_rep_5, 2, "|", cout);
Fasta rep_4(f_rep_4, 2, "|", cout);
Fasta rep_3(f_rep_3, 2, "|", cout);
Germline *germline;
germline = new Germline(rep_5, rep_4, rep_3,
seed,
delta_min, delta_max);
germlines.push_back(germline);
}
#ifndef GERMLINE_H
#define GERMLINE_H
#include <string>
#include <list>
#include "kmeraffect.h"
#include "kmerstore.h"
using namespace std;
class Germline {
private:
public:
/*
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
*/
Germline(Fasta rep_5, Fasta rep_4, Fasta rep_3,
string seed,
int delta_min, int delta_max);
~Germline();
string code ;
char shortcut ;
string description ;
// KmerAffect affect_5 ;
// KmerAffect affect_3 ;
Fasta rep_5 ;
Fasta rep_4 ;
Fasta rep_3 ;
IKmerStore<KmerAffect> *index;
int delta_min;
int delta_max;
};
ostream &operator<<(ostream &out, const Germline &germline);
class MultiGermline {
private:
public:
list <Germline*> germlines;
MultiGermline(Germline *germline);
MultiGermline(string f_germlines_json);
};
#endif
......@@ -163,7 +163,7 @@ void KmerRepresentativeComputer::compute() {
is_computed = true;
representative = sequence_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
representative.label = "representative--" + representative.label + "-[" + string_of_int(pos_longest_run) + ","
representative.label = representative.label + "-[" + string_of_int(pos_longest_run) + ","
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]";
}
delete index;
......
......@@ -161,18 +161,20 @@ ostream &operator<<(ostream &out, const Segmenter &s)
// KmerSegmenter (Cheap)
KmerSegmenter::KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
int delta_min, int delta_max)
KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
{
label = seq.label ;
sequence = seq.sequence ;
info = "" ;
info_extra = "seed";
segmented = false;
segmented_germline = 0;
reversed = false;
Dend=0;
int s = (size_t)index->getS() ;
// Now we just take one germline
Germline *germline = multigermline->germlines.back();
int s = (size_t)germline->index->getS() ;
int length = sequence.length() ;
if (length < s)
......@@ -182,7 +184,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
return ;
}
kaa = new KmerAffectAnalyser<KmerAffect>(*index, sequence);
kaa = new KmerAffectAnalyser<KmerAffect>(*(germline->index), sequence);
// Check strand consistency among the affectations.
int strand;
......@@ -210,11 +212,12 @@ KmerSegmenter::KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
strand = 2;
}
computeSegmentation(strand, delta_min, delta_max, s);
computeSegmentation(strand, germline->delta_min, germline->delta_max, s);
if (segmented)
{
// Yes, it is segmented
segmented_germline = germline;
reversed = (strand == -1);
because = reversed ? SEG_MINUS : SEG_PLUS ;
......@@ -437,8 +440,7 @@ string format_del(int deletions)
return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}
FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
int delta_min, int delta_max, Cost segment_c)
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
{
info_extra = "" ;
......@@ -461,10 +463,10 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
vector<pair<int, int> > score_plus_V;
vector<pair<int, int> > score_plus_J;
int plus_left = align_against_collection(sequence, rep_V, false, false, &tag_plus_V, &del_plus_V, &del2, &beg,
int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg,
&plus_length, &score_plus_V
, segment_cost);
int plus_right = align_against_collection(sequence, rep_J, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
&plus_length, &score_plus_J
, segment_cost);
plus_length += plus_right - plus_left ;
......@@ -481,10 +483,10 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
vector<pair<int, int> > score_minus_V;
vector<pair<int, int> > score_minus_J;
int minus_left = align_against_collection(rc, rep_V, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
&minus_length, &score_minus_V
, segment_cost);
int minus_right = align_against_collection(rc, rep_J, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
&minus_length, &score_minus_J
, segment_cost);
minus_length += minus_right - minus_left ;
......@@ -517,7 +519,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
}
segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) &&
(Jstart - Vend >= delta_min) && (Jstart - Vend <= delta_max);
(Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
dSegmented=false;
......@@ -526,12 +528,12 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
because = DONT_KNOW;
info = " @" + string_of_int (Vend + FIRST_POS) + " @" + string_of_int(Jstart + FIRST_POS) ;
if (Jstart - Vend < delta_min)
if (Jstart - Vend < germline->delta_min)
{
because = UNSEG_BAD_DELTA_MIN ;
}
if (Jstart - Vend > delta_max)
if (Jstart - Vend > germline->delta_max)
{
because = UNSEG_BAD_DELTA_MAX ;
}
......@@ -560,7 +562,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
string seq_right = sequence.substr(Jstart);
best_align(overlap, seq_left, seq_right,
rep_V.sequence(best_V), rep_J.sequence(best_J), &b_r,&b_l, segment_cost);
germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
// Trim V
Vend -= b_l;
del_V += b_l;
......@@ -577,24 +579,24 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
/// used only below, then recomputed in finishSegmentation() ;
seg_N = revcomp(sequence, reversed).substr(Vend+1, Jstart-Vend-1);
code = rep_V.label(best_V) +
code = germline->rep_5.label(best_V) +
" "+ string_of_int(del_V) +
"/" + seg_N +
// chevauchement +
"/" + string_of_int(del_J) +
" " + rep_J.label(best_J);
" " + germline->rep_3.label(best_J);
stringstream code_s;
code_s<< rep_V.label(best_V) <<
code_s<< germline->rep_5.label(best_V) <<
" -" << string_of_int(del_V) << "/"
<< seg_N.size()
// chevauchement +
<< "/-" << string_of_int(del_J)
<<" " << rep_J.label(best_J);
<<" " << germline->rep_3.label(best_J);
code_short=code_s.str();
code_light = rep_V.label(best_V) +
"/ " + rep_J.label(best_J);
code_light = germline->rep_5.label(best_V) +
"/ " + germline->rep_3.label(best_J);
info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
......@@ -603,7 +605,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
}
void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
void FineSegmenter::FineSegmentD(Germline *germline){
if (segmented){
......@@ -625,7 +627,7 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
string str = getSequence().sequence.substr(l, r-l);
// Align
end = align_against_collection(str, rep_D, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
&length, &score_D, segment_cost);
best_D = tag_D;
......@@ -645,7 +647,7 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
string seq_right = seq.substr(Dstart, Dend-Dstart+1);
best_align(overlap, seq_left, seq_right,
rep_V.sequence(best_V), rep_D.sequence(best_D), &b_r,&b_l, segment_cost);
germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D), &b_r,&b_l, segment_cost);
// Trim V
Vend -= b_l;
......@@ -664,7 +666,7 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
string seq_left = seq.substr(Jstart, seq.length()-Jstart);
best_align(overlap, seq_left, seq_right,
rep_D.sequence(best_D), rep_J.sequence(best_J), &b_r,&b_l, segment_cost);
germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
// Trim D
Dend -= b_l;
......@@ -675,42 +677,42 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
}
seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
code = rep_V.label(best_V) +
code = germline->rep_5.label(best_V) +
" "+ string_of_int(del_V) +
"/" + seg_N1 +
"/" + string_of_int(del_D_left) +
" " + rep_D.label(best_D) +
" " + germline->rep_4.label(best_D) +
" " + string_of_int(del_D_right) +
"/" + seg_N2 +
"/" + string_of_int(del_J) +
" " + rep_J.label(best_J);
" " + germline->rep_3.label(best_J);
stringstream code_s;
code_s << rep_V.label(best_V)
code_s << germline->rep_5.label(best_V)
<< " -" << string_of_int(del_V) << "/"
<< seg_N1.size()
<< "/-" << string_of_int(del_D_left)
<< " " << rep_D.label(best_D)
<< " " << germline->rep_4.label(best_D)
<< " -" << string_of_int(del_D_right) << "/"
<< seg_N2.size()
<< "/-" << string_of_int(del_J)
<< " " << rep_J.label(best_J);
<< " " << germline->rep_3.label(best_J);
code_short=code_s.str();
code_light = rep_V.label(best_V) +
"/ " + rep_D.label(best_D) +
"/ " + rep_J.label(best_J);
code_light = germline->rep_5.label(best_V) +
"/ " + germline->rep_4.label(best_D) +
"/ " + germline->rep_3.label(best_J);
finishSegmentationD();
}
}
JsonList FineSegmenter::toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
JsonList FineSegmenter::toJsonList(Germline *germline){
JsonList result;
result.add("status", because);
......@@ -727,7 +729,7 @@ JsonList FineSegmenter::toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
// TODO: what is going on if some list is smaller than JSON_REMEMBER_BEST ?
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonV.add( rep_V.label(score_V[i].second) ) ;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonV.add( germline->rep_5.label(score_V[i].second) ) ;
result.add("V", jsonV);
......@@ -736,11 +738,11 @@ JsonList FineSegmenter::toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
result.add("Dend", Dend);
JsonArray jsonD;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonD.add( rep_D.label(score_D[i].second) ) ;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonD.add( germline->rep_4.label(score_D[i].second) ) ;
result.add("D", jsonD);
}
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonJ.add( rep_J.label(score_J[i].second) ) ;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonJ.add( germline->rep_3.label(score_J[i].second) ) ;
result.add("J", jsonJ);
}
return result;
......
......@@ -6,6 +6,7 @@
#include "fasta.h"
#include "dynprog.h"
#include "tools.h"
#include "germline.h"
#include "kmerstore.h"
#include "kmeraffect.h"
#include "affectanalyser.h"
......@@ -54,7 +55,7 @@ protected:
int best_V, best_J ;
int del_V, del_D_left, del_D_right, del_J ;
string seg_V, seg_N, seg_J;
int best_D;
string seg_N1, seg_D, seg_N2;
Cost segment_cost;
......@@ -132,19 +133,14 @@ class KmerSegmenter : public Segmenter
string affects;
public:
Germline *segmented_germline;
/**
* Build a segmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param index: A Kmer index
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param multigermline: the multigermline
*/
KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
int delta_min, int delta_max);
KmerSegmenter(Sequence seq, MultiGermline *multigermline);
~KmerSegmenter();
......@@ -176,27 +172,17 @@ class FineSegmenter : public Segmenter
/**
* Build a fineSegmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param rep_V: germline for V
* @param rep_J: germline for J
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param germline: germline used
*/
FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
int delta_min, int delta_max, Cost segment_cost);
FineSegmenter(Sequence seq, Germline *germline, Cost segment_cost);
/**
* extend segmentation from VJ to VDJ
* @param rep_V: germline for V
* @param rep_D: germline for D
* @param rep_J: germline for J
* @param germline: germline used
*/
void FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J);
void FineSegmentD(Germline *germline);
JsonList toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J);
JsonList toJsonList(Germline *germline);
};
......
......@@ -4,8 +4,8 @@
WindowExtractor::WindowExtractor(): out_segmented(NULL), out_unsegmented(NULL){}
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, IKmerStore<KmerAffect> *index,
size_t w, int delta_min, int delta_max,
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *multigermline,
size_t w,
map<string, string> &windows_labels) {
init_stats();
......@@ -14,8 +14,8 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, IKmerStore<KmerAffe
while (reads->hasNext()) {
reads->next();
nb_reads++;
KmerSegmenter seg(reads->getSequence(), index, delta_min, delta_max);
KmerSegmenter seg(reads->getSequence(), multigermline);
stats_segmented[seg.getSegmentationStatus()]++;
stats_length[seg.getSegmentationStatus()] += seg.getSequence().sequence.length();
......
......@@ -3,6 +3,7 @@
#include <iostream>
#include "segment.h"
#include "germline.h"
#include "kmerstore.h"
#include "kmeraffect.h"
#include "windows.h"
......@@ -31,20 +32,16 @@ class WindowExtractor {
* If (un)segmented sequences must be output, the functions
* set(Un)SegmentedOutput() must be called before.
* @param reads: the collection of input reads
* @param index: the index of the germline
* @param multigermline: the multigermline
* @param w: length of the window
* @param delta_min: The minimal distance between the end of the V
* and the start of the J (can be < 0)
* @param delta_max: The maximal distance between the end of the V
* and the start of the J.
* @param windows_labels: Windows that must be kept and registered as such.
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
* and getAverageSegmentationLength().
*/
WindowsStorage *extract(OnlineFasta *reads, IKmerStore<KmerAffect> *index,
size_t w, int delta_min, int delta_max,
WindowsStorage *extract(OnlineFasta *reads, MultiGermline *multigermline,
size_t w,
map<string, string> &windows_labels);
/**
......
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -R 1 -d ../../data/clones_simul.fa
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -d ../../data/clones_simul.fa
$ Junction extractions
1:found 25 50-windows in 66 segments
......
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -R 5 -n 5 -d ../../data/clones_simul.fa
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -n 5 -d ../../data/clones_simul.fa
$ Window extractions
1:found 25 50-windows in 66 segments
$ Some clustering
1:==> 2 clones
1:==> 2 clusters
$ Clone 1 output
1:Clone #001 .* 36 reads
f1:Clone #001 .* 36 reads
$ Clone 2 output
1:Clone #002 .* 23 reads
f1:Clone #002 .* 23 reads
......@@ -14,15 +14,17 @@ void testSegmentationBug1(int delta_min, int delta_max) {
Fasta seqV("../../germline/TRGV.fa");
Fasta seqJ("../../germline/TRGJ.fa");
IKmerStore<KmerAffect> *index = new ArrayKmerStore<KmerAffect>(k, rc);
index->insert(seqV, "V");
index->insert(seqJ, "J");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##############", delta_min, delta_max);
MultiGermline *multi ;
multi = new MultiGermline(germline);
OnlineFasta input(buggy_sequences);
while (input.hasNext()) {
input.next();
KmerAffectAnalyser<KmerAffect> *kaa = new KmerAffectAnalyser<KmerAffect>(*index, input.getSequence().sequence);
KmerAffectAnalyser<KmerAffect> *kaa = new KmerAffectAnalyser<KmerAffect>(*(germline->index), input.getSequence().sequence);
set<KmerAffect> distinct_a = kaa->getDistinctAffectations();
int strand = 0;
......@@ -37,8 +39,8 @@ void testSegmentationBug1(int delta_min, int delta_max) {