Commit 5ed8768e authored by Marc Duez's avatar Marc Duez
Browse files
parents 3bc1fc51 d7d0d3f3
......@@ -306,20 +306,6 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
return cluster;
}
list<list<junction> > comp_matrix::nocluster()
{
list <list < string > > cluster ;
for (map<junction, list<Sequence> >::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
list< string > c1;
c1.push_back(it0->first);
cluster.push_back(c1);
}
return cluster;
}
void comp_matrix::stat_cluster( list<list<junction> > cluster, ostream &out )
{
......
......@@ -59,8 +59,6 @@ class comp_matrix {
int w=0,ostream &out=cout,
int epsilon=1, int minPts=10);
list<list<junction> > nocluster();
/**
* reset state
*/
......
#include "germline.h"
Germline::Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed,
int _delta_min, int _delta_max)
{
code = _code ;
shortcut = _shortcut ;
rep_5 = Fasta(f_rep_5, 2, "|", cout);
rep_4 = Fasta(f_rep_4, 2, "|", cout);
rep_3 = Fasta(f_rep_3, 2, "|", cout);
delta_min = _delta_min ;
delta_max = _delta_max ;
build_index(seed);
stats.setLabel(code);
}
Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max)
{
code = "" ;
shortcut = 'X' ;
// affect_5 = KmerAffect("", "V", 0) ;
// affect_3 = KmerAffect("", "J", 0) ;
rep_5 = _rep_5 ;
rep_4 = _rep_4 ;
rep_3 = _rep_3 ;
delta_min = _delta_min ;
delta_max = _delta_max ;
build_index(seed);
stats.setLabel(code);
}
void Germline::build_index(string seed)
{
bool rc = true ;
index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
index->insert(rep_5, "V"); // affect_5);
index->insert(rep_3, "J"); // affect_3);
}
Germline::~Germline()
{
delete index;
}
ostream &operator<<(ostream &out, const Germline &germline)
{
out << germline.code << " '" << germline.shortcut << "' "
<< germline.delta_min << "/" << germline.delta_max << endl ;
return out;
}
MultiGermline::MultiGermline()
{
}
MultiGermline::MultiGermline(string f_germlines_json)
{
// Should parse 'data/germlines.data'
string f_rep_5 = "germline/TRGV.fa";
string f_rep_4 = "";
string f_rep_3 = "germline/TRGJ.fa";
string seed = "#####-#####";
int delta_min = -10 ;
int delta_max = 20 ;
Fasta rep_5(f_rep_5, 2, "|", cout);
Fasta rep_4(f_rep_4, 2, "|", cout);
Fasta rep_3(f_rep_3, 2, "|", cout);
Germline *germline;
germline = new Germline(rep_5, rep_4, rep_3,
seed,
delta_min, delta_max);
germlines.push_back(germline);
}
void MultiGermline::insert(Germline *germline)
{
germlines.push_back(germline);
}
void MultiGermline::load_default_set()
{
germlines.push_back(new Germline("TRG", 'G', "germline/TRGV.fa", "", "germline/TRGJ.fa", "#####-#####", -10, 20));
germlines.push_back(new Germline("IGH", 'H', "germline/IGHV.fa", "germline/IGHD.fa", "germline/IGHJ.fa", "######-######", 0, 80));
}
void MultiGermline::out_stats(ostream &out)
{
for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
{
Germline *germline = *it ;
out << germline->stats ;
}
}
#ifndef GERMLINE_H
#define GERMLINE_H
#include <string>
#include <list>
#include "kmeraffect.h"
#include "kmerstore.h"
#include "stats.h"
using namespace std;
class Germline {
private:
void build_index(string seed);
public:
/*
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
*/
Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed,
int _delta_min, int _delta_max);
Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max);
~Germline();
string code ;
char shortcut ;
// KmerAffect affect_5 ;
// KmerAffect affect_3 ;
Fasta rep_5 ;
Fasta rep_4 ;
Fasta rep_3 ;
IKmerStore<KmerAffect> *index;
int delta_min;
int delta_max;
Stats stats;
};
ostream &operator<<(ostream &out, const Germline &germline);
class MultiGermline {
private:
public:
list <Germline*> germlines;
MultiGermline();
MultiGermline(string f_germlines_json);
void insert(Germline *germline);
void load_default_set();
void out_stats(ostream &out);
};
#endif
......@@ -163,7 +163,7 @@ void KmerRepresentativeComputer::compute() {
is_computed = true;
representative = sequence_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
representative.label = "representative--" + representative.label + "-[" + string_of_int(pos_longest_run) + ","
representative.label = representative.label + "-[" + string_of_int(pos_longest_run) + ","
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]";
}
delete index;
......
......@@ -161,28 +161,33 @@ ostream &operator<<(ostream &out, const Segmenter &s)
// KmerSegmenter (Cheap)
KmerSegmenter::KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
int delta_min, int delta_max)
KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
{
label = seq.label ;
sequence = seq.sequence ;
info = "" ;
info_extra = "seed";
segmented = false;
segmented_germline = 0;
reversed = false;
Dend=0;
int s = (size_t)index->getS() ;
// Iterate over the germlines
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
int s = (size_t)germline->index->getS() ;
int length = sequence.length() ;
if (length < s)
{
because = UNSEG_TOO_SHORT;
kaa = NULL;
return ;
continue ;
}
kaa = new KmerAffectAnalyser<KmerAffect>(*index, sequence);
kaa = new KmerAffectAnalyser<KmerAffect>(*(germline->index), sequence);
// Check strand consistency among the affectations.
int strand;
......@@ -210,11 +215,13 @@ KmerSegmenter::KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
strand = 2;
}
computeSegmentation(strand, delta_min, delta_max, s);
computeSegmentation(strand, germline->delta_min, germline->delta_max, s);
if (segmented)
{
// Yes, it is segmented
segmented_germline = germline;
germline->stats.insert(length);
reversed = (strand == -1);
because = reversed ? SEG_MINUS : SEG_PLUS ;
......@@ -223,7 +230,10 @@ KmerSegmenter::KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
// removeChevauchement is called once info was already computed: it is only to output info_extra
info_extra += removeChevauchement();
finishSegmentation();
return ;
}
} // end for (Germlines)
}
KmerSegmenter::~KmerSegmenter() {
......@@ -437,8 +447,7 @@ string format_del(int deletions)
return deletions ? *"(" + string_of_int(deletions) + " del)" : "" ;
}
FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
int delta_min, int delta_max, Cost segment_c)
FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
{
info_extra = "" ;
......@@ -461,10 +470,10 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
vector<pair<int, int> > score_plus_V;
vector<pair<int, int> > score_plus_J;
int plus_left = align_against_collection(sequence, rep_V, false, false, &tag_plus_V, &del_plus_V, &del2, &beg,
int plus_left = align_against_collection(sequence, germline->rep_5, false, false, &tag_plus_V, &del_plus_V, &del2, &beg,
&plus_length, &score_plus_V
, segment_cost);
int plus_right = align_against_collection(sequence, rep_J, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
int plus_right = align_against_collection(sequence, germline->rep_3, true, false, &tag_plus_J, &del_plus_J, &del2, &beg,
&plus_length, &score_plus_J
, segment_cost);
plus_length += plus_right - plus_left ;
......@@ -481,10 +490,10 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
vector<pair<int, int> > score_minus_V;
vector<pair<int, int> > score_minus_J;
int minus_left = align_against_collection(rc, rep_V, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
int minus_left = align_against_collection(rc, germline->rep_5, false, false, &tag_minus_V, &del_minus_V, &del2, &beg,
&minus_length, &score_minus_V
, segment_cost);
int minus_right = align_against_collection(rc, rep_J, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
int minus_right = align_against_collection(rc, germline->rep_3, true, false, &tag_minus_J, &del_minus_J, &del2, &beg,
&minus_length, &score_minus_J
, segment_cost);
minus_length += minus_right - minus_left ;
......@@ -517,7 +526,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
}
segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) &&
(Jstart - Vend >= delta_min) && (Jstart - Vend <= delta_max);
(Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
dSegmented=false;
......@@ -526,12 +535,12 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
because = DONT_KNOW;
info = " @" + string_of_int (Vend + FIRST_POS) + " @" + string_of_int(Jstart + FIRST_POS) ;
if (Jstart - Vend < delta_min)
if (Jstart - Vend < germline->delta_min)
{
because = UNSEG_BAD_DELTA_MIN ;
}
if (Jstart - Vend > delta_max)
if (Jstart - Vend > germline->delta_max)
{
because = UNSEG_BAD_DELTA_MAX ;
}
......@@ -560,7 +569,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
string seq_right = sequence.substr(Jstart);
best_align(overlap, seq_left, seq_right,
rep_V.sequence(best_V), rep_J.sequence(best_J), &b_r,&b_l, segment_cost);
germline->rep_5.sequence(best_V), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
// Trim V
Vend -= b_l;
del_V += b_l;
......@@ -577,24 +586,24 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
/// used only below, then recomputed in finishSegmentation() ;
seg_N = revcomp(sequence, reversed).substr(Vend+1, Jstart-Vend-1);
code = rep_V.label(best_V) +
code = germline->rep_5.label(best_V) +
" "+ string_of_int(del_V) +
"/" + seg_N +
// chevauchement +
"/" + string_of_int(del_J) +
" " + rep_J.label(best_J);
" " + germline->rep_3.label(best_J);
stringstream code_s;
code_s<< rep_V.label(best_V) <<
code_s<< germline->rep_5.label(best_V) <<
" -" << string_of_int(del_V) << "/"
<< seg_N.size()
// chevauchement +
<< "/-" << string_of_int(del_J)
<<" " << rep_J.label(best_J);
<<" " << germline->rep_3.label(best_J);
code_short=code_s.str();
code_light = rep_V.label(best_V) +
"/ " + rep_J.label(best_J);
code_light = germline->rep_5.label(best_V) +
"/ " + germline->rep_3.label(best_J);
info = string_of_int(Vend + FIRST_POS) + " " + string_of_int(Jstart + FIRST_POS) ;
......@@ -603,7 +612,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
}
void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
void FineSegmenter::FineSegmentD(Germline *germline){
if (segmented){
......@@ -625,7 +634,7 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
string str = getSequence().sequence.substr(l, r-l);
// Align
end = align_against_collection(str, rep_D, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
end = align_against_collection(str, germline->rep_4, false, true, &tag_D, &del_D_right, &del_D_left, &begin,
&length, &score_D, segment_cost);
best_D = tag_D;
......@@ -645,7 +654,7 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
string seq_right = seq.substr(Dstart, Dend-Dstart+1);
best_align(overlap, seq_left, seq_right,
rep_V.sequence(best_V), rep_D.sequence(best_D), &b_r,&b_l, segment_cost);
germline->rep_5.sequence(best_V), germline->rep_4.sequence(best_D), &b_r,&b_l, segment_cost);
// Trim V
Vend -= b_l;
......@@ -664,7 +673,7 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
string seq_left = seq.substr(Jstart, seq.length()-Jstart);
best_align(overlap, seq_left, seq_right,
rep_D.sequence(best_D), rep_J.sequence(best_J), &b_r,&b_l, segment_cost);
germline->rep_4.sequence(best_D), germline->rep_3.sequence(best_J), &b_r,&b_l, segment_cost);
// Trim D
Dend -= b_l;
......@@ -675,42 +684,42 @@ void FineSegmenter::FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
}
seg_N2 = seq.substr(Dend+1, Jstart-Dend-1) ; // From Dend+1 to right-1
code = rep_V.label(best_V) +
code = germline->rep_5.label(best_V) +
" "+ string_of_int(del_V) +
"/" + seg_N1 +
"/" + string_of_int(del_D_left) +
" " + rep_D.label(best_D) +
" " + germline->rep_4.label(best_D) +
" " + string_of_int(del_D_right) +
"/" + seg_N2 +
"/" + string_of_int(del_J) +
" " + rep_J.label(best_J);
" " + germline->rep_3.label(best_J);
stringstream code_s;
code_s << rep_V.label(best_V)
code_s << germline->rep_5.label(best_V)
<< " -" << string_of_int(del_V) << "/"
<< seg_N1.size()
<< "/-" << string_of_int(del_D_left)
<< " " << rep_D.label(best_D)
<< " " << germline->rep_4.label(best_D)
<< " -" << string_of_int(del_D_right) << "/"
<< seg_N2.size()
<< "/-" << string_of_int(del_J)
<< " " << rep_J.label(best_J);
<< " " << germline->rep_3.label(best_J);
code_short=code_s.str();
code_light = rep_V.label(best_V) +
"/ " + rep_D.label(best_D) +
"/ " + rep_J.label(best_J);
code_light = germline->rep_5.label(best_V) +
"/ " + germline->rep_4.label(best_D) +
"/ " + germline->rep_3.label(best_J);
finishSegmentationD();
}
}
JsonList FineSegmenter::toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
JsonList FineSegmenter::toJsonList(Germline *germline){
JsonList result;
result.add("status", because);
......@@ -727,7 +736,7 @@ JsonList FineSegmenter::toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
// TODO: what is going on if some list is smaller than JSON_REMEMBER_BEST ?
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonV.add( rep_V.label(score_V[i].second) ) ;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonV.add( germline->rep_5.label(score_V[i].second) ) ;
result.add("V", jsonV);
......@@ -736,11 +745,11 @@ JsonList FineSegmenter::toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J){
result.add("Dend", Dend);
JsonArray jsonD;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonD.add( rep_D.label(score_D[i].second) ) ;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonD.add( germline->rep_4.label(score_D[i].second) ) ;
result.add("D", jsonD);
}
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonJ.add( rep_J.label(score_J[i].second) ) ;
for (int i=0; i<JSON_REMEMBER_BEST; i++) jsonJ.add( germline->rep_3.label(score_J[i].second) ) ;
result.add("J", jsonJ);
}
return result;
......
......@@ -6,6 +6,7 @@
#include "fasta.h"
#include "dynprog.h"
#include "tools.h"
#include "germline.h"
#include "kmerstore.h"
#include "kmeraffect.h"
#include "affectanalyser.h"
......@@ -54,7 +55,7 @@ protected:
int best_V, best_J ;
int del_V, del_D_left, del_D_right, del_J ;
string seg_V, seg_N, seg_J;
int best_D;
string seg_N1, seg_D, seg_N2;
Cost segment_cost;
......@@ -132,19 +133,14 @@ class KmerSegmenter : public Segmenter
string affects;
public:
Germline *segmented_germline;
/**
* Build a segmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param index: A Kmer index
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param multigermline: the multigermline
*/
KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
int delta_min, int delta_max);
KmerSegmenter(Sequence seq, MultiGermline *multigermline);
~KmerSegmenter();
......@@ -176,27 +172,17 @@ class FineSegmenter : public Segmenter
/**
* Build a fineSegmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param rep_V: germline for V
* @param rep_J: germline for J
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param germline: germline used
*/
FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
int delta_min, int delta_max, Cost segment_cost);
FineSegmenter(Sequence seq, Germline *germline, Cost segment_cost);
/**
* extend segmentation from VJ to VDJ
* @param rep_V: germline for V
* @param rep_D: germline for D
* @param rep_J: germline for J
* @param germline: germline used
*/
void FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J);
void FineSegmentD(Germline *germline);
JsonList toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J);
JsonList toJsonList(Germline *germline);
};
......
#include "stats.h"
Stats::Stats()
{
nb = 0 ;
length = 0 ;
}
void Stats::setLabel(string _label)
{
label = _label ;
}
void Stats::insert(int _length)
{
nb++ ;
length += _length ;
}
float Stats::getAverageLength()
{
return (float) length / nb ;
}
ostream &operator<<(ostream &out, Stats &stats)
{
out << " " << left << setw(20) << stats.label