Commit d7d0d3f3 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge - experimental '-g' option, new Stats object

parents ea7f0f1b d130b9d5
#include "germline.h"
Germline::Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed,
int _delta_min, int _delta_max)
{
code = _code ;
shortcut = _shortcut ;
rep_5 = Fasta(f_rep_5, 2, "|", cout);
rep_4 = Fasta(f_rep_4, 2, "|", cout);
rep_3 = Fasta(f_rep_3, 2, "|", cout);
delta_min = _delta_min ;
delta_max = _delta_max ;
build_index(seed);
stats.setLabel(code);
}
Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int _delta_min, int _delta_max)
{
// code = 'TRG' ;
// shortcut = 'G' ;
// description = "" ;
code = "" ;
shortcut = 'X' ;
// affect_5 = KmerAffect("", "V", 0) ;
// affect_3 = KmerAffect("", "J", 0) ;
......@@ -19,6 +39,13 @@ Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
delta_min = _delta_min ;
delta_max = _delta_max ;
build_index(seed);
stats.setLabel(code);
}
void Germline::build_index(string seed)
{
bool rc = true ;
index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
......@@ -33,16 +60,15 @@ Germline::~Germline()
ostream &operator<<(ostream &out, const Germline &germline)
{
out << germline.shortcut << " (" << germline.description << ") "
out << germline.code << " '" << germline.shortcut << "' "
<< germline.delta_min << "/" << germline.delta_max << endl ;
return out;
}
MultiGermline::MultiGermline(Germline *germline)
MultiGermline::MultiGermline()
{
germlines.push_back(germline);
}
......@@ -70,5 +96,23 @@ MultiGermline::MultiGermline(string f_germlines_json)
}
void MultiGermline::insert(Germline *germline)
{
germlines.push_back(germline);
}
void MultiGermline::load_default_set()
{
germlines.push_back(new Germline("TRG", 'G', "germline/TRGV.fa", "", "germline/TRGJ.fa", "#####-#####", -10, 20));
germlines.push_back(new Germline("IGH", 'H', "germline/IGHV.fa", "germline/IGHD.fa", "germline/IGHJ.fa", "######-######", 0, 80));
}
void MultiGermline::out_stats(ostream &out)
{
for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
{
Germline *germline = *it ;
out << germline->stats ;
}
}
......@@ -6,11 +6,13 @@
#include <list>
#include "kmeraffect.h"
#include "kmerstore.h"
#include "stats.h"
using namespace std;
class Germline {
private:
void build_index(string seed);
public:
/*
......@@ -21,14 +23,20 @@ class Germline {
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
*/
Germline(Fasta rep_5, Fasta rep_4, Fasta rep_3,
Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed,
int _delta_min, int _delta_max);
Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
int delta_min, int delta_max);
int _delta_min, int _delta_max);
~Germline();
string code ;
char shortcut ;
string description ;
// KmerAffect affect_5 ;
// KmerAffect affect_3 ;
......@@ -40,6 +48,8 @@ class Germline {
int delta_min;
int delta_max;
Stats stats;
};
......@@ -52,8 +62,13 @@ class MultiGermline {
public:
list <Germline*> germlines;
MultiGermline(Germline *germline);
MultiGermline();
MultiGermline(string f_germlines_json);
void insert(Germline *germline);
void load_default_set();
void out_stats(ostream &out);
};
......
......@@ -172,8 +172,11 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
reversed = false;
Dend=0;
// Now we just take one germline
Germline *germline = multigermline->germlines.back();
// Iterate over the germlines
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
int s = (size_t)germline->index->getS() ;
int length = sequence.length() ;
......@@ -181,7 +184,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
{
because = UNSEG_TOO_SHORT;
kaa = NULL;
return ;
continue ;
}
kaa = new KmerAffectAnalyser<KmerAffect>(*(germline->index), sequence);
......@@ -218,6 +221,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
{
// Yes, it is segmented
segmented_germline = germline;
germline->stats.insert(length);
reversed = (strand == -1);
because = reversed ? SEG_MINUS : SEG_PLUS ;
......@@ -226,7 +230,10 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
// removeChevauchement is called once info was already computed: it is only to output info_extra
info_extra += removeChevauchement();
finishSegmentation();
return ;
}
} // end for (Germlines)
}
KmerSegmenter::~KmerSegmenter() {
......
#include "stats.h"
Stats::Stats()
{
nb = 0 ;
length = 0 ;
}
void Stats::setLabel(string _label)
{
label = _label ;
}
void Stats::insert(int _length)
{
nb++ ;
length += _length ;
}
float Stats::getAverageLength()
{
return (float) length / nb ;
}
ostream &operator<<(ostream &out, Stats &stats)
{
out << " " << left << setw(20) << stats.label
<< " ->" << right << setw(9) << stats.nb ;
if (stats.nb)
out << " " << setw(5) << fixed << setprecision(1) << stats.getAverageLength() ;
out << endl ;
return out;
}
#ifndef STATS_H
#define STATS_H
#include <string>
#include <iostream>
#include <iomanip>
using namespace std;
class Stats {
public:
string label;
int nb;
int length;
public:
Stats();
void setLabel(string _label);
void insert(int _length);
float getAverageLength();
};
ostream &operator<<(ostream &out, Stats &stats);
#endif
......@@ -16,19 +16,17 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
nb_reads++;
KmerSegmenter seg(reads->getSequence(), multigermline);
int read_length = seg.getSequence().sequence.length();
stats_segmented[seg.getSegmentationStatus()]++;
stats_length[seg.getSegmentationStatus()] += seg.getSequence().sequence.length();
stats[seg.getSegmentationStatus()].insert(read_length);
if (seg.isSegmented()) {
junction junc = seg.getJunction(w);
if (junc.size()) {
stats_segmented[TOTAL_SEG_AND_WINDOW]++ ;
stats_length[TOTAL_SEG_AND_WINDOW] += seg.getSequence().sequence.length() ;
windowsStorage->add(junc, reads->getSequence(), seg.getSegmentationStatus());
stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ;
windowsStorage->add(junc, reads->getSequence(), seg.getSegmentationStatus(), seg.segmented_germline);
} else {
stats_segmented[TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW]++ ;
stats_length[TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW] += seg.getSequence().sequence.length() ;
stats[TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW].insert(read_length) ;
}
if (out_segmented) {
......@@ -50,7 +48,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
}
float WindowExtractor::getAverageSegmentationLength(SEGMENTED seg) {
return stats_length[seg]*1./getNbSegmented(seg);
return stats[seg].getAverageLength();
}
size_t WindowExtractor::getNbReads() {
......@@ -58,7 +56,7 @@ size_t WindowExtractor::getNbReads() {
}
size_t WindowExtractor::getNbSegmented(SEGMENTED seg) {
return stats_segmented[seg];
return stats[seg].nb;
}
void WindowExtractor::setSegmentedOutput(ostream *out) {
......@@ -71,8 +69,17 @@ void WindowExtractor::setUnsegmentedOutput(ostream *out) {
void WindowExtractor::init_stats() {
for (int i = 0; i < STATS_SIZE; i++) {
stats_segmented[i] = 0;
stats_length[i] = 0;
stats[i].label = segmented_mesg[i];
}
nb_reads = 0;
}
void WindowExtractor::out_stats(ostream &out)
{
for (int i=0; i<STATS_SIZE; i++)
{
if (i == TOTAL_SEG_AND_WINDOW)
out << endl;
out << stats[i] ;
}
}
......@@ -16,13 +16,13 @@ using namespace std;
*/
class WindowExtractor {
private:
size_t stats_segmented[STATS_SIZE];
size_t stats_length[STATS_SIZE];
size_t nb_reads;
ostream *out_segmented;
ostream *out_unsegmented;
Stats stats[STATS_SIZE];
public:
WindowExtractor();
......@@ -77,6 +77,12 @@ class WindowExtractor {
*/
void setUnsegmentedOutput(ostream *out);
/**
* Output the segmentation stats
* @param out: The output stream
*/
void out_stats(ostream &out);
private:
/**
* Initialize the statistics (put 0 everywhere).
......
......@@ -26,6 +26,10 @@ vector<int> WindowsStorage::getStatus(junction window) {
return status_by_window[window];
}
Germline *WindowsStorage::getGermline(junction window) {
return germline_by_window[window];
}
JsonList WindowsStorage::statusToJson(junction window) {
JsonList result;
......@@ -85,12 +89,14 @@ int WindowsStorage::getId(junction window) {
return id_by_window[window];
}
void WindowsStorage::add(junction window, Sequence sequence, int status) {
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline) {
seqs_by_window[window].push_back(sequence);
if (status_by_window.find(window) == status_by_window.end() ) {
status_by_window[window].resize(STATS_SIZE);
}
status_by_window[window][status]++;
germline_by_window[window] = germline;
}
pair <int, int> WindowsStorage::keepInterestingWindows(size_t min_reads_window) {
......
......@@ -16,7 +16,7 @@
#include "fasta.h"
#include "json.h"
#include "segment.h"
#include "json.h"
#include "germline.h"
using namespace std;
......@@ -26,6 +26,7 @@ class WindowsStorage {
private:
map<junction, list<Sequence> > seqs_by_window;
map<junction, vector<int> > status_by_window;
map<junction, Germline* > germline_by_window;
map<string, string> windows_labels;
list<pair <junction, int> > sort_all_windows;
map<junction, int> id_by_window;
......@@ -51,6 +52,7 @@ class WindowsStorage {
* @return the segmented status of reads supporting a given window
*/
vector<int> getStatus(junction window);
Germline *getGermline(junction window);
JsonList statusToJson(junction window);
......@@ -107,7 +109,7 @@ class WindowsStorage {
/**
* Add a new window with its list of sequences
*/
void add(junction window, Sequence sequence, int status);
void add(junction window, Sequence sequence, int status, Germline *germline);
/**
* Only keep windows that are interesting. Those windows are windows
......
......@@ -18,8 +18,9 @@ void testSegmentationBug1(int delta_min, int delta_max) {
germline = new Germline(seqV, seqV, seqJ, "##############", delta_min, delta_max);
MultiGermline *multi ;
multi = new MultiGermline(germline);
multi = new MultiGermline();
multi->insert(germline);
OnlineFasta input(buggy_sequences);
while (input.hasNext()) {
......
......@@ -27,25 +27,25 @@ void testCluster() {
Sequence seq = {"", "", "", "", NULL};
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT", seq, 0, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", seq, 0, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG", seq, 0, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAA", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGG", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCC", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAA", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGG", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCC", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTT", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAA", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCCCCC", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTT", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAA", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCCCCC", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTTTTTT", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCAAAAAAGCTAAAAAA", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGTCTAGGGGG", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCATGCCCCCC", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTTTTTT", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCAAAAAAGCTAAAAAA", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGTCTAGGGGG", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCATGCCCCCC", seq, 0, 0);
comp_matrix comp=comp_matrix(windows);
......
......@@ -75,7 +75,8 @@ void testSegmentOverlap()
germline2 = new Germline(seqV, seqV, seqJ, "##########", -50, 50);
MultiGermline *multi1 ;
multi1 = new MultiGermline(germline1);
multi1 = new MultiGermline();
multi1->insert(germline1);
for (int i = 0; i < data.size(); i++) {
KmerSegmenter ks(data.read(i), multi1);
......@@ -102,7 +103,8 @@ void testSegmentationCause() {
germline = new Germline(seqV, seqV, seqJ, "##########", 0, 10);
MultiGermline *multi ;
multi = new MultiGermline(germline);
multi = new MultiGermline();
multi->insert(germline);
int nb_checked = 0;
......@@ -193,7 +195,8 @@ void testExtractor() {
germline = new Germline(seqV, seqV, seqJ, "##########", 0, 10);
MultiGermline *multi ;
multi = new MultiGermline(germline);
multi = new MultiGermline();
multi->insert(germline);
WindowExtractor we;
map<string, string> labels;
......
......@@ -146,6 +146,7 @@ void usage(char *progname)
<< " -D <file> D germline multi-fasta file (automatically implies -d)" << endl
<< " -J <file> J germline multi-fasta file" << endl
<< " -G <prefix> prefix for V (D) and J repertoires (shortcut for -V <prefix>V.fa -D <prefix>D.fa -J <prefix>J.fa)" << endl
<< " -g multiple germlines (experimental)" << endl
<< endl
<< "Window prediction" << endl
......@@ -276,6 +277,7 @@ int main (int argc, char **argv)
bool detailed_cluster_analysis = true ;
bool output_segmented = false;
bool output_unsegmented = false;
bool multi_germline = false;
string forced_edges = "" ;
......@@ -291,7 +293,7 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "AhaG:V:D:J:k:r:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:uU")) != EOF)
while ((c = getopt(argc, argv, "AhagG:V:D:J:k:r:vw:e:C:t:l:dc:m:M:N:s:p:Sn:o:Lx%:Z:z:uU")) != EOF)
switch (c)
{
......@@ -353,6 +355,10 @@ int main (int argc, char **argv)
germline_system = "custom" ;
break;
case 'g':
multi_germline = true ;
break;
case 'G':
germline_system = string(optarg);
f_rep_V = (germline_system + "V.fa").c_str() ;
......@@ -772,12 +778,20 @@ int main (int argc, char **argv)
//$$ Build Kmer indexes
cout << "Build Kmer indexes" << endl ;
Germline *germline;
germline = new Germline(rep_V, rep_D, rep_J, seed,
delta_min, delta_max);
MultiGermline *multigermline = new MultiGermline();
MultiGermline *multigermline;
multigermline = new MultiGermline(germline);
if (multi_germline)
{
multigermline->load_default_set();
}
else
{
Germline *germline;
germline = new Germline(rep_V, rep_D, rep_J, seed,
delta_min, delta_max);
multigermline->insert(germline);
}
//////////////////////////////////
//$$ Kmer Segmentation
......@@ -840,17 +854,9 @@ int main (int argc, char **argv)
stream_segmentation_info << " # av. length" << endl ;
for (int i=0; i<STATS_SIZE; i++)
{
stream_segmentation_info << " " << left << setw(20) << segmented_mesg[i]
<< " ->" << right << setw(9) << we.getNbSegmented(static_cast<SEGMENTED>(i)) ;
if (we.getAverageSegmentationLength(static_cast<SEGMENTED>(i)))
stream_segmentation_info << " " << setw(5) << fixed << setprecision(1)
<< we.getAverageSegmentationLength(static_cast<SEGMENTED>(i));
stream_segmentation_info << endl ;
}
multigermline->out_stats(stream_segmentation_info);
stream_segmentation_info << endl;
we.out_stats(stream_segmentation_info);
cout << stream_segmentation_info.str();
map <junction, JsonList> json_data_segment ;
......@@ -967,7 +973,7 @@ int main (int argc, char **argv)
list <Sequence> representatives ;
list <string> representatives_labels ;
VirtualReadScore *scorer = new KmerAffectReadScore(*(germline->index));
// VirtualReadScore *scorer = new KmerAffectReadScore(*(germline->index));
int num_clone = 0 ;
int clones_without_representative = 0 ;
......@@ -1000,9 +1006,11 @@ int main (int argc, char **argv)
string clone_file_name = out_seqdir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ;
ofstream out_clone(clone_file_name.c_str());
Germline *segmented_germline = windowsStorage->getGermline(it->first);
ostringstream oss;
oss << "clone-" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone
oss << "clone-" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone
<< "--" << segmented_germline->code
<< "--" << setfill('0') << setw(WIDTH_NB_READS) << clone_nb_reads
<< "--" << setprecision(3) << 100 * (float) clone_nb_reads / nb_segmented << "%" ;
string clone_id = oss.str();
......@@ -1058,10 +1066,10 @@ int main (int argc, char **argv)
representative.label = clone_id + "--" + representative.label;
// FineSegmenter
FineSegmenter seg(representative, germline, segment_cost);
FineSegmenter seg(representative, segmented_germline, segment_cost);
if (segment_D)
seg.FineSegmentD(germline);
seg.FineSegmentD(segmented_germline);
// Output representative, possibly segmented...
// to stdout, CLONES_FILENAME, and CLONE_FILENAME-*
......@@ -1070,7 +1078,7 @@ int main (int argc, char **argv)
out_clones << seg << endl ;
// Output segmentation to .json
json_data_segment[it->first]=seg.toJsonList(germline);
json_data_segment[it->first]=seg.toJsonList(segmented_germline);
if (seg.isSegmented())
{
......@@ -1095,9 +1103,9 @@ int main (int argc, char **argv)
}