Commit 5ed8768e authored by Marc Duez
parents 3bc1fc51 d7d0d3f3
......@@ -306,20 +306,6 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
return cluster;
}
/**
 * Build the trivial clustering: every key of the windows map becomes
 * a cluster of its own, containing only that key.
 * @return one single-element list per entry of windows.getMap()
 */
list<list<junction> > comp_matrix::nocluster()
{
  list <list < string > > singletons ;

  map<junction, list<Sequence> >::const_iterator it0 = windows.getMap().begin();
  while (it0 != windows.getMap().end())
    {
      // A list holding exactly one copy of the current key
      singletons.push_back(list< string >(1, it0->first));
      ++it0;
    }

  return singletons;
}
void comp_matrix::stat_cluster( list<list<junction> > cluster, ostream &out )
{
......
......@@ -59,8 +59,6 @@ class comp_matrix {
int w=0,ostream &out=cout,
int epsilon=1, int minPts=10);
list<list<junction> > nocluster();
/**
* reset state
*/
......
#include "germline.h"
/**
 * Build a germline from FASTA file names.
 * @param _code: label of the germline, also used as label of its stats
 * @param _shortcut: one-character shortcut of the germline
 * @param f_rep_5, f_rep_4, f_rep_3: file names of the three repertoires
 * @param seed: seed passed to the k-mer index factory
 * @param _delta_min, _delta_max: bounds stored in delta_min / delta_max
 */
Germline::Germline(string _code, char _shortcut,
                   string f_rep_5, string f_rep_4, string f_rep_3,
                   string seed,
                   int _delta_min, int _delta_max)
{
  // Scalar settings first: none of them depends on the repertoires
  this->code = _code ;
  this->shortcut = _shortcut ;
  this->delta_min = _delta_min ;
  this->delta_max = _delta_max ;
  stats.setLabel(this->code);

  // Repertoires are loaded in the order 5, 4, 3
  rep_5 = Fasta(f_rep_5, 2, "|", cout);
  rep_4 = Fasta(f_rep_4, 2, "|", cout);
  rep_3 = Fasta(f_rep_3, 2, "|", cout);

  // The index needs rep_5 and rep_3 to be loaded
  build_index(seed);
}
/**
 * Build an anonymous germline (empty code, shortcut 'X') from
 * repertoires that are already loaded.
 * @param _rep_5, _rep_4, _rep_3: the three repertoires (copied)
 * @param seed: seed passed to the k-mer index factory
 * @param _delta_min, _delta_max: bounds stored in delta_min / delta_max
 */
Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
                   string seed,
                   int _delta_min, int _delta_max)
{
  // No code and a placeholder shortcut for this kind of germline
  this->code = "" ;
  this->shortcut = 'X' ;
  this->delta_min = _delta_min ;
  this->delta_max = _delta_max ;
  stats.setLabel(this->code);

  // affect_5 = KmerAffect("", "V", 0) ;
  // affect_3 = KmerAffect("", "J", 0) ;

  this->rep_5 = _rep_5 ;
  this->rep_4 = _rep_4 ;
  this->rep_3 = _rep_3 ;

  // The index needs rep_5 and rep_3 to be set
  build_index(seed);
}
/**
 * Create the k-mer index of this germline and fill it:
 * sequences of rep_5 are inserted with the label "V",
 * sequences of rep_3 with the label "J". rep_4 is not indexed.
 * @param seed: seed given to KmerStoreFactory::createIndex
 */
void Germline::build_index(string seed)
{
  // Second argument of createIndex (presumably: also index the reverse complement -- to be confirmed)
  bool rc_flag = true ;

  IKmerStore<KmerAffect> *store = KmerStoreFactory::createIndex<KmerAffect>(seed, rc_flag);
  store->insert(rep_5, "V"); // affect_5);
  store->insert(rep_3, "J"); // affect_3);

  index = store;
}
/**
 * Delete the index that build_index() obtained from
 * KmerStoreFactory::createIndex.
 */
Germline::~Germline()
{
delete index;
}
/**
 * Print a one-line summary of a germline:
 * code, quoted shortcut, then delta_min/delta_max, followed by endl.
 * @return the stream, to allow chaining
 */
ostream &operator<<(ostream &out, const Germline &germline)
{
  out << germline.code ;
  out << " '" << germline.shortcut << "' " ;
  out << germline.delta_min << "/" << germline.delta_max ;
  out << endl ;

  return out;
}
/**
 * Build an empty collection: no germline is registered,
 * see insert() and load_default_set().
 */
MultiGermline::MultiGermline()
{
}
/**
 * Build a collection holding one hard-coded germline
 * (germline/TRGV.fa and germline/TRGJ.fa, seed "#####-#####", deltas -10/20).
 * @param f_germlines_json: currently ignored, nothing is parsed yet
 */
MultiGermline::MultiGermline(string f_germlines_json)
{
  // Should parse 'data/germlines.data'

  // File names of the repertoires (the one for rep_4 is empty)
  string path_5 = "germline/TRGV.fa";
  string path_4 = "";
  string path_3 = "germline/TRGJ.fa";

  // Loaded in the order 5, 4, 3
  Fasta rep_5(path_5, 2, "|", cout);
  Fasta rep_4(path_4, 2, "|", cout);
  Fasta rep_3(path_3, 2, "|", cout);

  germlines.push_back(new Germline(rep_5, rep_4, rep_3,
                                   "#####-#####",
                                   -10, 20));
}
/**
 * Register a germline at the end of the collection.
 * Only the pointer is stored, the Germline itself is not copied.
 * @param germline: the germline to append
 */
void MultiGermline::insert(Germline *germline)
{
germlines.push_back(germline);
}
void MultiGermline::load_default_set()
{
germlines.push_back(new Germline("TRG", 'G', "germline/TRGV.fa", "", "germline/TRGJ.fa", "#####-#####", -10, 20));
germlines.push_back(new Germline("IGH", 'H', "germline/IGHV.fa", "germline/IGHD.fa", "germline/IGHJ.fa", "######-######", 0, 80));
}
/**
 * Output the stats of every registered germline, in insertion order.
 * @param out: The output stream
 */
void MultiGermline::out_stats(ostream &out)
{
  list<Germline*>::const_iterator it = germlines.begin();

  while (it != germlines.end())
    {
      out << (*it)->stats ;
      ++it;
    }
}
#ifndef GERMLINE_H
#define GERMLINE_H
#include <string>
#include <list>
#include "kmeraffect.h"
#include "kmerstore.h"
#include "stats.h"
using namespace std;
/**
 * A germline: three repertoires (rep_5, rep_4, rep_3), a k-mer index
 * built from rep_5 and rep_3, two delta bounds and segmentation stats.
 *
 * The object owns its index (deleted in the destructor) and is therefore
 * not copyable: handle it through pointers, as MultiGermline does.
 */
class Germline {
 private:
  /**
   * Create the index and insert rep_5 and rep_3 into it
   * @param seed: seed used by the index
   */
  void build_index(string seed);

  // Copying would make two objects delete the same index.
  // Declared but intentionally not defined (pre-C++11 non-copyable idiom).
  Germline(const Germline &);
  Germline &operator=(const Germline &);

 public:
  /**
   * @param delta_min: the minimal distance between the right bound and the left bound
   *        so that the segmentation is accepted
   *        (left bound: end of V, right bound : start of J)
   * @param delta_max: the maximal distance between the right bound and the left bound
   *        so that the segmentation is accepted
   *        (left bound: end of V, right bound : start of J)
   */
  Germline(string _code, char _shortcut,
           string f_rep_5, string f_rep_4, string f_rep_3,
           string seed,
           int _delta_min, int _delta_max);

  /**
   * Same as above, from already loaded repertoires.
   * The code is then empty and the shortcut is 'X'.
   */
  Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
           string seed,
           int _delta_min, int _delta_max);

  /**
   * Delete the index
   */
  ~Germline();

  string code ;      // label of the germline, also used as label of the stats
  char shortcut ;    // one-character shortcut

  // KmerAffect affect_5 ;
  // KmerAffect affect_3 ;

  Fasta rep_5 ;      // inserted into the index with the label "V"
  Fasta rep_4 ;      // not inserted into the index
  Fasta rep_3 ;      // inserted into the index with the label "J"

  IKmerStore<KmerAffect> *index;   // owned, created by build_index()

  int delta_min;
  int delta_max;

  Stats stats;       // statistics, labelled with the code
};
ostream &operator<<(ostream &out, const Germline &germline);
/**
 * A collection of germlines, stored as pointers in insertion order.
 * NOTE(review): no destructor is declared here, so the stored Germline
 * objects are apparently never deleted by this class -- confirm who owns them.
 */
class MultiGermline {
private:
public:
// The registered germlines, in insertion order
list <Germline*> germlines;
// Build an empty collection
MultiGermline();
// Build a collection from a description file
// (the visible implementation ignores the file name and builds one hard-coded germline)
MultiGermline(string f_germlines_json);
// Append a germline (only the pointer is stored)
void insert(Germline *germline);
// Append the built-in germlines (TRG, then IGH)
void load_default_set();
// Output the stats of every registered germline
void out_stats(ostream &out);
};
#endif
......@@ -163,7 +163,7 @@ void KmerRepresentativeComputer::compute() {
is_computed = true;
representative = sequence_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
representative.label = "representative--" + representative.label + "-[" + string_of_int(pos_longest_run) + ","
representative.label = representative.label + "-[" + string_of_int(pos_longest_run) + ","
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]";
}
delete index;
......
This diff is collapsed.
......@@ -6,6 +6,7 @@
#include "fasta.h"
#include "dynprog.h"
#include "tools.h"
#include "germline.h"
#include "kmerstore.h"
#include "kmeraffect.h"
#include "affectanalyser.h"
......@@ -54,7 +55,7 @@ protected:
int best_V, best_J ;
int del_V, del_D_left, del_D_right, del_J ;
string seg_V, seg_N, seg_J;
int best_D;
string seg_N1, seg_D, seg_N2;
Cost segment_cost;
......@@ -132,19 +133,14 @@ class KmerSegmenter : public Segmenter
string affects;
public:
Germline *segmented_germline;
/**
* Build a segmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param index: A Kmer index
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param multigermline: the multigermline
*/
KmerSegmenter(Sequence seq, IKmerStore<KmerAffect> *index,
int delta_min, int delta_max);
KmerSegmenter(Sequence seq, MultiGermline *multigermline);
~KmerSegmenter();
......@@ -176,27 +172,17 @@ class FineSegmenter : public Segmenter
/**
* Build a fineSegmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param rep_V: germline for V
* @param rep_J: germline for J
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param germline: germline used
*/
FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
int delta_min, int delta_max, Cost segment_cost);
FineSegmenter(Sequence seq, Germline *germline, Cost segment_cost);
/**
* extend segmentation from VJ to VDJ
* @param rep_V: germline for V
* @param rep_D: germline for D
* @param rep_J: germline for J
* @param germline: germline used
*/
void FineSegmentD(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J);
void FineSegmentD(Germline *germline);
JsonList toJsonList(Fasta &rep_V, Fasta &rep_D, Fasta &rep_J);
JsonList toJsonList(Germline *germline);
};
......
#include "stats.h"
/**
 * Start with no recorded element: both the count and
 * the cumulated length are zero, the label is left empty.
 */
Stats::Stats()
  : nb(0), length(0)
{
}
/**
 * Set the label that operator<< prints in front of the figures.
 * @param _label: the new label
 */
void Stats::setLabel(string _label)
{
label = _label ;
}
/**
 * Record one more element.
 * @param _length: length of the element, added to the cumulated length
 */
void Stats::insert(int _length)
{
  length += _length ;
  ++nb ;
}
/**
 * @return the cumulated length divided by the number of recorded elements
 * @pre at least one element was recorded: with nb == 0 this is the
 *      floating-point division 0/0 (operator<< checks nb before calling)
 */
float Stats::getAverageLength()
{
  const float total = static_cast<float>(length) ;
  return total / nb ;
}
/**
 * Print one line of statistics: the label (left-aligned, width 20),
 * the number of elements (right-aligned, width 9) and, only when there
 * is at least one element, the average length with one decimal.
 * The formatting flags set here are not restored on the stream.
 * @return the stream, to allow chaining
 */
ostream &operator<<(ostream &out, Stats &stats)
{
  out << " " ;
  out << left << setw(20) << stats.label ;

  out << " ->" ;
  out << right << setw(9) << stats.nb ;

  // The average is undefined without any element
  if (stats.nb != 0)
    {
      out << " " ;
      out << setw(5) << fixed << setprecision(1) << stats.getAverageLength() ;
    }

  out << endl ;

  return out;
}
#ifndef STATS_H
#define STATS_H
#include <string>
#include <iostream>
#include <iomanip>
using namespace std;
/**
 * A labelled counter: number of recorded elements and their cumulated
 * length. All members are public and are accessed directly by other classes.
 */
class Stats {
public:
// Label printed by operator<<
string label;
// Number of recorded elements
int nb;
// Sum of the lengths of the recorded elements
int length;
public:
// Start with nb and length at zero
Stats();
// Set the label
void setLabel(string _label);
// Record one element of the given length
void insert(int _length);
// length / nb; not meaningful while nb is zero
float getAverageLength();
};
ostream &operator<<(ostream &out, Stats &stats);
#endif
......@@ -4,8 +4,8 @@
WindowExtractor::WindowExtractor(): out_segmented(NULL), out_unsegmented(NULL){}
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, IKmerStore<KmerAffect> *index,
size_t w, int delta_min, int delta_max,
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *multigermline,
size_t w,
map<string, string> &windows_labels) {
init_stats();
......@@ -14,21 +14,19 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, IKmerStore<KmerAffe
while (reads->hasNext()) {
reads->next();
nb_reads++;
KmerSegmenter seg(reads->getSequence(), index, delta_min, delta_max);
KmerSegmenter seg(reads->getSequence(), multigermline);
int read_length = seg.getSequence().sequence.length();
stats_segmented[seg.getSegmentationStatus()]++;
stats_length[seg.getSegmentationStatus()] += seg.getSequence().sequence.length();
stats[seg.getSegmentationStatus()].insert(read_length);
if (seg.isSegmented()) {
junction junc = seg.getJunction(w);
if (junc.size()) {
stats_segmented[TOTAL_SEG_AND_WINDOW]++ ;
stats_length[TOTAL_SEG_AND_WINDOW] += seg.getSequence().sequence.length() ;
windowsStorage->add(junc, reads->getSequence(), seg.getSegmentationStatus());
stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ;
windowsStorage->add(junc, reads->getSequence(), seg.getSegmentationStatus(), seg.segmented_germline);
} else {
stats_segmented[TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW]++ ;
stats_length[TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW] += seg.getSequence().sequence.length() ;
stats[TOTAL_SEG_BUT_TOO_SHORT_FOR_THE_WINDOW].insert(read_length) ;
}
if (out_segmented) {
......@@ -50,7 +48,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, IKmerStore<KmerAffe
}
float WindowExtractor::getAverageSegmentationLength(SEGMENTED seg) {
return stats_length[seg]*1./getNbSegmented(seg);
return stats[seg].getAverageLength();
}
size_t WindowExtractor::getNbReads() {
......@@ -58,7 +56,7 @@ size_t WindowExtractor::getNbReads() {
}
size_t WindowExtractor::getNbSegmented(SEGMENTED seg) {
return stats_segmented[seg];
return stats[seg].nb;
}
void WindowExtractor::setSegmentedOutput(ostream *out) {
......@@ -71,8 +69,17 @@ void WindowExtractor::setUnsegmentedOutput(ostream *out) {
void WindowExtractor::init_stats() {
for (int i = 0; i < STATS_SIZE; i++) {
stats_segmented[i] = 0;
stats_length[i] = 0;
stats[i].label = segmented_mesg[i];
}
nb_reads = 0;
}
/**
 * Output the segmentation stats, one entry of the stats array after
 * the other. An empty line is output just before the
 * TOTAL_SEG_AND_WINDOW entry.
 * @param out: The output stream
 */
void WindowExtractor::out_stats(ostream &out)
{
  int i = 0;

  while (i < STATS_SIZE)
    {
      if (i == TOTAL_SEG_AND_WINDOW)
        {
          out << endl;
        }

      out << stats[i] ;
      ++i;
    }
}
......@@ -3,6 +3,7 @@
#include <iostream>
#include "segment.h"
#include "germline.h"
#include "kmerstore.h"
#include "kmeraffect.h"
#include "windows.h"
......@@ -15,13 +16,13 @@ using namespace std;
*/
class WindowExtractor {
private:
size_t stats_segmented[STATS_SIZE];
size_t stats_length[STATS_SIZE];
size_t nb_reads;
ostream *out_segmented;
ostream *out_unsegmented;
Stats stats[STATS_SIZE];
public:
WindowExtractor();
......@@ -31,20 +32,16 @@ class WindowExtractor {
* If (un)segmented sequences must be output, the functions
* set(Un)SegmentedOutput() must be called before.
* @param reads: the collection of input reads
* @param index: the index of the germline
* @param multigermline: the multigermline
* @param w: length of the window
* @param delta_min: The minimal distance between the end of the V
* and the start of the J (can be < 0)
* @param delta_max: The maximal distance between the end of the V
* and the start of the J.
* @param windows_labels: Windows that must be kept and registered as such.
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
* and getAverageSegmentationLength().
*/
WindowsStorage *extract(OnlineFasta *reads, IKmerStore<KmerAffect> *index,
size_t w, int delta_min, int delta_max,
WindowsStorage *extract(OnlineFasta *reads, MultiGermline *multigermline,
size_t w,
map<string, string> &windows_labels);
/**
......@@ -80,6 +77,12 @@ class WindowExtractor {
*/
void setUnsegmentedOutput(ostream *out);
/**
* Output the segmentation stats
* @param out: The output stream
*/
void out_stats(ostream &out);
private:
/**
* Initialize the statistics (put 0 everywhere).
......
......@@ -26,6 +26,10 @@ vector<int> WindowsStorage::getStatus(junction window) {
return status_by_window[window];
}
/**
 * @param window: a window
 * @return the germline stored for this window by add(),
 *         or NULL when nothing was stored for it
 */
Germline *WindowsStorage::getGermline(junction window) {
  // find() instead of operator[]: a lookup must not insert
  // a NULL entry for a window that was never added.
  map<junction, Germline* >::const_iterator it = germline_by_window.find(window);

  if (it == germline_by_window.end())
    return NULL;

  return it->second;
}
JsonList WindowsStorage::statusToJson(junction window) {
JsonList result;
......@@ -85,12 +89,14 @@ int WindowsStorage::getId(junction window) {
return id_by_window[window];
}
void WindowsStorage::add(junction window, Sequence sequence, int status) {
void WindowsStorage::add(junction window, Sequence sequence, int status, Germline *germline) {
seqs_by_window[window].push_back(sequence);
if (status_by_window.find(window) == status_by_window.end() ) {
status_by_window[window].resize(STATS_SIZE);
}
status_by_window[window][status]++;
germline_by_window[window] = germline;
}
pair <int, int> WindowsStorage::keepInterestingWindows(size_t min_reads_window) {
......
......@@ -16,7 +16,7 @@
#include "fasta.h"
#include "json.h"
#include "segment.h"
#include "json.h"
#include "germline.h"
using namespace std;
......@@ -26,6 +26,7 @@ class WindowsStorage {
private:
map<junction, list<Sequence> > seqs_by_window;
map<junction, vector<int> > status_by_window;
map<junction, Germline* > germline_by_window;
map<string, string> windows_labels;
list<pair <junction, int> > sort_all_windows;
map<junction, int> id_by_window;
......@@ -51,6 +52,7 @@ class WindowsStorage {
* @return the segmented status of reads supporting a given window
*/
vector<int> getStatus(junction window);
Germline *getGermline(junction window);
JsonList statusToJson(junction window);
......@@ -107,7 +109,7 @@ class WindowsStorage {
/**
* Add a new window with its list of sequences
*/
void add(junction window, Sequence sequence, int status);
void add(junction window, Sequence sequence, int status, Germline *germline);
/**
* Only keep windows that are interesting. Those windows are windows
......
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -R 1 -d ../../data/clones_simul.fa
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -d ../../data/clones_simul.fa
$ Junction extractions
1:found 25 50-windows in 66 segments
......
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -R 5 -n 5 -d ../../data/clones_simul.fa
!LAUNCH: ../../vidjil -k 14 -w 50 -c clones -G ../../germline/IGH -x -r 1 -n 5 -d ../../data/clones_simul.fa
$ Window extractions
1:found 25 50-windows in 66 segments
$ Some clustering
1:==> 2 clones
1:==> 2 clusters
$ Clone 1 output
1:Clone #001 .* 36 reads
f1:Clone #001 .* 36 reads
$ Clone 2 output
1:Clone #002 .* 23 reads
f1:Clone #002 .* 23 reads
......@@ -14,15 +14,18 @@ void testSegmentationBug1(int delta_min, int delta_max) {
Fasta seqV("../../germline/TRGV.fa");
Fasta seqJ("../../germline/TRGJ.fa");
IKmerStore<KmerAffect> *index = new ArrayKmerStore<KmerAffect>(k, rc);
index->insert(seqV, "V");
index->insert(seqJ, "J");
Germline *germline ;
germline = new Germline(seqV, seqV, seqJ, "##############", delta_min, delta_max);
MultiGermline *multi ;
multi = new MultiGermline();
multi->insert(germline);
OnlineFasta input(buggy_sequences);
while (input.hasNext()) {
input.next();
KmerAffectAnalyser<KmerAffect> *kaa = new KmerAffectAnalyser<KmerAffect>(*index, input.getSequence().sequence);
KmerAffectAnalyser<KmerAffect> *kaa = new KmerAffectAnalyser<KmerAffect>(*(germline->index), input.getSequence().sequence);
set<KmerAffect> distinct_a = kaa->getDistinctAffectations();
int strand = 0;
......@@ -37,8 +40,8 @@ void testSegmentationBug1(int delta_min, int delta_max) {
}
}
Segmenter *segment = new KmerSegmenter(input.getSequence(), index,
delta_min, delta_max);
Segmenter *segment = new KmerSegmenter(input.getSequence(), multi);
if (strand == 2
|| (strand == 1
......@@ -52,7 +55,7 @@ void testSegmentationBug1(int delta_min, int delta_max) {
delete segment;
delete kaa;
}
delete index;
delete germline;
}
void testBugs() {
......
......@@ -27,25 +27,25 @@ void testCluster() {
Sequence seq = {"", "", "", "", NULL};
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC", seq, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT", seq, 0, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", seq, 0, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG", seq, 0, 0);
windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAA", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGG", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCC", seq, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAA", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGG", seq, 0, 0);
windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCC", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTT", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAA", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCCCCC", seq, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTT", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAA", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG", seq, 0, 0);
windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCCCCC", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTTTTTT", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCAAAAAAGCTAAAAAA", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGTCTAGGGGG", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCATGCCCCCC", seq, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTTTTTT", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCAAAAAAGCTAAAAAA", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGTCTAGGGGG", seq, 0, 0);
windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCATGCCCCCC", seq, 0, 0);
comp_matrix comp=comp_matrix(windows);
......