Commit 31bbba7b authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge - new germline index handling, streamlined '-y'/'-z' options

parents 7ca20a8d 9e67e864
......@@ -25,7 +25,7 @@ test_with_fuse:
unit: all
@echo "*** Launching unit tests..."
make COVERAGE="$(COVERAGE_OPTION)" -C $(VIDJIL_ALGO_SRC)/tests
@echo "*** All .should_get tests passed"
@echo "*** All unit tests passed"
pytests:
@echo "*** Launching python tests..."
......
......@@ -24,7 +24,12 @@
return lh.second>rh.second;
}
comp_matrix::comp_matrix(WindowsStorage &ws):windows(ws){}
comp_matrix::comp_matrix(list<pair <junction, int> > sc)
{
sort_clones = sc;
matrix_size = sort_clones.size();
if (matrix_size > JUNCTION_LIMIT) matrix_size = JUNCTION_LIMIT;
}
void comp_matrix::compare(ostream &out, Cost cluster_cost)
{
......@@ -37,30 +42,33 @@ void comp_matrix::compare(ostream &out, Cost cluster_cost)
out << " Using cost " << compareCost << endl ;
string j1, j2;
m = alloc_matrix(windows.size());
m = alloc_matrix(sort_clones.size());
int c=0;
int c1=0;
int c2=0;
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
size_t i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
for (mjs::const_iterator it1 = it0;
it1 != windows.getMap().end(); ++it1 )
{
j2=it1->first;
DynProg dp = DynProg(j1, j2, DynProg::Local, compareCost);
int score=dp.compute();
int distance = max(j1.size(), j2.size())-score;
m[c2][c1]=distance;
m[c1][c2]=distance;
c1++;
c++;
}//fin it1
size_t k = 0;
for (list <pair<junction,int> >::const_iterator it1 = it0;
(it1 != sort_clones.end()) & (k<matrix_size); ++it1)
{
k++;
j2=it1->first;
DynProg dp = DynProg(j1, j2, DynProg::Local, compareCost);
int score=dp.compute();
int distance = max(j1.size(), j2.size())-score;
m[c2][c1]=distance;
m[c1][c2]=distance;
c1++;
c++;
}//fin it1
c2++;
c1=c2;
}//fin it0
......@@ -69,14 +77,15 @@ void comp_matrix::compare(ostream &out, Cost cluster_cost)
}
void comp_matrix::load(string file){
m = alloc_matrix(windows.size());
char* tampon=(char*)malloc(windows.size()*sizeof(char));
m = alloc_matrix(sort_clones.size());
char* tampon=(char*)malloc(sort_clones.size()*sizeof(char));
ifstream in_comp(file.c_str());
for(unsigned int i=0; i<windows.size();i++){
in_comp.read(tampon, windows.size()*sizeof(char));
for(unsigned int j=0;j<windows.size(); j++){
for(unsigned int i=0; i<sort_clones.size();i++){
in_comp.read(tampon, sort_clones.size()*sizeof(char));
for(unsigned int j=0;j<sort_clones.size(); j++){
m[i][j]=tampon[j];
}
}
......@@ -90,8 +99,8 @@ void comp_matrix::save(string file){
ofstream out_comp(file.c_str());
for(unsigned int i=0; i<windows.size();i++){
out_comp.write((char *)m[i],windows.size()*sizeof(char));
for(unsigned int i=0; i<sort_clones.size();i++){
out_comp.write((char *)m[i],sort_clones.size()*sizeof(char));
}
out_comp.close();
......@@ -99,7 +108,7 @@ void comp_matrix::save(string file){
void comp_matrix::del(){
for (unsigned int i=0;i<windows.size();i++){
for (unsigned int i=0;i<sort_clones.size();i++){
free(m[i]);
}
free(m);
......@@ -125,40 +134,44 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int c1=0;
int c2=0;
for (mjs::const_iterator ite = windows.getMap().begin();
ite != windows.getMap().end(); ++ite )
size_t i = 0;
for (list <pair<junction,int> >::const_iterator ite = sort_clones.begin();
(ite != sort_clones.end()) & (i<matrix_size); ++ite)
{
i++;
n_j++;
list <string> voisins ;
j1=ite->first;
neighbor[j1]=voisins;
}
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
{
i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
int k=it0->second.size();
int k=it0->second;
count[j1]=k;
n_j2+=k;
for (mjs::const_iterator it1 = windows.getMap().begin();
it1 != windows.getMap().end(); ++it1 )
size_t j = 0;
for (list <pair<junction,int> >::const_iterator it1 = sort_clones.begin();
(it1 != sort_clones.end()) & (j<matrix_size); ++it1)
{
j2=it1->first;
int distance = (int)m[c2][c1];
if (distance <= epsilon){
neighbor[j1].push_back(j2);
}
c1++;
c++;
}//fin it1
c2++;
c1=0;
}//fin it0
j++;
j2=it1->first;
int distance = (int)m[c2][c1];
if (distance <= epsilon){
neighbor[j1].push_back(j2);
}
c1++;
c++;
}//fin it1
c2++;
c1=0;
}//fin it0
/////////////////////////
//Forced - edges
......@@ -213,9 +226,11 @@ list<list<junction> > comp_matrix::cluster(string forced_edges, int w, ostream
int noise = 0;
int nb_comp = 0 ;
for (mjs::const_iterator it0 = windows.getMap().begin();
it0 != windows.getMap().end(); ++it0 )
i = 0;
for (list <pair<junction,int> >::const_iterator it0 = sort_clones.begin();
(it0 != sort_clones.end()) & (i<matrix_size); ++it0)
{
i++;
j1=it0->first;
if(visit[j1]==false && clust[j1]==false){
......@@ -382,6 +397,27 @@ void comp_matrix::stat_cluster( list<list<junction> > cluster, ostream &out )
}
JsonArray comp_matrix::toJson(list<list<junction> > clusterList)
{
JsonArray result;
for (list <list <string> >::iterator it = clusterList.begin();
it != clusterList.end(); ++it )
{
JsonArray cluster;
list<string> li=*it;
for (list<string>::iterator it2 = li.begin();
it2 != li.end(); ++it2 )
{
cluster.add(*it2);
}
result.add(cluster);
}
return result;
}
char **comp_matrix::alloc_matrix(size_t s) {
char **matrix = (char**)malloc(s*sizeof(char*));
for (size_t i=0;i<s;i++){
......
......@@ -9,24 +9,27 @@
#include <ctime>
#include "dynprog.h"
#include "windows.h"
#include "json.h"
using namespace std ;
#define SIMILAR_JUNCTIONS_THRESHOLD 1
#define JUNCTION_LIMIT 200
class comp_matrix {
public:
char ** m;
WindowsStorage &windows;
list<pair <junction, int> > sort_clones;
map <string, int> count;
int n_j;
int n_j2;
size_t matrix_size;
/**
* create new distance matrix
*/
comp_matrix(WindowsStorage &windows);
comp_matrix(list<pair <junction, int> > sc);
/**
* init matrix with a KmerStore and compute distance value between sequences
......@@ -65,6 +68,9 @@ class comp_matrix {
void del();
void stat_cluster( list<list<junction> > cluster, ostream &out=cout);
JsonArray toJson( list<list<junction> > cluster);
private:
......
......@@ -3,11 +3,15 @@
Germline::Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed,
int _delta_min, int _delta_max)
{
code = _code ;
shortcut = _shortcut ;
index = 0 ;
affect_5 = "V" ;
affect_4 = "";
affect_3 = "J" ;
rep_5 = Fasta(f_rep_5, 2, "|", cout);
rep_4 = Fasta(f_rep_4, 2, "|", cout);
......@@ -16,21 +20,23 @@ Germline::Germline(string _code, char _shortcut,
delta_min = _delta_min ;
delta_max = _delta_max ;
build_index(seed);
stats.setLabel(code);
}
Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
Germline::Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
int _delta_min, int _delta_max)
{
code = "" ;
shortcut = 'X' ;
code = _code ;
shortcut = _shortcut ;
index = 0 ;
// affect_5 = KmerAffect("", "V", 0) ;
// affect_3 = KmerAffect("", "J", 0) ;
affect_5 = "V" ;
affect_4 = "" ;
affect_3 = "J" ;
rep_5 = _rep_5 ;
rep_4 = _rep_4 ;
......@@ -39,23 +45,41 @@ Germline::Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
delta_min = _delta_min ;
delta_max = _delta_max ;
build_index(seed);
stats.setLabel(code);
}
void Germline::build_index(string seed)
void Germline::new_index(string seed)
{
bool rc = true ;
index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
index->insert(rep_5, "V"); // affect_5);
index->insert(rep_3, "J"); // affect_3);
update_index();
}
void Germline::use_index(IKmerStore<KmerAffect> *_index)
{
index = _index;
update_index();
}
void Germline::update_index()
{
index->insert(rep_5, affect_5);
if (affect_4.size())
index->insert(rep_4, affect_4);
index->insert(rep_3, affect_3);
cout << " --- index " << index << " updated " << affect_5 << "/" << affect_4 << "/" << affect_3 << endl;
}
Germline::~Germline()
{
delete index;
if (index)
delete index;
}
ostream &operator<<(ostream &out, const Germline &germline)
......@@ -88,8 +112,8 @@ MultiGermline::MultiGermline(string f_germlines_json)
Fasta rep_3(f_rep_3, 2, "|", cout);
Germline *germline;
germline = new Germline(rep_5, rep_4, rep_3,
seed,
germline = new Germline("TRG", 'G',
rep_5, rep_4, rep_3,
delta_min, delta_max);
germlines.push_back(germline);
......@@ -107,12 +131,52 @@ void MultiGermline::insert(Germline *germline)
germlines.push_back(germline);
}
void MultiGermline::load_default_set(string path)
void MultiGermline::build_default_set(string path)
{
Germline *germline;
germline = new Germline("TRG", 'G', path + "/TRGV.fa", "", path + "/TRGJ.fa", -10, 20);
germline->new_index("#####-#####");
germlines.push_back(germline);
germline = new Germline("IGH", 'H', path + "/IGHV.fa", path + "/IGHD.fa", path + "/IGHJ.fa", 0, 80);
germline->new_index("######-######");
germlines.push_back(germline);
}
void MultiGermline::load_standard_set(string path)
{
germlines.push_back(new Germline("TRA", 'A', path + "/TRAV.fa", "", path + "/TRAJ.fa", -10, 20));
germlines.push_back(new Germline("TRB", 'B', path + "/TRBV.fa", path + "/TRBD.fa", path + "/TRBJ.fa", -10, 20));
germlines.push_back(new Germline("TRG", 'G', path + "/TRGV.fa", "", path + "/TRGJ.fa", -10, 20));
germlines.push_back(new Germline("TRD", 'D', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRDJ.fa", 0, 80));
germlines.push_back(new Germline("IGH", 'H', path + "/IGHV.fa", path + "/IGHD.fa", path + "/IGHJ.fa", 0, 80));
germlines.push_back(new Germline("IGK", 'K', path + "/IGKV.fa", "", path + "/IGKJ.fa", -10, 20));
germlines.push_back(new Germline("IGL", 'L', path + "/IGLV.fa", "", path + "/IGLJ.fa", -10, 20));
}
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index)
{
germlines.push_back(new Germline("TRG", 'G', path + "/TRGV.fa", "", path + "/TRGJ.fa", "#####-#####", -10, 20));
germlines.push_back(new Germline("IGH", 'H', path + "/IGHV.fa", path + "/IGHD.fa", path + "/IGHJ.fa", "######-######", 0, 80));
for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
{
Germline *germline = *it ;
germline->affect_5 = string(1, germline->shortcut) + "-" + germline->code + "V";
if (germline->rep_4.size())
germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
germline->affect_3 = string(1, tolower(germline->shortcut)) + "-" + germline->code + "J";
germline->use_index(_index) ;
}
}
void MultiGermline::build_with_one_index(string seed)
{
bool rc = true ;
index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
insert_in_one_index(index);
}
void MultiGermline::out_stats(ostream &out)
{
......
......@@ -12,7 +12,7 @@ using namespace std;
class Germline {
private:
void build_index(string seed);
void update_index();
public:
/*
......@@ -26,11 +26,10 @@ class Germline {
Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
string seed,
int _delta_min, int _delta_max);
Germline(Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
string seed,
Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
int _delta_min, int _delta_max);
~Germline();
......@@ -38,9 +37,15 @@ class Germline {
string code ;
char shortcut ;
void new_index(string seed);
void use_index(IKmerStore<KmerAffect> *index);
// KmerAffect affect_5 ;
// KmerAffect affect_3 ;
string affect_5 ;
string affect_4 ;
string affect_3 ;
Fasta rep_5 ;
Fasta rep_4 ;
Fasta rep_3 ;
......@@ -62,12 +67,19 @@ class MultiGermline {
public:
list <Germline*> germlines;
// A unique index can be used
IKmerStore<KmerAffect> *index;
MultiGermline();
MultiGermline(string f_germlines_json);
~MultiGermline();
void insert(Germline *germline);
void load_default_set(string path);
void build_default_set(string path);
void load_standard_set(string path);
void insert_in_one_index(IKmerStore<KmerAffect> *_index);
void build_with_one_index(string seed);
void out_stats(ostream &out);
};
......
......@@ -164,7 +164,7 @@ string JsonArray::toString(){
stream << " [ ";
for ( list<string>::iterator i=l.begin(); i!= l.end(); ++i){
if (i!=l.begin()) stream << ", ";
if (i!=l.begin()) stream << ", " << endl;
stream << (*i);
}
......
......@@ -75,7 +75,7 @@ KmerAffect::KmerAffect(const KmerAffect &ka) {
affect = ka.affect;
}
KmerAffect::KmerAffect(const string &kmer, const string &label,
KmerAffect::KmerAffect(const string &label,
int strand) {
affect.c = label[0];
if (strand == 1)
......@@ -131,7 +131,7 @@ KmerAffect KmerAffect::getUnknown() {
}
bool KmerAffect::isAmbiguous() const {
return affect_strand(affect) == 1 && affect_char(affect) == 0;
return affect_strand(affect) == 1 && affect_char(affect) == 1;
}
bool KmerAffect::isUnknown() const {
......@@ -177,7 +177,7 @@ KmerStringAffect::KmerStringAffect(const KmerStringAffect &ksa):
label(ksa.label),strand(ksa.strand){}
KmerStringAffect::KmerStringAffect(const string &kmer, const string &label,
KmerStringAffect::KmerStringAffect(const string &label,
int strand) {
this->label = label;
this->strand = strand;
......
......@@ -69,7 +69,7 @@ public:
* Construct an affectation as stated by the parameters
* @post affect_strand(affect) == strand AND affect_char(affect) == kmer[0]
*/
KmerAffect(const string &kmer, const string &label="", int strand=1);
KmerAffect(const string &label, int strand=1);
/**
* Add another affectation to the current one.
* @post The current affectation is not modified if the parameter is the same
......@@ -142,17 +142,17 @@ ostream &operator<<(ostream &os, const KmerAffect &kmer);
/**
* Constant defining the unknown affectation (not known yet)
*/
const KmerAffect AFFECT_UNKNOWN = KmerAffect("", "\0", 0);
const KmerAffect AFFECT_UNKNOWN = KmerAffect("\0", 0);
/**
* Constant defining the ambiguous affectation (many possibilities)
*/
const KmerAffect AFFECT_AMBIGUOUS = KmerAffect("", "\0", 1);
const KmerAffect AFFECT_AMBIGUOUS = KmerAffect("\1", 1);
const KmerAffect AFFECT_V = KmerAffect("", "V", 1);
const KmerAffect AFFECT_J = KmerAffect("", "J", 1);
const KmerAffect AFFECT_V = KmerAffect("V", 1);
const KmerAffect AFFECT_J = KmerAffect("J", 1);
const KmerAffect AFFECT_V_BWD = KmerAffect("", "V", -1);
const KmerAffect AFFECT_J_BWD = KmerAffect("", "J", -1);
const KmerAffect AFFECT_V_BWD = KmerAffect("V", -1);
const KmerAffect AFFECT_J_BWD = KmerAffect("J", -1);
////////////////////////////////////////////////////////////////////////////////////////////////////
......@@ -176,7 +176,7 @@ public:
* Construct an affectation as stated by the parameters
* @post affect_strand(affect) == strand AND affect_char(affect) == kmer[0]
*/
KmerStringAffect(const string &kmer, const string &label="", int strand=1);
KmerStringAffect(const string &label, int strand=1);
/**
* Add another affectation to the current one.
* @post The current affectation is not modified if the parameter is the same
......@@ -244,6 +244,6 @@ bool operator>=(const KmerStringAffect &k1, const KmerStringAffect &k2);
ostream &operator<<(ostream &os, const KmerStringAffect &kmer);
const KmerStringAffect KSA_UNKNOWN = KmerStringAffect();
const KmerStringAffect KSA_AMBIGUOUS = KmerStringAffect("", "", 2);
const KmerStringAffect KSA_AMBIGUOUS = KmerStringAffect("", 2);
#endif
......@@ -26,7 +26,7 @@
Kmer::Kmer():count(0) {}
Kmer::Kmer(const string &kmer, const string &label, int strand) {
Kmer::Kmer(const string &label, int strand) {
count = 1;
}
......
......@@ -16,7 +16,12 @@ public:
unsigned int count;
Kmer();
Kmer(const seqtype &kmer, const string &label="", int strand=1);
/**
* This constructor is used via a IKmerStore<Kmer> index (hence the argument list)
*/
Kmer(const string &label, int strand=1);
Kmer &operator+=(const Kmer &);
static bool hasRevcompSymetry();
} ;
......@@ -37,17 +42,19 @@ public:
virtual ~IKmerStore();
list< pair <T, string> > labels;
/**
* @param input: A single FASTA file
* @param label: label that must be associated to the given files
* @post All the sequences in the FASTA files have been indexed.
* @post All the sequences in the FASTA files have been indexed, and the label is stored in the list of labels