Commit dad356da authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

algo: experimental '-x' option, MAX12 pseudo-germline

Try to detect unexpected recombinations
parents 7c403883 c7a85b67
......@@ -191,6 +191,35 @@ const string &KmerAffectAnalyser::getSequence() const{
}
pair <KmerAffect, KmerAffect> KmerAffectAnalyser::sortLeftRight(const pair <KmerAffect, KmerAffect> ka12) const {
KmerAffect ka1 = ka12.first;
KmerAffect ka2 = ka12.second;
int ka1_count = 0; int ka1_pos = 0;
int ka2_count = 0; int ka2_pos = 0;
for (size_t i = 0; i < affectations.size(); i++) {
if (affectations[i] == ka1)
{
ka1_count++ ; ka1_pos += i ;
}
else if (affectations[i] == ka2)
{
ka2_count++ ; ka2_pos += i ;
}
}
// Is ka1 'more on the left' than ka2 ?
// We check for (k1_pos / ka1_count > ka2_pos / ka2_count), but without floats
if (ka1_pos * ka2_count < ka2_pos * ka1_count)
return make_pair(ka1, ka2);
else
return make_pair(ka2, ka1);
}
int KmerAffectAnalyser::first(const KmerAffect &affect) const{
for (size_t i = 0; i < affectations.size(); i++)
if (affect == affectations[i])
......
......@@ -186,6 +186,13 @@ class KmerAffectAnalyser: public AffectAnalyser {
const string &getSequence() const;
/**
* @param A pair of KmerAffects
* @return The same pair of KmerAffects, but sorted.
* The first one is 'more on the left' than the second one.
*/
pair <KmerAffect, KmerAffect> sortLeftRight(const pair <KmerAffect, KmerAffect> ka12) const;
int first(const KmerAffect &affect) const;
int last(const KmerAffect &affect) const ;
......
......@@ -24,6 +24,14 @@ void Germline::init(string _code, char _shortcut,
stats_clones.setLabel("");
}
Germline::Germline(string _code, char _shortcut,
int _delta_min, int _delta_max)
{
init(_code, _shortcut, _delta_min, _delta_max);
}
Germline::Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
int _delta_min, int _delta_max)
......@@ -84,22 +92,22 @@ void Germline::new_index(string seed)
update_index();
}
void Germline::use_index(IKmerStore<KmerAffect> *_index)
void Germline::set_index(IKmerStore<KmerAffect> *_index)
{
index = _index;
update_index();
}
void Germline::update_index()
void Germline::update_index(IKmerStore<KmerAffect> *_index)
{
index->insert(rep_5, affect_5);
if (!_index) _index = index ;
_index->insert(rep_5, affect_5);
if (affect_4.size())
index->insert(rep_4, affect_4);
_index->insert(rep_4, affect_4);
index->insert(rep_3, affect_3);
_index->insert(rep_3, affect_3);
}
void Germline::mark_as_ambiguous(Germline *other)
......@@ -138,6 +146,7 @@ ostream &operator<<(ostream &out, const Germline &germline)
MultiGermline::MultiGermline(bool _one_index_per_germline)
{
index = NULL;
one_index_per_germline = _one_index_per_germline;
}
......@@ -202,7 +211,7 @@ void MultiGermline::build_incomplete_set(string path)
}
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index)
void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set_index)
{
for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
{
......@@ -211,15 +220,18 @@ void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index)
if (germline->rep_4.size())
germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
germline->use_index(_index) ;
germline->update_index(_index);
if (set_index)
germline->set_index(_index);
}
}
void MultiGermline::build_with_one_index(string seed)
void MultiGermline::build_with_one_index(string seed, bool set_index)
{
bool rc = true ;
index = KmerStoreFactory::createIndex<KmerAffect>(seed, rc);
insert_in_one_index(index);
insert_in_one_index(index, set_index);
}
void MultiGermline::out_stats(ostream &out)
......
......@@ -9,6 +9,8 @@
#include "stats.h"
#include "tools.h"
#define PSEUDO_GERMLINE_MAX12 "xxx"
using namespace std;
class Germline {
......@@ -16,8 +18,6 @@ class Germline {
void init(string _code, char _shortcut,
int _delta_min, int _delta_max);
void update_index();
public:
/*
* @param delta_min: the minimal distance between the right bound and the left bound
......@@ -40,13 +40,18 @@ class Germline {
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
int _delta_min, int _delta_max);
Germline(string _code, char _shortcut,
int _delta_min, int _delta_max);
~Germline();
string code ;
char shortcut ;
void new_index(string seed);
void use_index(IKmerStore<KmerAffect> *index);
void set_index(IKmerStore<KmerAffect> *index);
void update_index(IKmerStore<KmerAffect> *_index = NULL);
void mark_as_ambiguous(Germline *other);
......@@ -78,9 +83,9 @@ ostream &operator<<(ostream &out, const Germline &germline);
class MultiGermline {
private:
bool one_index_per_germline;
public:
bool one_index_per_germline;
list <Germline*> germlines;
// A unique index can be used
......@@ -94,8 +99,10 @@ class MultiGermline {
void build_default_set(string path);
void build_incomplete_set(string path);
void insert_in_one_index(IKmerStore<KmerAffect> *_index);
void build_with_one_index(string seed);
// Creates and update an unique index for all the germlines
// If 'set_index' is set, set this index as the index for all germlines
void insert_in_one_index(IKmerStore<KmerAffect> *_index, bool set_index);
void build_with_one_index(string seed, bool set_index);
void mark_cross_germlines_as_ambiguous();
......
......@@ -26,6 +26,7 @@
#include "tools.h"
#include "affectanalyser.h"
#include <sstream>
#include <cstring>
#include <string>
Segmenter::~Segmenter() {}
......@@ -196,6 +197,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
sequence = seq.sequence ;
info = "" ;
info_extra = "seed";
detected = false ;
segmented = false;
segmented_germline = germline ;
reversed = false;
......@@ -233,6 +235,26 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
KmerAffect before, after;
if (!strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX12))
{ // Pseudo-germline, max12
set<KmerAffect> forbidden;
forbidden.insert(KmerAffect::getAmbiguous());
forbidden.insert(KmerAffect::getUnknown());
CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
pair <KmerAffect, KmerAffect> max12 = ckaa.sortLeftRight(ckaa.max12(forbidden));
strand = nb_strand[0] > nb_strand[1] ? -1 : 1 ;
computeSegmentation(strand, max12.first, max12.second);
// The pseudo-germline should never take precedence over the regular germlines
score = 1 ;
}
else
{ // Regular germline
// Test on which strand we are, select the before and after KmerAffects
if (nb_strand[0] == 0 && nb_strand[1] == 0) {
because = UNSEG_TOO_FEW_ZERO ;
......@@ -252,9 +274,11 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
return ;
}
detected = false ;
computeSegmentation(strand, before, after);
} // endif Pseudo-germline
if (! because)
{
// Now we check the delta between Vend and right
......@@ -596,6 +620,9 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
CDR3start = -1;
CDR3end = -1;
if (!germline->rep_5.size() || !germline->rep_3.size())
return ;
// TODO: factoriser tout cela, peut-etre en lancant deux segmenteurs, un +, un -, puis un qui chapote
// Strand +
......
!LAUNCH: ../../vidjil -g ../../germline -2 -i ../../data/chimera-fake.fa
$ Segment all reads
1:segmented 3 reads
$ Segment all reads on 'xxx'
1:xxx .* -> .* 3
!LAUNCH: ../../vidjil -g ../../germline -i ../../data/chimera-fake.fa
$ Do not segment on any germline, even incomplete
1:segmented 0 reads
......@@ -176,6 +176,9 @@ void testAffectAnalyserMaxes() {
TAP_TEST(ckaa.max12(forbidden).first == gAffect, TEST_COUNT_AA_MAX12, "max1 is " << ckaa.max12(forbidden).first);
TAP_TEST(ckaa.max12(forbidden).second == cAffect, TEST_COUNT_AA_MAX12, "max2 is " << ckaa.max12(forbidden).second);
TAP_TEST(ckaa.sortLeftRight(ckaa.max12(forbidden)).first == cAffect, TEST_AA_SORT_LEFT_RIGHT, "bad max12, left");
TAP_TEST(ckaa.sortLeftRight(ckaa.max12(forbidden)).second == gAffect, TEST_AA_SORT_LEFT_RIGHT, "bad max12, right");
forbidden.insert(gAffect);
TAP_TEST(ckaa.max(forbidden) == cAffect, TEST_COUNT_AA_MAX, "max is " << ckaa.max(forbidden));
TAP_TEST(ckaa.max12(forbidden).first == cAffect, TEST_COUNT_AA_MAX12, "max1 is " << ckaa.max12(forbidden).first);
......
......@@ -78,6 +78,7 @@ enum {
TEST_COUNT_AA_LAST_MAX,
TEST_COUNT_AA_MAX,
TEST_COUNT_AA_MAX12,
TEST_AA_SORT_LEFT_RIGHT,
/* Cluster */
TEST_CLUSTER,
......
......@@ -162,6 +162,7 @@ void usage(char *progname, bool advanced)
cerr << "Experimental options (do not use)" << endl
<< " -I ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)" << endl
<< " -1 use a unique index for all germline systems (experimental, must be used with -g, do not use)" << endl
<< " -2 try to detect unexpected recombinations (experimental, must be used with -g, do not use)" << endl
<< " -! keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example -!AX 20)" << endl
<< endl
......@@ -326,6 +327,7 @@ int main (int argc, char **argv)
bool multi_germline_incomplete = false;
bool multi_germline_mark = false;
bool multi_germline_one_index_per_germline = true;
bool multi_germline_unexpected_recombinations = false;
string multi_germline_file = DEFAULT_MULTIGERMLINE;
string forced_edges = "" ;
......@@ -344,7 +346,7 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "A!x:X:hHaiI1g:G:V:D:J:k:r:vw:e:C:f:l:c:m:M:N:s:b:Sn:o:L%:y:z:uUK3E:")) != EOF)
while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:l:c:m:M:N:s:b:Sn:o:L%:y:z:uUK3E:")) != EOF)
switch (c)
{
......@@ -406,7 +408,11 @@ int main (int argc, char **argv)
case '1':
multi_germline_one_index_per_germline = false ;
break;
case '2':
multi_germline_unexpected_recombinations = true ;
break;
case 'G':
germline_system = string(optarg);
f_reps_V.push_back((germline_system + "V.fa").c_str()) ;
......@@ -762,8 +768,6 @@ int main (int argc, char **argv)
if (multi_germline)
{
multigermline->build_default_set(multi_germline_file);
if (multi_germline_incomplete)
multigermline->build_incomplete_set(multi_germline_file);
}
else
{
......@@ -781,8 +785,25 @@ int main (int argc, char **argv)
cout << endl ;
if (!multi_germline_one_index_per_germline)
multigermline->build_with_one_index(seed);
if (!multi_germline_one_index_per_germline) {
multigermline->build_with_one_index(seed, true);
}
if (multi_germline_unexpected_recombinations) {
if (!multigermline->index) {
multigermline->build_with_one_index(seed, false);
}
Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, 80);
pseudo->index = multigermline->index ;
multigermline->germlines.push_back(pseudo);
}
// Should come after the initialization of regular (and possibly pseudo) germlines
if (multi_germline_incomplete) {
multigermline->one_index_per_germline = true; // Starting from now, creates new indexes
multigermline->build_incomplete_set(multi_germline_file);
}
if (multi_germline_mark)
multigermline->mark_cross_germlines_as_ambiguous();
......
# Fake chimera sequences, 2 x (first 60 bp) of some V genes
>TRAV1-1*01--TRB--V1*01
ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagata
gatactggaattacccagacaccaaaatacctggtcacagcaatggggagtaaaaggaca
>TRGV1*01--TRDV1*01
tcttccaacttggaagggagaacgaagtcagtcaccaggctgactgggtcatctgctgaa
gcccagaaggttactcaagcccagtcatcagtatccatgccagtgaggaaagcagtcacc
>IGHV1-18*01--IGKV1-12*01
caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctcagtgaaggtc
gacatccagatgacccagtctccatcttccgtgtctgcatctgtaggagacagagtcacc
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment