Une MAJ de sécurité est nécessaire sur notre version actuelle. Elle sera effectuée lundi 02/08 entre 12h30 et 13h. L'interruption de service devrait durer quelques minutes (probablement moins de 5 minutes).

Commit 12bcb3cc authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge - experimental MAX1U segmenting (-4)

This experimental option tries to segment reads while detecting
translocations involving, for one part, the V(D)J region.

It works - but the probabilistic model is far too stringent here,
giving a p-value of about 0.1-0.2 for 40 unknown kmers.
Somehow, if one half of the read is in V(D)J, then having
unknown kmers in the other half is unexpected !
parents 4a83f281 a2eb70ef
......@@ -157,9 +157,11 @@ affect_infos KmerAffectAnalyser::getMaximum(const KmerAffect &before,
results.nb_before_right++;
}
left_evalue = kms.getProbabilityAtLeastOrAbove(results.nb_before_left,
left_evalue = kms.getProbabilityAtLeastOrAbove(before,
results.nb_before_left,
1 + results.first_pos_max);
right_evalue = kms.getProbabilityAtLeastOrAbove(results.nb_after_right,
right_evalue = kms.getProbabilityAtLeastOrAbove(after,
results.nb_after_right,
seq.size() - 1 - results.last_pos_max);
/* Main test:
......@@ -179,8 +181,8 @@ affect_infos KmerAffectAnalyser::getMaximum(const KmerAffect &before,
}
double KmerAffectAnalyser::getProbabilityAtLeastOrAbove(int at_least) const {
return kms.getProbabilityAtLeastOrAbove(at_least, seq.size());
double KmerAffectAnalyser::getProbabilityAtLeastOrAbove(const KmerAffect &kmer, int at_least) const {
return kms.getProbabilityAtLeastOrAbove(kmer, at_least, seq.size());
}
pair <double, double> KmerAffectAnalyser::getLeftRightProbabilityAtLeastOrAbove() const {
......
......@@ -177,7 +177,7 @@ class KmerAffectAnalyser: public AffectAnalyser {
/**
* @return probability that the number of kmers is 'at_least' or more
*/
double getProbabilityAtLeastOrAbove(int at_least) const;
double getProbabilityAtLeastOrAbove(const KmerAffect &kmer, int at_least) const;
/**
* @return probabilities that the number of left/right kmers is 'at_least' or more
......
......@@ -11,6 +11,7 @@
#include "../lib/json.hpp"
#define PSEUDO_GERMLINE_MAX12 "xxx"
#define PSEUDO_GERMLINE_MAX1U "yyy"
using namespace std;
using json = nlohmann::json;
......
......@@ -154,16 +154,24 @@ bool operator>=(const KmerAffect &a1, const KmerAffect &a2);
bool operator!=(const KmerAffect &a1, const KmerAffect &a2);
ostream &operator<<(ostream &os, const KmerAffect &kmer);
#define AFFECT_NOT_UNKNOWN_SYMBOL "*"
#define AFFECT_AMBIGUOUS_SYMBOL "?"
#define AFFECT_UNKNOWN_SYMBOL "_"
#define AFFECT_AMBIGUOUS_CHAR (AFFECT_AMBIGUOUS_SYMBOL[0])
#define AFFECT_UNKNOWN_CHAR (AFFECT_UNKNOWN_SYMBOL[0])
/**
* Constant defining any not-unknown affectation
* Could be used by .getIndexLoad(), but now any non-AFFECT_UNKNOWN kmer will work.
*/
const KmerAffect AFFECT_NOT_UNKNOWN = KmerAffect(AFFECT_NOT_UNKNOWN_SYMBOL, 0);
/**
* Constant defining the unknown affectation (not known yet)
*/
const KmerAffect AFFECT_UNKNOWN = KmerAffect(AFFECT_UNKNOWN_SYMBOL, 0);
/**
* Constant defining the ambiguous affectation (many possibilities)
*/
......
......@@ -92,6 +92,7 @@ public:
/**
* @return the percentage of kmers that are set in the index
*/
float getIndexLoad(T kmer) const;
float getIndexLoad() const;
/**
......@@ -103,7 +104,7 @@ public:
/**
* @return probability that the number of kmers is 'at_least' or more in a sequence of length 'length'
*/
double getProbabilityAtLeastOrAbove(int at_least, int length) const;
double getProbabilityAtLeastOrAbove(T kmer, int at_least, int length) const;
/**
* @return the value of k
......@@ -279,6 +280,10 @@ void IKmerStore<T>::insert(const seqtype &sequence,
}
}
template<class T>
float IKmerStore<T>::getIndexLoad(const T kmer) const {
return kmer.isUnknown() ? 1 - getIndexLoad() : getIndexLoad();
}
template<class T>
float IKmerStore<T>::getIndexLoad() const {
return nb_kmers_inserted / pow(4.0, k);
......@@ -293,11 +298,11 @@ int IKmerStore<T>::atMostMaxSizeIndexing(int n) const {
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(int at_least, int length) const {
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) const {
// n: number of kmers in the sequence
int n = length - getS() + 1;
float index_load = getIndexLoad() ;
float index_load = getIndexLoad(kmer) ;
double proba = 0;
double probability_having_system = pow(index_load, at_least);
......@@ -333,7 +338,7 @@ string IKmerStore<T>::getLabel(T kmer) const {
for (typename list< pair<T, string> >::const_iterator it = labels.begin(); it != labels.end(); ++it)
if (it->first == kmer)
return it->second ;
return "" ;
return "?" ;
}
// .getResults()
......
......@@ -247,25 +247,48 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
score = nb_strand[0] + nb_strand[1] ; // Used only for non-segmented germlines
if (!strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX12))
{ // Pseudo-germline, max12
if ((!strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX12)
|| (!strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX1U))))
{ // Pseudo-germline, MAX12 and MAX1U
pair <KmerAffect, KmerAffect> max12 ;
CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
set<KmerAffect> forbidden;
forbidden.insert(KmerAffect::getAmbiguous());
forbidden.insert(KmerAffect::getUnknown());
CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
pair <KmerAffect, KmerAffect> max12 = ckaa.sortLeftRight(ckaa.max12(forbidden));
if (!strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX12))
// MAX12: two maximum k-mers (no unknown)
{
max12 = ckaa.max12(forbidden);
before = max12.first ;
after = max12.second ;
if (max12.first.isUnknown() || max12.second.isUnknown())
{
because = UNSEG_TOO_FEW_ZERO ;
return ;
}
}
if (max12.first.isUnknown() || max12.second.isUnknown())
else
// MAX1U: the maximum k-mers (no unknown) + unknown
{
because = UNSEG_TOO_FEW_ZERO ;
return ;
CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
KmerAffect max = ckaa.max(forbidden);
if (max.isUnknown())
{
because = UNSEG_TOO_FEW_ZERO ;
return ;
}
max12 = make_pair(max, KmerAffect::getUnknown());
}
pair <KmerAffect, KmerAffect> before_after = ckaa.sortLeftRight(max12);
before = before_after.first ;
after = before_after.second ;
// This strand computation is only a heuristic, especially for chimera +/- reads
// Anyway, it allows to gather such reads and their reverse complement into a unique window...
// ... except when the read is quite different outside the window
......@@ -619,9 +642,10 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
if (!germline->rep_5.size() || !germline->rep_3.size())
{
// We check whether this sequence is segmented with MAX12 (with default e-value parameters)
// We check whether this sequence is segmented with MAX12 or MAX1U (with default e-value parameters)
KmerSegmenter *kseg = new KmerSegmenter(seq, germline, THRESHOLD_NB_EXPECTED, 1);
if (kseg->isSegmented() && (!strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX12)))
if (kseg->isSegmented() && ((!strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX12))
|| !strcmp(germline->code.c_str(), PSEUDO_GERMLINE_MAX1U)))
{
reversed = kseg->isReverse();
......
!LAUNCH: ../../vidjil -e 10 -A -t 0 -g ../../germline -4 -i ../../data/chimera-fake-half.fa
# TODO: a more precise modeling should give a e-value computation that could make this work even with -e 1
$ The KmerSegmenter segments the six chimera reads on PSEUDO_MAX1U germline (-4)
1:yyy .* -> .* 6
$ The FineSegmenter gives the locus information (-4)
1:Unexpected .A-TRAV/.\?
1:Unexpected .G-TRGV/.\?
1:Unexpected .H-IGHV/.\?
1:Unexpected .\?/.B-TRBV
1:Unexpected .\?/.D-TRDV
1:Unexpected .\?/.K-IGKV
......@@ -297,18 +297,22 @@ void testProbability() {
TAP_TEST(germline.index->getIndexLoad() == .75, TEST_GET_INDEX_LOAD, "");
TAP_TEST(germline.index->getIndexLoad(AFFECT_NOT_UNKNOWN) == .75, TEST_GET_INDEX_LOAD, ".getIndexLoad with AFFECT_NOT_UNKNOWN");
TAP_TEST(germline.index->getIndexLoad(AFFECT_UNKNOWN) == .25, TEST_GET_INDEX_LOAD, ".getIndexLoad with AFFECT_UNKNOWN");
Sequence seq = {"to_segment", "to_segment", "TATCG", "", NULL};
KmerSegmenter kseg(seq, &germline);
KmerAffectAnalyser *kaa = kseg.getKmerAffectAnalyser();
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(3) == 0, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(2) == .75 * .75, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(1) == .75 * 2 * .25 + kaa->getProbabilityAtLeastOrAbove(2),
TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(0) == 1, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 3) == 0, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 2) == .75 * .75, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 1) == .75 * 2 * .25 + kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 2), TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_NOT_UNKNOWN, 0) == 1, TEST_PROBABILITY_SEGMENTATION, "");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_UNKNOWN, 3) == 0, TEST_PROBABILITY_SEGMENTATION, ".getProbabilityAtLeastOrAbove() with AFFECT_UNKOWN");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_UNKNOWN, 2) == .25 * .25, TEST_PROBABILITY_SEGMENTATION, ".getProbabilityAtLeastOrAbove() with AFFECT_UNKOWN");
TAP_TEST(kaa->getProbabilityAtLeastOrAbove(AFFECT_UNKNOWN, 0) == 1, TEST_PROBABILITY_SEGMENTATION, ".getProbabilityAtLeastOrAbove() with AFFECT_UNKOWN");
}
void testSegment() {
......
......@@ -165,6 +165,7 @@ void usage(char *progname, bool advanced)
<< " -I ignore k-mers common to different germline systems (experimental, must be used with -g, do not use)" << endl
<< " -1 use a unique index for all germline systems (experimental, must be used with -g, do not use)" << endl
<< " -2 try to detect unexpected recombinations (experimental, must be used with -g, do not use)" << endl
<< " -4 try to detect unexpected recombinations with translocations (experimental, must be used with -g, do not use)" << endl
<< " -! keep unsegmented reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example -!AX 20)" << endl
<< endl
......@@ -346,7 +347,8 @@ int main (int argc, char **argv)
bool multi_germline_incomplete = false;
bool multi_germline_mark = false;
bool multi_germline_one_index_per_germline = true;
bool multi_germline_unexpected_recombinations = false;
bool multi_germline_unexpected_recombinations_12 = false;
bool multi_germline_unexpected_recombinations_1U = false;
string multi_germline_file = DEFAULT_MULTIGERMLINE;
string forced_edges = "" ;
......@@ -367,7 +369,7 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:N:s:b:Sn:o:L%:y:z:uUK3E:t:")) != EOF)
while ((c = getopt(argc, argv, "A!x:X:hHaiI124g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:N:s:b:Sn:o:L%:y:z:uUK3E:t:")) != EOF)
switch (c)
{
......@@ -429,7 +431,11 @@ int main (int argc, char **argv)
break;
case '2':
multi_germline_unexpected_recombinations = true ;
multi_germline_unexpected_recombinations_12 = true ;
break;
case '4':
multi_germline_unexpected_recombinations_1U = true ;
break;
case 'G':
......@@ -795,14 +801,23 @@ int main (int argc, char **argv)
multigermline->build_with_one_index(seed, true);
}
if (multi_germline_unexpected_recombinations) {
if (multi_germline_unexpected_recombinations_12 || multi_germline_unexpected_recombinations_1U) {
if (!multigermline->index) {
multigermline->build_with_one_index(seed, false);
}
}
if (multi_germline_unexpected_recombinations_12) {
Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, trim_sequences);
pseudo->index = multigermline->index ;
multigermline->germlines.push_back(pseudo);
}
if (multi_germline_unexpected_recombinations_1U) {
Germline *pseudo_u = new Germline(PSEUDO_GERMLINE_MAX1U, 'y', -10, trim_sequences);
// TODO: there should be more up/downstream regions for the 'yyy' germline. And/or smaller seeds ?
pseudo_u->index = multigermline->index ;
multigermline->germlines.push_back(pseudo_u);
}
// Should come after the initialization of regular (and possibly pseudo) germlines
......
# Fake chimera sequences
# one half is first 60 bp of some V genes
# the other half is a 40 bp di-mer sequence
>TRAV1-1*01--cgcg
ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagata
cgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcg
>TRGV1*01--cgcg
tcttccaacttggaagggagaacgaagtcagtcaccaggctgactgggtcatctgctgaa
cgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcg
>IGHV1-18*01--cgcg
caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctcagtgaaggtc
cgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcgcg
>atat--TRBV1*01
atatatatatatatatatatatatatatatatatatatat
gatactggaattacccagacaccaaaatacctggtcacagcaatggggagtaaaaggaca
>atat--TRDV1*01
atatatatatatatatatatatatatatatatatatatat
gcccagaaggttactcaagcccagtcatcagtatccatgccagtgaggaaagcagtcacc
>atat--IGKV1-12*01
atatatatatatatatatatatatatatatatatatatat
gacatccagatgacccagtctccatcttccgtgtctgcatctgtaggagacagagtcacc
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment