Commit d567545c authored by Mikaël Salson's avatar Mikaël Salson

algo/: Remove delta_max, use delta_min only in FineSegmenter.

delta_max is now useless. It was use to make sure we didn't segment
something not robust enough. Now we use the e-value, which is a much
trustable value.

delta_min is useless to the KmerSegmenter (we can't have overlaps
with the first_max, last_max algorithms).
But it can still be useful to FineSegmenter, that's why we keep it.
parent 83167c9f
......@@ -3,7 +3,7 @@
#include <ctype.h>
void Germline::init(string _code, char _shortcut,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing)
{
code = _code ;
......@@ -20,24 +20,23 @@ void Germline::init(string _code, char _shortcut,
affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
delta_min = _delta_min ;
delta_max = _delta_max ;
}
Germline::Germline(string _code, char _shortcut,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
init(_code, _shortcut, _delta_min, max_indexing);
}
Germline::Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
init(_code, _shortcut, _delta_min, max_indexing);
f_reps_5.push_back(f_rep_5);
f_reps_4.push_back(f_rep_4);
......@@ -51,10 +50,10 @@ Germline::Germline(string _code, char _shortcut,
Germline::Germline(string _code, char _shortcut,
list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
init(_code, _shortcut, _delta_min, max_indexing);
f_reps_5 = _f_reps_5 ;
f_reps_4 = _f_reps_4 ;
......@@ -77,10 +76,10 @@ Germline::Germline(string _code, char _shortcut,
Germline::Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
init(_code, _shortcut, _delta_min, max_indexing);
rep_5 = _rep_5 ;
rep_4 = _rep_4 ;
......@@ -133,7 +132,7 @@ Germline::~Germline()
ostream &operator<<(ostream &out, const Germline &germline)
{
out << setw(4) << left << germline.code << right << " '" << germline.shortcut << "' "
<< setw(3) << germline.delta_min << "/" << setw(3) << germline.delta_max
<< setw(3) << germline.delta_min
<< " " << germline.index ;
if (germline.index) {
......@@ -183,34 +182,34 @@ void MultiGermline::add_germline(Germline *germline, string seed)
void MultiGermline::build_default_set(string path, int max_indexing)
{
// Should parse 'data/germlines.data'
add_germline(new Germline("TRA", 'A', path + "/TRAV.fa", "", path + "/TRAJ.fa", -10, 20, max_indexing), SEED_S13);
add_germline(new Germline("TRB", 'B', path + "/TRBV.fa", path + "/TRBD.fa", path + "/TRBJ.fa", 0, 80, max_indexing), SEED_S12);
add_germline(new Germline("TRG", 'G', path + "/TRGV.fa", "", path + "/TRGJ.fa", -10, 30, max_indexing), SEED_S10);
add_germline(new Germline("TRD", 'D', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRDJ.fa", 0, 80, max_indexing), SEED_S10);
add_germline(new Germline("IGH", 'H', path + "/IGHV.fa", path + "/IGHD.fa", path + "/IGHJ.fa", 0, 80, max_indexing), SEED_S12);
add_germline(new Germline("IGK", 'K', path + "/IGKV.fa", "", path + "/IGKJ.fa", -10, 20, max_indexing), SEED_S10);
add_germline(new Germline("IGL", 'L', path + "/IGLV.fa", "", path + "/IGLJ.fa", -10, 20, max_indexing), SEED_S10);
add_germline(new Germline("TRA", 'A', path + "/TRAV.fa", "", path + "/TRAJ.fa", -10, max_indexing), SEED_S13);
add_germline(new Germline("TRB", 'B', path + "/TRBV.fa", path + "/TRBD.fa", path + "/TRBJ.fa", 0, max_indexing), SEED_S12);
add_germline(new Germline("TRG", 'G', path + "/TRGV.fa", "", path + "/TRGJ.fa", -10, max_indexing), SEED_S10);
add_germline(new Germline("TRD", 'D', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRDJ.fa", 0, max_indexing), SEED_S10);
add_germline(new Germline("IGH", 'H', path + "/IGHV.fa", path + "/IGHD.fa", path + "/IGHJ.fa", 0, max_indexing), SEED_S12);
add_germline(new Germline("IGK", 'K', path + "/IGKV.fa", "", path + "/IGKJ.fa", -10, max_indexing), SEED_S10);
add_germline(new Germline("IGL", 'L', path + "/IGLV.fa", "", path + "/IGLJ.fa", -10, max_indexing), SEED_S10);
}
void MultiGermline::build_incomplete_set(string path, int max_indexing)
{
// Should parse 'data/germlines.data'
// TRA+D
add_germline(new Germline("TRA+D", 'a', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRAJ.fa", -10, 80, max_indexing), SEED_S13);
add_germline(new Germline("TRA+D", 'a', path + "/TRDD_upstream.fa", "", path + "/TRAJ.fa", -10, 80, max_indexing), SEED_S13);
add_germline(new Germline("TRA+D", 'a', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRAJ.fa", -10, max_indexing), SEED_S13);
add_germline(new Germline("TRA+D", 'a', path + "/TRDD_upstream.fa", "", path + "/TRAJ.fa", -10, max_indexing), SEED_S13);
// DD-JD + DD2-DD3
add_germline(new Germline("TRD+", 'd', path + "/TRDD2_upstream.fa", "", path + "/TRDJ.fa", -10, 60, max_indexing), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDV.fa", "", path + "/TRDD3_downstream.fa", -10, 50, max_indexing), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDD2_upstream.fa", "", path + "/TRDD3_downstream.fa", -10, 50, max_indexing), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDD2_upstream.fa", "", path + "/TRDJ.fa", -10, max_indexing), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDV.fa", "", path + "/TRDD3_downstream.fa", -10, max_indexing), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDD2_upstream.fa", "", path + "/TRDD3_downstream.fa", -10, max_indexing), SEED_9);
// DH-JH
add_germline(new Germline("IGH+", 'h', path + "/IGHD_upstream.fa", "", path + "/IGHJ.fa", -10, 20, max_indexing), SEED_S12);
add_germline(new Germline("IGH+", 'h', path + "/IGHD_upstream.fa", "", path + "/IGHJ.fa", -10, max_indexing), SEED_S12);
// IGK: KDE, INTRON
add_germline(new Germline("IGK+", 'k', path + "/IGK-INTRON.fa", "", path + "/IGK-KDE.fa", -10, 80, max_indexing), SEED_S10);
add_germline(new Germline("IGK+", 'k', path + "/IGKV.fa", "", path + "/IGK-KDE.fa", -10, 80, max_indexing), SEED_S10);
add_germline(new Germline("IGK+", 'k', path + "/IGK-INTRON.fa", "", path + "/IGK-KDE.fa", -10, max_indexing), SEED_S10);
add_germline(new Germline("IGK+", 'k', path + "/IGKV.fa", "", path + "/IGK-KDE.fa", -10, max_indexing), SEED_S10);
}
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
......
......@@ -18,7 +18,7 @@ class Germline {
int max_indexing;
void init(string _code, char _shortcut,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing);
public:
......@@ -26,29 +26,26 @@ class Germline {
* @param delta_min: the minimal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param max_indexing: maximal length of the sequence to be indexed (0: all)
*/
Germline(string _code, char _shortcut,
list <string> f_rep_5, list <string> f_rep_4, list <string> f_rep_3,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing=0);
Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing=0);
Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing=0);
Germline(string _code, char _shortcut,
int _delta_min, int _delta_max,
int _delta_min,
int max_indexing=0);
~Germline();
......@@ -79,7 +76,6 @@ class Germline {
IKmerStore<KmerAffect> *index;
int delta_min;
int delta_max;
};
......
......@@ -299,7 +299,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
} // endif Pseudo-germline
computeSegmentation(strand, before, after, germline->delta_min, germline->delta_max, threshold, multiplier);
computeSegmentation(strand, before, after, threshold, multiplier);
}
KmerSegmenter::~KmerSegmenter() {
......@@ -378,7 +378,6 @@ KmerMultiSegmenter::~KmerMultiSegmenter() {
}
void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffect after,
int delta_min, int delta_max,
double threshold, int multiplier) {
// Try to segment, computing 'Vend' and 'Jstart'
// If not segmented, put the cause of unsegmentation in 'because'
......@@ -434,21 +433,6 @@ void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffec
Jstart = tmp;
}
// Now we check the delta between Vend and right
if (Jstart - Vend < delta_min)
{
because = UNSEG_BAD_DELTA_MIN ;
return ;
}
if (Jstart - Vend > delta_max)
{
because = UNSEG_BAD_DELTA_MAX ;
return ;
}
assert(because == NOT_PROCESSED);
// Yes, it is segmented
......@@ -722,7 +706,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
}
segmented = (Vend != (int) string::npos) && (Jstart != (int) string::npos) &&
(Jstart - Vend >= germline->delta_min) && (Jstart - Vend <= germline->delta_max);
(Jstart - Vend >= germline->delta_min);
if (!segmented)
{
......@@ -734,11 +718,6 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c)
because = UNSEG_BAD_DELTA_MIN ;
}
if (Jstart - Vend > germline->delta_max)
{
because = UNSEG_BAD_DELTA_MAX ;
}
if (Vend == (int) string::npos)
{
because = UNSEG_TOO_FEW_V ;
......
......@@ -43,7 +43,7 @@ enum SEGMENTED { NOT_PROCESSED,
SEG_PLUS, SEG_MINUS,
UNSEG_TOO_SHORT, UNSEG_STRAND_NOT_CONSISTENT,
UNSEG_TOO_FEW_ZERO, UNSEG_TOO_FEW_V, UNSEG_TOO_FEW_J,
UNSEG_BAD_DELTA_MIN, UNSEG_BAD_DELTA_MAX, UNSEG_AMBIGUOUS,
UNSEG_BAD_DELTA_MIN, UNSEG_AMBIGUOUS,
UNSEG_TOO_SHORT_FOR_WINDOW,
STATS_SIZE } ;
......@@ -52,7 +52,7 @@ const char* const segmented_mesg[] = { "?",
"SEG_+", "SEG_-",
"UNSEG too short", "UNSEG strand",
"UNSEG too few (0)", "UNSEG too few V", "UNSEG too few J",
"UNSEG < delta_min", "UNSEG > delta_max", "UNSEG ambiguous",
"UNSEG < delta_min", "UNSEG ambiguous",
"UNSEG too short w",
} ;
......@@ -201,7 +201,6 @@ class KmerSegmenter : public Segmenter
private:
void computeSegmentation(int strand, KmerAffect left, KmerAffect right,
int delta_min, int delta_max,
double threshold, int multiplier);
};
......
......@@ -20,7 +20,7 @@ void testFineSegment()
Fasta data("../../data/Stanford_S22.fasta", 1, " ");
Germline *germline ;
germline = new Germline("IGH", 'G', seqV, seqD, seqJ, 0, 50);
germline = new Germline("IGH", 'G', seqV, seqD, seqJ, 0);
germline->new_index("####");
Sequence seq = data.read(2);
......@@ -71,11 +71,11 @@ void testSegmentOverlap()
Fasta data("../../data/bug-segment-overlap.fa", 1, " ");
Germline *germline1 ;
germline1 = new Germline("TRG", 'G', seqV, seqV, seqJ, -50, 50);
germline1 = new Germline("TRG", 'G', seqV, seqV, seqJ, -50);
germline1->new_index("##########");
Germline *germline2 ;
germline2 = new Germline("TRG2", 'G', seqV, seqV, seqJ, -50, 50);
germline2 = new Germline("TRG2", 'G', seqV, seqV, seqJ, -50);
germline2->new_index("##########");
for (int i = 0; i < data.size(); i++) {
......@@ -104,7 +104,7 @@ void testSegmentationCause() {
Fasta data("../../data/segmentation.fasta", 1, " ");
Germline *germline ;
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, 0, 10);
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, 0);
germline->new_index("##########");
int nb_checked = 0;
......@@ -218,7 +218,7 @@ void testExtractor() {
OnlineFasta data("../../data/segmentation.fasta", 1, " ");
Germline *germline ;
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, 0, 10, 0);
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, 0, 0);
germline->new_index("##########");
MultiGermline *multi ;
......@@ -298,7 +298,7 @@ void testProbability() {
V.add(v);
J.add(j);
}
Germline germline("Test", 'T', V, Fasta(), J, 0, 30);
Germline germline("Test", 'T', V, Fasta(), J, 0);
germline.new_index("####");
......
......@@ -7,7 +7,7 @@ void testWSAdd() {
map<string, string> labels;
WindowsStorage ws(labels);
Sequence seq = {"label", "l", "GATACATTAGACAGCT", "", NULL};
Germline germline("Test", 't', "../../data/small_V.fa", "", "../../data/small_J.fa", -10, 50);
Germline germline("Test", 't', "../../data/small_V.fa", "", "../../data/small_J.fa", -10);
TAP_TEST(ws.size() == 0, TEST_WS_SIZE_NONE, "");
......@@ -49,7 +49,7 @@ void testWSAdd() {
TAP_TEST(it->label_full == "other", TEST_WS_GET_READS, "");
TAP_TEST(it->sequence == "TAAGATTAGCCACGGACT", TEST_WS_GET_READS, "");
Germline germline2("Other test", 'o', "../../data/small_V.fa", "", "../../data/small_J.fa", -20, 30);
Germline germline2("Other test", 'o', "../../data/small_V.fa", "", "../../data/small_J.fa", -20);
// Insert a sequence from another germline 2 times
for (int i = 0; i < 2; i++) {
ws.add("CATT", seq, SEG_MINUS, &germline2);
......@@ -59,7 +59,7 @@ void testWSAdd() {
TAP_TEST(ws.getGermline("ATTAG") == &germline,TEST_WS_GET_GERMLINE, "");
TAP_TEST(ws.getGermline("CATT") == &germline2,TEST_WS_GET_GERMLINE, "");
Germline germline3("Another test", 'a', "../../data/small_V.fa", "", "../../data/small_J.fa", -52, 12);
Germline germline3("Another test", 'a', "../../data/small_V.fa", "", "../../data/small_J.fa", -52);
// Insert a sequence from another germline 6 times
for (int i = 0; i < 6; i++) {
ws.add("ATAGCAT", seq, SEG_MINUS, &germline3);
......@@ -114,7 +114,7 @@ void testWSAddWithLimit() {
ws.setBinParameters(1, 20);
Sequence seq = {"label", "l", "GATACATTAGACAGCT", "", NULL};
Sequence seq_long = {"label", "l", "GATACATTAGACAGCTTATATATATATTTATAT", "", NULL};
Germline germline("Test", 't', "../../data/small_V.fa", "", "../../data/small_J.fa", -10, 50);
Germline germline("Test", 't', "../../data/small_V.fa", "", "../../data/small_J.fa", -10);
ws.add("ATTAG", seq, SEG_PLUS, &germline);
ws.add("ATTAG", seq, SEG_PLUS, &germline);
......
......@@ -102,10 +102,8 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_SEED ""
#define DEFAULT_DELTA_MIN -10
#define DEFAULT_DELTA_MAX 20
#define DEFAULT_DELTA_MIN_D 0
#define DEFAULT_DELTA_MAX_D 80
#define DEFAULT_MAX_AUDITIONED 2000
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
......@@ -179,7 +177,6 @@ void usage(char *progname, bool advanced)
<< " (using -k option is equivalent to set with -s a contiguous seed with only '#' characters)" << endl
#endif
<< " -m <int> minimal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MIN << ") (default with -D: " << DEFAULT_DELTA_MIN_D << ")" << endl
<< " -M <int> maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -D: " << DEFAULT_DELTA_MAX_D << ")" << endl
<< " -w <int> w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")" << endl
<< " -e <float> maximal e-value for determining if a segmentation can be trusted (default: " << THRESHOLD_NB_EXPECTED << ")" << endl
<< " -t <int> trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt (default: " << DEFAULT_TRIM << ") (0: no trim)" << endl
......@@ -334,7 +331,6 @@ int main (int argc, char **argv)
// Admissible delta between left and right segmentation points
int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
int delta_max = DEFAULT_DELTA_MAX ; // Fine
int trim_sequences = DEFAULT_TRIM;
bool output_sequences_by_cluster = false;
......@@ -368,7 +364,7 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:M:N:s:b:Sn:o:L%:y:z:uUK3E:t:")) != EOF)
while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:N:s:b:Sn:o:L%:y:z:uUK3E:t:")) != EOF)
switch (c)
{
......@@ -404,7 +400,6 @@ int main (int argc, char **argv)
case 'D':
f_reps_D.push_back(optarg);
delta_min = DEFAULT_DELTA_MIN_D ;
delta_max = DEFAULT_DELTA_MAX_D ;
break;
case 'J':
......@@ -445,7 +440,6 @@ int main (int argc, char **argv)
{
f_reps_D.push_back(putative_f_rep_D.c_str()) ;
delta_min = DEFAULT_DELTA_MIN_D ;
delta_max = DEFAULT_DELTA_MAX_D ;
}
}
f_reps_J.push_back((germline_system + "J.fa").c_str()) ;
......@@ -479,10 +473,6 @@ int main (int argc, char **argv)
delta_min = atoi(optarg);
break;
case 'M':
delta_max = atoi(optarg);
break;
case '!':
keep_unsegmented_as_clone = true;
break;
......@@ -788,7 +778,7 @@ int main (int argc, char **argv)
Germline *germline;
germline = new Germline(germline_system, 'X',
f_reps_V, f_reps_D, f_reps_J,
delta_min, delta_max, trim_sequences);
delta_min, trim_sequences);
germline->new_index(seed);
......@@ -807,7 +797,7 @@ int main (int argc, char **argv)
multigermline->build_with_one_index(seed, false);
}
Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, 80, trim_sequences);
Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, trim_sequences);
pseudo->index = multigermline->index ;
multigermline->germlines.push_back(pseudo);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment