Commit 6184ddb5 authored by Mathieu Giraud's avatar Mathieu Giraud

vidjil: new experimental '-t' option

parents cdbcef54 fcc26944
......@@ -3,11 +3,13 @@
#include <ctype.h>
void Germline::init(string _code, char _shortcut,
int _delta_min, int _delta_max)
int _delta_min, int _delta_max,
int max_indexing)
{
code = _code ;
shortcut = _shortcut ;
index = 0 ;
this->max_indexing = max_indexing;
affect_5 = "V" ;
affect_4 = "" ;
......@@ -26,17 +28,19 @@ void Germline::init(string _code, char _shortcut,
Germline::Germline(string _code, char _shortcut,
int _delta_min, int _delta_max)
int _delta_min, int _delta_max,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max);
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
}
Germline::Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
int _delta_min, int _delta_max)
int _delta_min, int _delta_max,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max);
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
f_reps_5.push_back(f_rep_5);
f_reps_4.push_back(f_rep_4);
......@@ -50,9 +54,10 @@ Germline::Germline(string _code, char _shortcut,
Germline::Germline(string _code, char _shortcut,
list <string> _f_reps_5, list <string> _f_reps_4, list <string> _f_reps_3,
int _delta_min, int _delta_max)
int _delta_min, int _delta_max,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max);
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
f_reps_5 = _f_reps_5 ;
f_reps_4 = _f_reps_4 ;
......@@ -75,9 +80,10 @@ Germline::Germline(string _code, char _shortcut,
Germline::Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
int _delta_min, int _delta_max)
int _delta_min, int _delta_max,
int max_indexing)
{
init(_code, _shortcut, _delta_min, _delta_max);
init(_code, _shortcut, _delta_min, _delta_max, max_indexing);
rep_5 = _rep_5 ;
rep_4 = _rep_4 ;
......@@ -101,23 +107,22 @@ void Germline::set_index(IKmerStore<KmerAffect> *_index)
void Germline::update_index(IKmerStore<KmerAffect> *_index)
{
if (!_index) _index = index ;
_index->insert(rep_5, affect_5);
_index->insert(rep_5, affect_5, max_indexing);
if (affect_4.size())
_index->insert(rep_4, affect_4);
_index->insert(rep_3, affect_3);
_index->insert(rep_3, affect_3, -max_indexing);
}
void Germline::mark_as_ambiguous(Germline *other)
{
index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL);
index->insert(other->rep_5, AFFECT_AMBIGUOUS_SYMBOL, max_indexing);
if (other->affect_4.size())
index->insert(other->rep_4, AFFECT_AMBIGUOUS_SYMBOL);
index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL);
index->insert(other->rep_3, AFFECT_AMBIGUOUS_SYMBOL, -max_indexing);
}
......@@ -178,36 +183,36 @@ void MultiGermline::add_germline(Germline *germline, string seed)
germlines.push_back(germline);
}
void MultiGermline::build_default_set(string path)
void MultiGermline::build_default_set(string path, int max_indexing)
{
// Should parse 'data/germlines.data'
add_germline(new Germline("TRA", 'A', path + "/TRAV.fa", "", path + "/TRAJ.fa", -10, 20), SEED_S13);
add_germline(new Germline("TRB", 'B', path + "/TRBV.fa", path + "/TRBD.fa", path + "/TRBJ.fa", 0, 80), SEED_S12);
add_germline(new Germline("TRG", 'G', path + "/TRGV.fa", "", path + "/TRGJ.fa", -10, 30), SEED_S10);
add_germline(new Germline("TRD", 'D', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRDJ.fa", 0, 80), SEED_S10);
add_germline(new Germline("IGH", 'H', path + "/IGHV.fa", path + "/IGHD.fa", path + "/IGHJ.fa", 0, 80), SEED_S12);
add_germline(new Germline("IGK", 'K', path + "/IGKV.fa", "", path + "/IGKJ.fa", -10, 20), SEED_S10);
add_germline(new Germline("IGL", 'L', path + "/IGLV.fa", "", path + "/IGLJ.fa", -10, 20), SEED_S10);
add_germline(new Germline("TRA", 'A', path + "/TRAV.fa", "", path + "/TRAJ.fa", -10, 20, max_indexing), SEED_S13);
add_germline(new Germline("TRB", 'B', path + "/TRBV.fa", path + "/TRBD.fa", path + "/TRBJ.fa", 0, 80, max_indexing), SEED_S12);
add_germline(new Germline("TRG", 'G', path + "/TRGV.fa", "", path + "/TRGJ.fa", -10, 30, max_indexing), SEED_S10);
add_germline(new Germline("TRD", 'D', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRDJ.fa", 0, 80, max_indexing), SEED_S10);
add_germline(new Germline("IGH", 'H', path + "/IGHV.fa", path + "/IGHD.fa", path + "/IGHJ.fa", 0, 80, max_indexing), SEED_S12);
add_germline(new Germline("IGK", 'K', path + "/IGKV.fa", "", path + "/IGKJ.fa", -10, 20, max_indexing), SEED_S10);
add_germline(new Germline("IGL", 'L', path + "/IGLV.fa", "", path + "/IGLJ.fa", -10, 20, max_indexing), SEED_S10);
}
void MultiGermline::build_incomplete_set(string path)
void MultiGermline::build_incomplete_set(string path, int max_indexing)
{
// Should parse 'data/germlines.data'
// VdJa
add_germline(new Germline("VdJa", 'a', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRAJ.fa", -10, 80), SEED_S13);
add_germline(new Germline("VdJa", 'a', path + "/TRDV.fa", path + "/TRDD.fa", path + "/TRAJ.fa", -10, 80, max_indexing), SEED_S13);
// DD2-DD3
add_germline(new Germline("TRD+", 'd', path + "/TRDD2-01.fa", "", path + "/TRDJ.fa", -10, 60), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDV.fa", "", path + "/TRDD3-01.fa", -10, 50), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDD2-01.fa", "", path + "/TRDD3-01.fa", -10, 50), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDD2-01.fa", "", path + "/TRDJ.fa", -10, 60, max_indexing), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDV.fa", "", path + "/TRDD3-01.fa", -10, 50, max_indexing), SEED_9);
add_germline(new Germline("TRD+", 'd', path + "/TRDD2-01.fa", "", path + "/TRDD3-01.fa", -10, 50, max_indexing), SEED_9);
// DH-JH
add_germline(new Germline("IGH+", 'h', path + "/IGHD.fa", "", path + "/IGHJ.fa", -10, 20), SEED_S12);
add_germline(new Germline("IGH+", 'h', path + "/IGHD.fa", "", path + "/IGHJ.fa", -10, 20, max_indexing), SEED_S12);
// IGK: KDE, INTRON
add_germline(new Germline("IGK+", 'k', path + "/IGK-INTRON.fa", "", path + "/IGK-KDE.fa", -10, 80), SEED_S10);
add_germline(new Germline("IGK+", 'k', path + "/IGKV.fa", "", path + "/IGK-KDE.fa", -10, 80), SEED_S10);
add_germline(new Germline("IGK+", 'k', path + "/IGK-INTRON.fa", "", path + "/IGK-KDE.fa", -10, 80, max_indexing), SEED_S10);
add_germline(new Germline("IGK+", 'k', path + "/IGKV.fa", "", path + "/IGK-KDE.fa", -10, 80, max_indexing), SEED_S10);
}
/* if 'one_index_per_germline' was not set, this should be called once all germlines have been loaded */
......
......@@ -15,8 +15,11 @@ using namespace std;
class Germline {
private:
int max_indexing;
void init(string _code, char _shortcut,
int _delta_min, int _delta_max);
int _delta_min, int _delta_max,
int max_indexing);
public:
/*
......@@ -26,22 +29,27 @@ class Germline {
* @param delta_min: the maximal distance between the right bound and the left bound
* so that the segmentation is accepted
* (left bound: end of V, right bound : start of J)
* @param max_indexing: maximal length of the sequence to be indexed (0: all)
*/
Germline(string _code, char _shortcut,
list <string> f_rep_5, list <string> f_rep_4, list <string> f_rep_3,
int _delta_min, int _delta_max);
int _delta_min, int _delta_max,
int max_indexing=0);
Germline(string _code, char _shortcut,
string f_rep_5, string f_rep_4, string f_rep_3,
int _delta_min, int _delta_max);
int _delta_min, int _delta_max,
int max_indexing=0);
Germline(string _code, char _shortcut,
Fasta _rep_5, Fasta _rep_4, Fasta _rep_3,
int _delta_min, int _delta_max);
int _delta_min, int _delta_max,
int max_indexing=0);
Germline(string _code, char _shortcut,
int _delta_min, int _delta_max);
int _delta_min, int _delta_max,
int max_indexing=0);
~Germline();
......@@ -96,8 +104,8 @@ class MultiGermline {
void insert(Germline *germline);
void add_germline(Germline *germline, string seed);
void build_default_set(string path);
void build_incomplete_set(string path);
void build_default_set(string path, int max_indexing);
void build_incomplete_set(string path, int max_indexing);
// Creates and update an unique index for all the germlines
// If 'set_index' is set, set this index as the index for all germlines
......
......@@ -56,23 +56,27 @@ public:
* @param label: label that must be associated to the given files
* @post All the sequences in the FASTA files have been indexed, and the label is stored in the list of labels
*/
void insert(Fasta& input, const string& label="");
void insert(Fasta& input, const string& label="", int keep_only = 0);
/**
* @param input: A list of FASTA files
* @param label: label that must be associated to the given files
* @post All the sequences in the FASTA files have been indexed, and the label is stored in the list of labels
*/
void insert(list<Fasta>& input, const string& label="");
void insert(list<Fasta>& input, const string& label="", int keep_only = 0);
/**
* @param input: A sequence to be cut in k-mers
* @param label: label that must be associated to the given files
* @param keep_only: if > 0 will keep at most the last keep_only nucleotides
* of the sequence. if < 0 will keep at most the first
* keep_only nucleotides of the sequence. if == 0,
* will keep all the sequence.
* @post All the k-mers in the sequence have been indexed.
*/
void insert(const seqtype &sequence,
const string &label,
bool ignore_extended_nucleotides=true);
bool ignore_extended_nucleotides=true, int keep_only = 0);
/**
* @param word: a k-mer
......@@ -187,17 +191,19 @@ IKmerStore<T>::~IKmerStore(){}
template<class T>
void IKmerStore<T>::insert(list<Fasta>& input,
const string &label){
const string &label,
int keep_only){
for(list<Fasta>::iterator it = input.begin() ; it != input.end() ; it++){
insert(*it, label);
insert(*it, label, keep_only);
}
}
template<class T>
void IKmerStore<T>::insert(Fasta& input,
const string &label){
const string &label,
int keep_only){
for (int r = 0; r < input.size(); r++) {
insert(input.sequence(r), label);
insert(input.sequence(r), label, true, keep_only);
}
labels.push_back(make_pair(T(label, 1), label)) ;
......@@ -206,8 +212,16 @@ void IKmerStore<T>::insert(Fasta& input,
template<class T>
void IKmerStore<T>::insert(const seqtype &sequence,
const string &label,
bool ignore_extended_nucleotides){
for(size_t i = 0 ; i + s < sequence.length() + 1 ; i++) {
bool ignore_extended_nucleotides,
int keep_only){
size_t start_indexing = 0;
size_t end_indexing = sequence.length();
if (keep_only > 0 && sequence.length() > (size_t)keep_only) {
start_indexing = sequence.length() - keep_only;
} else if (keep_only < 0 && sequence.length() > (size_t) -keep_only) {
end_indexing = -keep_only;
}
for(size_t i = start_indexing ; i + s < end_indexing + 1 ; i++) {
seqtype substr = sequence.substr(i, s);
seqtype kmer = spaced(substr, seed);
......
!LAUNCH: ../../vidjil -g ../../germline -2 -i ../../data/chimera-fake.fa
!LAUNCH: ../../vidjil -t 0 -g ../../germline -2 -i ../../data/chimera-fake.fa
$ Segment all reads
1:junction detected in 3 reads
......
!LAUNCH: (cd ../.. ; $LAUNCHER ./vidjil -c germlines -s '######-######' data/Stanford_S22.fasta)
!LAUNCH: (cd ../.. ; $LAUNCHER ./vidjil -c germlines -t 100 -s '######-######' data/Stanford_S22.fasta)
$ number of reads and kmers
1:13153 reads, 3020179 kmers
$ k-mers, IGHV
1:13148 .* 1832527 .*IGHV
1:13115 .* 1223118 .*IGHV
$ k-mers, IGHJ
1:5 .* 407486 .*IGHJ
1:38 .* 435867 .*IGHJ
$ k-mers, ambiguous
1:98857 .*\\?
1:47370 .*\\?
$ k-mers, unknown
1:618273 .*_
1:1251710 .*_
......@@ -213,7 +213,7 @@ void testExtractor() {
OnlineFasta data("../../data/segmentation.fasta", 1, " ");
Germline *germline ;
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, 0, 10);
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, 0, 10, 0);
germline->new_index("##########");
MultiGermline *multi ;
......
......@@ -116,6 +116,8 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define DEFAULT_CLUSTER_COST Cluster
#define DEFAULT_SEGMENT_COST VDJ
#define DEFAULT_TRIM 0
// error
#define ERROR_STRING "[error] "
......@@ -180,6 +182,7 @@ void usage(char *progname, bool advanced)
<< " -M <int> maximal admissible delta between last V and first J k-mer (default: " << DEFAULT_DELTA_MAX << ") (default with -D: " << DEFAULT_DELTA_MAX_D << ")" << endl
<< " -w <int> w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")" << endl
<< " -e <float> maximal e-value for determining if a segmentation can be trusted (default: " << THRESHOLD_NB_EXPECTED << ")" << endl
<< " -t <int> trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt (default: " << DEFAULT_TRIM << ") (0: no trim)" << endl
<< endl
<< "Labeled windows (these windows will be kept even if -r/-% thresholds are not reached)" << endl
......@@ -316,6 +319,7 @@ int main (int argc, char **argv)
// Admissible delta between left and right segmentation points
int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine
int delta_max = DEFAULT_DELTA_MAX ; // Fine
int trim_sequences = DEFAULT_TRIM;
bool output_sequences_by_cluster = false;
bool output_segmented = false;
......@@ -348,7 +352,7 @@ int main (int argc, char **argv)
//$$ options: getopt
while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:M:N:s:b:Sn:o:L%:y:z:uUK3E:")) != EOF)
while ((c = getopt(argc, argv, "A!x:X:hHaiI12g:G:V:D:J:k:r:vw:e:C:f:W:l:Fc:m:M:N:s:b:Sn:o:L%:y:z:uUK3E:t:")) != EOF)
switch (c)
{
......@@ -485,6 +489,10 @@ int main (int argc, char **argv)
output_sequences_by_cluster = true;
break;
case 't':
trim_sequences = atoi(optarg);
break;
case 'v':
verbose += 1 ;
break;
......@@ -772,7 +780,7 @@ int main (int argc, char **argv)
if (multi_germline)
{
multigermline->build_default_set(multi_germline_file);
multigermline->build_default_set(multi_germline_file, trim_sequences);
}
else
{
......@@ -780,7 +788,7 @@ int main (int argc, char **argv)
Germline *germline;
germline = new Germline(germline_system, 'X',
f_reps_V, f_reps_D, f_reps_J,
delta_min, delta_max);
delta_min, delta_max, trim_sequences);
germline->new_index(seed);
......@@ -799,7 +807,7 @@ int main (int argc, char **argv)
multigermline->build_with_one_index(seed, false);
}
Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, 80);
Germline *pseudo = new Germline(PSEUDO_GERMLINE_MAX12, 'x', -10, 80, trim_sequences);
pseudo->index = multigermline->index ;
multigermline->germlines.push_back(pseudo);
}
......@@ -807,7 +815,7 @@ int main (int argc, char **argv)
// Should come after the initialization of regular (and possibly pseudo) germlines
if (multi_germline_incomplete) {
multigermline->one_index_per_germline = true; // Starting from now, creates new indexes
multigermline->build_incomplete_set(multi_germline_file);
multigermline->build_incomplete_set(multi_germline_file, trim_sequences);
}
if (multi_germline_mark)
......
......@@ -5,6 +5,7 @@ This changelog concerns the algorithmic part (C++) of Vidjil.
* New default e-value threshold (-e 1.0), improving the segmentation heuristic
* New default for window length (-w 50), even with -D or with -g, streamlining the window handling
* Better multi-germline analysis (-g), selecting the best locus on the e-value (core/segment.cpp)
* New experimental trim option (-t), considering only the relevant ends of the germline genes (core/kmerstore.h)
* Streamlined unsegmentation causes, including 'too short for w(indow)'
* Updated main and debug ouptut
* Updated help (algo.org, and new locus.org)
......
......@@ -161,6 +161,7 @@ Window prediction
-M <int> maximal admissible delta between last V and first J k-mer (default: 20) (default with -D: 80)
-w <int> w-mer size used for the length of the extracted window (default: 50)
-e <float> maximal e-value for determining if a segmentation can be trusted (default: 'all', no limit)
-t <int> trim V and J genes (resp. 5' and 3' regions) to keep at most <int> nt (default: 0) (0: no trim)
#+END_EXAMPLE
The =-s=, =-k=, =-m= and =-M= options are the options of the seed-based heuristic. A detailed
......@@ -189,6 +190,12 @@ The default value is 1.0, but values such as 1000, 1e-3 or even less can be used
to have a more or less permissive segmentation.
The threshold can be disabled with =-e all=.
The =-t= option sets the maximal number of nucleotides that will be indexed in
V genes (the 3' end) or in J genes (the 5' end). This reduces the load of the
indexes, giving more precise window estimation and e-value computation.
This option is currently not set, it will be set by default in a next release.
Using =-t 100= is generally safe.
** Threshold on clone output
The following options control how many clones are output and analyzed.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment