Commit 166ffae1 authored by Marc Duez's avatar Marc Duez
Browse files
parents 553e6716 a9dd3ea5
......@@ -346,6 +346,7 @@ IKmerStore<T> *KmerStoreFactory::createIndex(string seed, bool revcomp) {
try{
index = new ArrayKmerStore<T>(seed, revcomp);
}catch(exception e){
cout << " (using a MapKmer to fit into memory)" << endl;
index = new MapKmerStore<T>(seed, revcomp);
}
return index;
......
!LAUNCH: ../../vidjil -w 100 -G ../../germline/IGH -d ../../data/Stanford_S22.fasta
!LAUNCH: ../../vidjil -s '#####-#####' -w 100 -G ../../germline/IGH -d ../../data/Stanford_S22.fasta
!LOG: stanford-w100.log
$ Parses IGHV.fa germline
......@@ -11,10 +11,10 @@ $ Parses germline/IGHJ.fa
1: 701 bp in 13 sequences
$ Find the good number of "too short sequences" for windows of size 100
1: SEG, but no window -> 2046
1: SEG, but no window -> 2052
$ Find the good number of segmented sequences (including "too short sequences")
1: segmented 11220 reads .85.3%.
1: segmented 11216 reads .85.3%.
$ Find the good number of windows of size 100 in Stanford S22
1: found 8233 100-windows in 9174 segments
1: found 8224 100-windows in 9164 segments
......@@ -11,5 +11,5 @@ $ Parses germline/IGHJ.fa
1: 701 bp in 13 sequences
$ Find the good number of windows in Stanford S22
1: found 9327 60-windows in 11220 segments
1: found 10413 60-windows in 12532 segments
......@@ -68,7 +68,7 @@
#define MIN_READS_WINDOW 10
#define MIN_READS_CLONE 10
#define MAX_CLONES 20
#define RATIO_READS_CLONE 0.1
#define RATIO_READS_CLONE 0.0
#define COMMAND_WINDOWS "windows"
#define COMMAND_ANALYSIS "clones"
......@@ -102,7 +102,7 @@ enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
#define DEFAULT_MAX_AUDITIONED 2000
#define DEFAULT_RATIO_REPRESENTATIVE 0.5
#define DEFAULT_MIN_COVER_REPRESENTATIVE 5
#define MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE 1.0
#define DEFAULT_EPSILON 0
#define DEFAULT_MINPTS 10
......@@ -110,6 +110,9 @@ enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ;
#define DEFAULT_CLUSTER_COST Cluster
#define DEFAULT_SEGMENT_COST VDJ
// warn
#define WARN_RATIO_SEGMENTED 0.4
// display
#define WIDTH_NB_READS 7
#define WIDTH_NB_CLONES 3
......@@ -188,15 +191,15 @@ void usage(char *progname)
<< " -p <string> prefix output filenames by the specified string" << endl
<< " -a output all sequences by cluster (" << SEQUENCES_FILENAME << ")" << endl
<< " -x no detailed analysis of each cluster" << endl
<< " -x do not compute representative sequences" << endl
<< " -v verbose mode" << endl
<< endl
<< endl
<< "Examples (see doc/README)" << endl
<< " " << progname << " -G germline/IGH -d data/Stanford_S22.fasta" << endl
<< " " << progname << " -c clones -G germline/IGH -r 1 -R 1 -% 0 -d data/Stanford_S22.fasta" << endl
<< " " << progname << " -c segment -G germline/IGH -d data/Stanford_S22.fasta" << endl
<< " " << progname << " -G germline/IGH -d data/Stanford_S22.fasta" << endl
<< " " << progname << " -c clones -G germline/IGH -r 5 -R 5 -d data/Stanford_S22.fasta" << endl
<< " " << progname << " -c segment -G germline/IGH -d data/Stanford_S22.fasta" << endl
;
exit(1);
}
......@@ -245,7 +248,6 @@ int main (int argc, char **argv)
float ratio_reads_clone = RATIO_READS_CLONE;
// int average_deletion = 4; // Average number of deletion in V or J
size_t min_cover_representative = DEFAULT_MIN_COVER_REPRESENTATIVE;
float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE;
unsigned int max_auditionned = DEFAULT_MAX_AUDITIONED;
......@@ -454,7 +456,7 @@ int main (int argc, char **argv)
//$$ options: post-processing+display
size_t min_cover_representative = (size_t) (MIN_COVER_REPRESENTATIVE_RATIO_MIN_READS_CLONE * min_reads_clone) ;
// Default seeds
......@@ -633,12 +635,20 @@ int main (int argc, char **argv)
// nb_segmented is the main denominator for the following (but will be normalized)
int nb_segmented = we.getNbSegmented(TOTAL_SEG_AND_WINDOW);
float ratio_segmented = 100 * (float) nb_segmented / nb_total_reads ;
cout << " ==> found " << windowsStorage->size() << " " << w << "-windows"
<< " in " << nb_segmented << " segments"
<< " (" << setprecision(3) << 100 * (float) nb_segmented / nb_total_reads << "%)"
<< " (" << setprecision(3) << ratio_segmented << "%)"
<< " inside " << nb_total_reads << " sequences" << endl ;
// warn if there are too few segmented sequences
if (ratio_segmented < WARN_RATIO_SEGMENTED)
{
cout << " ! There are not so many CDR3 windows found in this set of reads." << endl ;
cout << " ! If this is unexpected, check the germline (-G) and try to change seed parameters (-k)." << endl ;
}
cout << " # av. length" << endl ;
for (int i=0; i<STATS_SIZE; i++)
......@@ -775,6 +785,7 @@ int main (int argc, char **argv)
VirtualReadScore *scorer = new KmerAffectReadScore(*index);
int num_clone = 0 ;
int clones_without_representative = 0 ;
ofstream out_edges((out_dir+prefix_filename + EDGES_FILENAME).c_str());
int nb_edges = 0 ;
......@@ -886,8 +897,16 @@ int main (int argc, char **argv)
ratio_representative,
max_auditionned);
if (representative == NULL_SEQUENCE) {
clones_without_representative++ ;
if (verbose)
cout << "# (no representative sequence with current parameters)" ;
} else {
//$$ There is one representative, FineSegmenter
if (representative != NULL_SEQUENCE) {
representative.label = string_of_int(it->second) + "-"
+ representative.label;
FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost);
......@@ -1013,7 +1032,7 @@ int main (int argc, char **argv)
}
//$$ Very detailed cluster analysis (with sequences)
//$$ Very detailed cluster analysis (with sequences) // NOT USED NOW
list<string> msa;
bool good_msa = false ;
......@@ -1147,11 +1166,19 @@ int main (int argc, char **argv)
SimilarityMatrix matrix = compare_all(representatives_this_clone, true);
cout << RawOutputSimilarityMatrix(matrix, 90);
}
//$$ End of very_detailed_cluster_analysis
out_edges.close() ;
cout << "#### end of clones" << endl;
cout << endl;
if (clones_without_representative > 0)
{
cout << clones_without_representative << " clones without representatives" ;
cout << " (requiring " << min_cover_representative << "/" << ratio_representative << " supporting reads)" << endl ;
cout << endl ;
}
//$$ Compare representatives of all clones
......
2014-03-26 The Vidjil Team
* Better default seed selection, depending on the germline, segments more reads (vidjil.cpp)
* Better selection of representative read XXX setRequiredSequence (core/representative.cpp)
* Better selection of representative read (core/representative.cpp)
* New option to output all clones (-A), for testing purposes
* Updated debug option (-u) to display k-mer affection (core/windowExtractor.cpp)
* Improved documentation
* New unit tests, updated some tests
* Improved documentation and comments on main stdout
2014-02-20 The Vidjil Team
* Refactored main vidjil.cpp (core/windows.cpp, core/windowExtractor.cpp)
......
......@@ -62,7 +62,8 @@ make test # run self-tests
### Optional dependencies
clustalw (to compute alignments between windows from a same clone)
clustalw (to compute alignments between windows from a same clone, by setting
very_detailed_cluster_analysis in vidjil.cpp)
neato (to display graph of neighbors for the automatic clusterisation)
### Vidjil parameters
......@@ -118,7 +119,7 @@ CTATGATAGTAGTGGTTATTACGGGGTAGGGCAGTACTACTACTACTACATGGACGTCTG
# then gather them into clones (-R 1, with at least 1 read each:
# there are many 1-read clones due to sequencing errors.)
# A more natural option could be -R 5.
# No representative selection / clustalw postprocessing (-x)
# No representative selection (-x)
# Results are in out/segmented.fa, out/windows.fa-* and out/clones*
# out/segmented.fa list segmented reads using the .vdj format (see below)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment