Commit 315c4bfb authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

Merge branch 'feature-a/4665-v-d-kmers' into 'dev'

fix typos, show D seeds, bikeshed .affects

Closes #4727 and #4338

See merge request !932
parents 9b0a61a3 9fa2024b
Pipeline #232129 passed with stages
in 8 minutes and 28 seconds
......@@ -182,7 +182,8 @@ ostream &operator<<(ostream &out, const FilterWithACAutomaton& obj){
int total_sequences_origin = total_filtered_calls * origin_bioreader_size;
float aligned_rate = ((float)total_sequences_filtered/(float)total_sequences_origin) * 100;
out << fixed << setw(8) << total_sequences_filtered << "/"
out << right
<< fixed << setw(8) << total_sequences_filtered << "/"
<< fixed << setw(8) << total_sequences_origin << " "
<< fixed << setprecision(1) << setw(6) << aligned_rate << "%"
<< endl ;
......
......@@ -230,23 +230,33 @@ Germline::~Germline()
}
}
void out_index_seed(ostream &out,
const Germline &germline,
string seed_x,
string affect_x)
{
size_t seed_x_span = seed_x.size();
out << " " << fixed << setprecision(3) << setw(5)
<< 100 * germline.index->getIndexLoad(KmerAffect(affect_x, 1, seed_x_span)) << "%"
<< " l" << left << setw(2) << seed_x.length()
<< " k" << left << setw(2) << seed_weight(seed_x)
<< " " << left << setw(15) << seed_x ;
}
ostream &operator<<(ostream &out, const Germline &germline)
{
out << setw(5) << left << germline.code << right << " '" << germline.shortcut << "' "
<< " ";
size_t seed_5_span = germline.seed_5.size();
size_t seed_5_w = seed_weight(germline.seed_5);
size_t seed_3_span = germline.seed_3.size();
size_t seed_3_w = seed_weight(germline.seed_3);
if (germline.index) {
out << " 0x" << hex << setw(2) << setfill('0') << germline.index->id << dec << setfill(' ') << " " ;
out << fixed << setprecision(3) << setw(8)
<< 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_5, 1, seed_5_span)) << "%" << " "
<< 100 * germline.index->getIndexLoad(KmerAffect(germline.affect_3, 1, seed_3_span)) << "%";
out << " l" << germline.seed_5.length() << " k" << seed_5_w << " " << germline.seed_5 ;
out << " l" << germline.seed_3.length() << " k" << seed_3_w << " " << germline.seed_3 ;
out_index_seed(out, germline, germline.seed_5, germline.affect_5);
if (germline.rep_4.size())
out_index_seed(out, germline, germline.seed_4, germline.affect_4);
out_index_seed(out, germline, germline.seed_3, germline.affect_3);
}
out << endl;
......
......@@ -431,13 +431,20 @@ string KmerSegmenter::getInfoLineWithAffects() const
{
stringstream ss;
ss << "# "
ss << "= " << right << setw(10) << segmented_germline->code << " "
<< right << setw(3) << score << " "
<< left << setw(30)
<< getInfoLine() ;
<< getInfoLine();
if (getSegmentationStatus() != UNSEG_TOO_SHORT)
ss << getKmerAffectAnalyser()->toString();
{
ss << endl;
ss << "# " << right << setw(10) << segmented_germline->code << " "
<< getKmerAffectAnalyser()->toStringValues();
ss << endl;
ss << "$ " << right << setw(10) << segmented_germline->code << " "
<< getKmerAffectAnalyser()->toStringSigns();
}
return ss.str();
}
......
......@@ -118,6 +118,7 @@ JsonOutputWindowsMatrix::JsonOutputWindowsMatrix(SimilarityMatrix &m, float sim,
ostream &operator<<(ostream &out, const RawOutputSimilarityMatrix &outputMatrix) {
SimilarityMatrix &matrix = outputMatrix.matrix;
out << right;
out << " | " ;
for (int num = 0; num < matrix.size(); num++)
......
......@@ -88,10 +88,10 @@ string fixed_string_of_float(float number, int precision)
return ss.str();
}
string scientific_string_of_double(double number)
string scientific_string_of_double(double number, int precision)
{
stringstream ss;
ss << scientific << number ;
ss << scientific << setprecision(precision) << number ;
return ss.str();
}
......
......@@ -128,7 +128,7 @@ bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b);
string string_of_int(int number, int pad_to_width=0);
string fixed_string_of_float(float number, int precision);
string scientific_string_of_double(double number);
string scientific_string_of_double(double number, int precision=2);
string string_of_map(map <string, string> m, const string &before);
/**
......
......@@ -53,7 +53,9 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
nb_reads++;
if (out_affects) {
*out_affects << reads->getSequence();
Sequence seq = reads->getSequence();
*out_affects << ">" << seq.label << endl
<< setw(13) << " " << seq.sequence << endl;
}
KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects, nb_expected, nb_reads_for_evalue);
......@@ -114,7 +116,7 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
// Last line of detailed affects output
if (out_affects) {
*out_affects << "#>" << seg->label << " " << seg->getInfoLine() << endl << endl;
*out_affects << "==> " << seg->label << " " << seg->getInfoLine() << endl << endl;
}
// Progress bar
......@@ -125,7 +127,9 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
cout << "." ;
if (!(nb_reads % (PROGRESS_POINT * PROGRESS_LINE)))
cout << setw(10) << nb_reads / 1000 << "k reads " << fixed << setprecision(2) << setw(14) << bp_total / 1E6 << " Mbp" << endl ;
cout << right
<< setw(10) << nb_reads / 1000 << "k reads "
<< fixed << setprecision(2) << setw(14) << bp_total / 1E6 << " Mbp" << endl ;
cout.flush() ;
}
......
!OUTPUT_FILE: out/bug4225-j.affects
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline -r 1 -1 -2 -K bug4225-j.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -c windows -g $VIDJIL_DIR/germline -r 1 -1 -2 -K bug4225-j.fa
$ Show name and sequence
1:>igkj1
1:GTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC
$ Display three lines per system
3:TRG
3:IGH\+
$ Find only +k and ? affects before the stretch of _ for all loci
16: seed .*(\+k| \?){28}( _)+$
16:^\#.* (k|\?){28}_+$
16:^\$.* (\+| ){28}( )+$
!OUTPUT_FILE: out/chimera-fake-half.affects
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline -r 1 -4 -K ../data/chimera-fake-half.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -c windows -g $VIDJIL_DIR/germline/homo-sapiens.g:TRB -r 1 -4 -K ../data/chimera-fake-half.fa
$ Find only 48 +B affects on the TRB and unexpected lines
2:^\#.* (_){40}(B){48}(_){10}
2:^\$.* (\s){40}(\+){48}(\s){10}
$ Find only +B and ? affects on the TRB and unexpected lines
2: seed .* _(\+B| \?){48} _
......@@ -13,8 +13,8 @@ $ Parses IGHD.fa germline
$ Parses germline/homo-sapiens/IGHJ.fa
1: 701 bp in 13 sequences
$ Find the good index load
1:custom .* 0.078% 0.002% l13 k12
$ Find the good index loads
1:custom .* 0.078% l13 k12 .* 0.002% l13 k12
$ Find approximately the good number of sequences for e-value computation
1: approx. 131.. sequences
......
......@@ -4,7 +4,8 @@ $ Segments the sequence
1: SEG .* -> .* 1
$ Find the good affects for the D segment, including a AMBIGUOUS k-mer, common with V
1:_ _ _ \?.f.f.f.f.f.f _ _ _ _
1:___\?ffffff____
1: \+\+\+\+\+\+
$ The windows found by the KmerSegmenter has a fairly good position
2: 1 163 180 220
......@@ -4,6 +4,6 @@
$ Clone 13 is correctly analyzed
1:FLN1FA001EP9M2.* IGHV2-26.* 2/GAT.*GCC/8 IGHJ2
$ Statistics on -Z
1:Statistics on clone analysis
$ Statistics on --analysis-filter
1:Statistics on filtered genes for clone analysis
rb1: IGH 3[0-2][0-9]{2}/ 1[0-2][0-9]{3} 28..%
\ No newline at end of file
!LAUNCH: $VIDJIL_DIR/$EXEC -r 5 -K -g $VIDJIL_DIR/germline/homo-sapiens.g:TRA,TRB,TRD,TRG,IGH,IGK,IGL $VIDJIL_DATA/multi-short.fa ; head -n 17 out/multi-short.affects
!OUTPUT_FILE: out/multi-short.affects
!LAUNCH: $VIDJIL_DIR/$EXEC -r 5 -K --first-reads 1 -g $VIDJIL_DIR/germline/homo-sapiens.g:TRA,TRB,TRD,TRG,IGH,IGK,IGL $VIDJIL_DATA/multi-short.fa
# Testing .affects output (-K)
$ First sequence (TRA), display sequence
......@@ -7,8 +8,8 @@ $ First sequence (TRA), display sequence
$ First sequence (TRA), count of k-mers
1:129 .* TRA
$ First sequence (TRA), segmentation
1:>TRA--V1-1.01--J1.01 .* seed TRA SEG_+
$ First sequence (TRA), detection
1:==> TRA--V1-1.01--J1.01 .* seed TRA SEG_+
!LAUNCH: $LAUNCHER $VIDJIL_DIR/$EXEC $EXTRA -k 9 -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa -K -c clones $VIDJIL_DATA/revcomp.fa ; grep 'X.X.X' out/revcomp.affects | sed 's/[^X]//g' | sort -u ; grep '#>' out/revcomp.affects | sed 's/.*SEG.../e-value:/' | cut -f 1 -d' '
!LAUNCH: $LAUNCHER $VIDJIL_DIR/$EXEC $EXTRA -k 9 -V $VIDJIL_DIR/germline/homo-sapiens/IGHV.fa -J $VIDJIL_DIR/germline/homo-sapiens/IGHJ.fa -K -c clones $VIDJIL_DATA/revcomp.fa
!NO_LAUNCHER:
!LAUNCH: grep 'X.X.X' out/revcomp.affects | sed 's/[^X]//g' | sort -u
!LAUNCH: grep '==>' out/revcomp.affects | sed 's/.*SEG.../e-value:/' | cut -f 1 -d' '
$ Segments both reads, normal and reverse
1:junction detected in 2 reads
......
......@@ -427,7 +427,7 @@ int main (int argc, char **argv)
bool keep_unsegmented_as_clone = false;
app.add_flag("--not-analyzed-as-clones", keep_unsegmented_as_clone,
"consider not analyzed reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example --not-analyzed-as-clones -AX 20)")
"consider not analyzed reads as clones, taking for junction the complete sequence, to be used on very small datasets (for example --not-analyzed-as-clones --all -X 20)")
-> group(group) -> level();
......@@ -855,7 +855,7 @@ int main (int argc, char **argv)
if (max_clones == NO_LIMIT_VALUE || max_clones > WARN_MAX_CLONES)
{
cout << endl
<< "* WARNING: " << PROGNAME << " was run with '-A' option or with a large '-z' option" << endl ;
<< "* WARNING: " << PROGNAME << " was run with '--all' option or with a large '--max-designations/-z' option" << endl ;
}
if (command == CMD_SEGMENT)
......@@ -1335,7 +1335,7 @@ int main (int argc, char **argv)
if (sort_clones.size() == 0)
{
cout << " ! No clones with current parameters." << endl;
cout << " ! See the 'Limits to report a clone' options (-r, --ratio, -z, -A)." << endl;
cout << " ! See the 'Limits to report and to analyze clones' options (-r, --min-ratio, -z, --all)." << endl;
}
else
{
......@@ -1458,7 +1458,7 @@ int main (int argc, char **argv)
{
cout << "." ;
if (!(num_clone % (PROGRESS_POINT_CLONES * PROGRESS_LINE)))
cout << setw(10) << num_clone / 1000 << "k clones " << endl;
cout << right << setw(10) << num_clone / 1000 << "k clones " << endl;
cout.flush() ;
}
}
......@@ -1518,7 +1518,7 @@ int main (int argc, char **argv)
output.addClone(it->first, clone);
clone->set("sequence", kseg->getSequence().sequence);
clone->set("_coverage", { repComp.getCoverage() });
clone->set("_average_read_length", { windowsStorage->getAverageLength(it->first) });
clone->set("_average_read_length", { fixed_string_of_float(windowsStorage->getAverageLength(it->first), 2) });
clone->set("_coverage_info", {repComp.getCoverageInfo()});
//From KmerMultiSegmenter
kseg->toOutput(clone);
......@@ -1857,7 +1857,7 @@ int main (int argc, char **argv)
//$ Output statistics on filter()
if (verbose && (kmer_threshold != NO_LIMIT_VALUE)) {
cout << "Statistics on clone analysis (-Z):" << endl;
cout << "Statistics on filtered genes for clone analysis (--analysis-filter):" << endl;
for(list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it){
FilterWithACAutomaton *f = (*it)->getFilter_5();
if (f)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment