Commit f4ea8cc5 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

merge -- window qualities

A new feature by @duez: we can now report the average quality in the representative sequence
of each clone. The quality is computed over Vidjil's window, shared between all reads of the clone.
parents d0a47246 e6918bd2
......@@ -63,6 +63,10 @@ float KmerRepresentativeComputer::getCoverage() const{
return coverage;
}
string KmerRepresentativeComputer::getQuality() const{
return quality;
}
string KmerRepresentativeComputer::getCoverageInfo() const{
return coverage_info;
}
......@@ -98,14 +102,20 @@ void KmerRepresentativeComputer::compute() {
size_t length_longest_run = 0;
size_t seq_index_longest_run = 1;
Sequence sequence_longest_run;
int sequence_used_for_quality = 0;
int window_quality_sum [required.length()];
memset(window_quality_sum, 0, required.length()*sizeof(int));
size_t k = getSeed().length();
for (size_t seq = 1; seq <= sequences.size() && seq <= seq_index_longest_run + stability_limit ; seq++) {
Sequence sequence = rc.getithBest(seq);
if (sequence.sequence.size() <= length_longest_run) {
// Break as soon as the sequences are too small
if (sequence.sequence.size() < required.size()) {
break;
}
size_t pos_required = sequence.sequence.find(required);
if (pos_required == string::npos && revcomp)
pos_required = sequence.sequence.find(::revcomp(required));
......@@ -113,6 +123,20 @@ void KmerRepresentativeComputer::compute() {
if (pos_required == string::npos) {
continue;
}
//sum quality
if (sequence.quality.length() > 0) {
for (size_t i = 0; i<required.length(); i++) {
window_quality_sum[i] += static_cast<int>(sequence.quality[i+pos_required]);
}
sequence_used_for_quality++;
}
// When sequences are smaller than length_longest_run,
// they are used only for the above quality computation
if (sequence.sequence.size() <= length_longest_run) {
continue;
}
size_t pos_end_required = pos_required + required.length();
......@@ -159,8 +183,26 @@ void KmerRepresentativeComputer::compute() {
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]"
+ "-#" + string_of_int(seq_index_longest_run);
representative.quality = "";
quality = "";
if (sequence_used_for_quality > 0) {
//init default quality
for (size_t i = 0; i < representative.sequence.length(); i++)
quality += "!";
//add window quality
size_t pos_required = representative.sequence.find(required);
if (pos_required == string::npos && revcomp) {
size_t pos_required_rev = ::revcomp(representative.sequence).find(required);
size_t pos_end_required_rev = pos_required_rev+required.length();
for (size_t i = 0; i<required.length(); i++)
quality[pos_end_required_rev-i-1] = static_cast<char>(window_quality_sum[i]/sequence_used_for_quality);
}else{
for (size_t i = 0; i<required.length(); i++)
quality[pos_required+i] = static_cast<char>(window_quality_sum[i]/sequence_used_for_quality);
}
}
coverage = (float) length_longest_run / coverage_reference_length;
coverage_info = string_of_int(length_longest_run) + " bp"
......
......@@ -24,6 +24,7 @@ protected:
string required;
float coverage_reference_length;
float coverage;
string quality;
string coverage_info;
public:
RepresentativeComputer(list<Sequence> &r);
......@@ -105,6 +106,7 @@ public:
// Getters, setters
string getSeed() const;
float getCoverage() const;
string getQuality() const;
string getCoverageInfo() const;
/**
......
!LAUNCH: $VIDJIL_DIR/vidjil $VIDJIL_DEFAULT_OPTIONS -c clones -A -G $VIDJIL_DIR/germline/TRG -A $VIDJIL_DIR/data/segment_lec.fq > /dev/null ; cat out/segment_lec.vidjil
$ Window
1:"id": "GGGGTCTATTACTGTGCCACCTGGGCCTTATTATAAGAAACTCTTTGGCA"
$ Average quality over the window
1: !!!KLMNOPQRSTUVWXYZ0123456789ABCDEFGHIJKLMNOP@RSTUVWX!!!!
$ All 'start' fields are 1-based, they never equal to zero
0: "start": 0
......@@ -13,6 +13,9 @@ $ Most abundant window
$ Affect values are over all the sequence
1: "affectValues": .[^}]*"start": 1, "stop": 128
$ No quality information here
0: "quality"
$ Segmentation
1:"name": "IGHV3-23.05 6/ACCCGGGAGGAACAATAT/9 IGHD6-13.01 0//5 IGHJ4.02"
......
......@@ -2,7 +2,7 @@
#include "core/representative.h"
void testRepresentative() {
list<Sequence> reads = Fasta("../../data/representative.fa").getAll();
list<Sequence> reads = Fasta("../../data/representative.fq").getAll();
KmerRepresentativeComputer krc(reads, "##############");
......@@ -35,9 +35,16 @@ void testRepresentative() {
krc.setRequiredSequence("ATCGCGCCCT"); // revcomp
krc.compute();
representative = krc.getRepresentative();
string quality = krc.getQuality();
TAP_TEST(representative.label.find("seq2-[33,52]") == 0, TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ,
"When requiring sequence ATCGCGCCCT, we should have seq2 (" << representative.label << " instead)");
TAP_TEST(representative.sequence.length() == quality.length() ,TEST_KMER_REPRESENTATIVE_QUALITY,
"representative sequence length(" << representative.sequence.length() << ") and quality sequence length(" << quality.length() << ") must be equal");
TAP_TEST(quality == "!!!!!!!>--------<!!!" ,TEST_KMER_REPRESENTATIVE_QUALITY,
"representative quality sequence and seq2 quality must be the same for the required sequence");
krc.setRevcomp(false);
krc.compute();
TAP_TEST(! krc.hasRepresentative(), TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ,
......@@ -52,7 +59,7 @@ void testRepresentative() {
}
void testRevcompRepresentative() {
list<Sequence> reads = Fasta("../../data/representative_revcomp.fa").getAll();
list<Sequence> reads = Fasta("../../data/representative_revcomp.fq").getAll();
KmerRepresentativeComputer krc(reads, "##############");
krc.setOptions(false, 3, 0.5);
......@@ -82,4 +89,4 @@ void testRevcompRepresentative() {
TAP_TEST(revcomp(representative.sequence) == representative2.sequence, TEST_KMER_REPRESENTATIVE_REVCOMP,
"The two representatives should have the same sequence (but revcomp-ed)");
}
}
\ No newline at end of file
......@@ -103,6 +103,7 @@ enum {
TEST_KMER_REPRESENTATIVE,
TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ,
TEST_KMER_REPRESENTATIVE_REVCOMP,
TEST_KMER_REPRESENTATIVE_QUALITY,
/* Kmer segmentation */
TEST_KMER_IS_SEGMENTED,
......@@ -292,6 +293,7 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE, "Test KmerRepresentativeComputer computations");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REQUIRED_SEQ, "Test KmerRepresentativeComputer computations with a required sequence");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_REVCOMP, "Test KmerRepresentativeComputer computations on a dataset and its revcomp");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE_QUALITY, "Test KmerRepresentativeComputer quality computations");
RECORD_TAP_TEST(TEST_BUG_SEGMENTATION, "Test segmentation bug");
RECORD_TAP_TEST(TEST_SEGMENT_POSITION, "Test V,D,J position");
RECORD_TAP_TEST(TEST_KMER_SEGMENT_OVERLAP, "Test kmer segmentation with an overlap");
......
......@@ -1356,6 +1356,13 @@ int main (int argc, char **argv)
json_clone["_coverage_info"] = {repComp.getCoverageInfo()};
//From KmerMultiSegmenter
json_clone["seg"] = kseg->toJson();
if (repComp.getQuality().length())
json_clone["seg"]["quality"] = {
{"start", 1},
{"stop", kseg->getSequence().sequence.length()},
{"seq", repComp.getQuality()}
};
//$$ If max_clones is reached, we stop here but still outputs the representative
......
......@@ -36,7 +36,7 @@ function Clone(data, model, index, virtual) {
this.segEdited = false
this.virtual = typeof virtual !== 'undefined' ? virtual : false
var key = Object.keys(data)
for (var i=0; i<key.length; i++ ){
this[key[i]]=data[key[i]]
}
......
......@@ -331,6 +331,8 @@ Model_loader.prototype = {
for (var i=0 ; i<this.reads.segmented.length; i++){
this.reads.segmented_all[i] = this.reads.segmented[i]
}
this.compute_average_quality();
//extract germline
......@@ -700,6 +702,29 @@ Model_loader.prototype = {
newSeg = seg[key+bound];
}
return newSeg;
},
compute_average_quality: function() {
this.min_quality = 255;
this.max_quality = 0;
this.average_quality = 0;
var count = 0;
for (var i in this.clones){
if (typeof this.clones[i].seg != "undefined" && typeof this.clones[i].seg.quality != "undefined"){
var s = this.clones[i].seg.quality.seq;
for (var j in s){
if (s[j] != "!"){
var c = s[j].charCodeAt(0);
if (c < this.min_quality) this.min_quality=c;
if (c > this.max_quality) this.max_quality=c;
this.average_quality += c;
count++;
}
}
}
}
this.average_quality = this.average_quality/count;
}
};
......@@ -1134,7 +1134,7 @@ Sequence.prototype = {
highlightSpan.dataset.tooltip = h.tooltip;
highlightSpan.dataset.tooltip_position = "right";
}
highlightSpan.appendChild(document.createTextNode(h.seq));
highlightSpan.innerHTML = h.seq;
var highlightWrapper = document.createElement("span");
highlightWrapper.appendChild(highlightSpan);
......@@ -1273,6 +1273,16 @@ Sequence.prototype = {
if (this.seq[k] != '-') {
var cc = raw_seq[j++]
if ((cc != '_') && (cc != ' ')) c = cc ;
if (field == "quality") {
var percent = (cc.charCodeAt(0)-this.m.min_quality)/(this.m.max_quality-this.m.min_quality)
var color_hue = (percent * 100);
var color = colorGenerator(color_hue);
c = "<span style='height:"+(50+(percent*50))+"%;"
c += "top:"+(100-(50+(percent*50)))+"%;"
c += "position:relative;"
c += "background-color:"+color+";"
c += "'>\u00A0</span>"
}
}
h.seq += c
}
......
>seq1
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
>seq2
AAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
TAGGGCGCGATCGACTTC
>seq3
GCGCGATCGACTTCTCGAGCATGTAGGCTACACGGTA
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTT
>seq4
TATAGGGCGCGATCGACTTC
@seq1
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOP
@seq2
AAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTATAGGGCGCGATCGACTTC
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJ<-------->UVWXYZA
@seq3
GCGCGATCGACTTCTCGAGCATGTAGGCTACACGGTATAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTT
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUV
@seq4
TATAGGGCGCGATCGACTTC
+
ABCDEFGHIJKLMNOPQRST
>lcl|FLN1FA002QB069.1|
@lcl|FLN1FA002QB069.1|
ggactggagtggattgggtatatctattacagtgggagcaccaactacaacccctccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccgtgtattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA001DWH5L.1|
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMN<_____________>DEFGHIJKLMNOPQRSTUVWXYZA
@lcl|FLN1FA001DWH5L.1|
cagtgggagcaccctttacaacccctccctcaagagtcgagtcaccatatcagaagacacgtccgagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccatatattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA001CU46B.1|
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJK<_____________>ABCDEFGHIJKLMNOPQRSTUVWX
@lcl|FLN1FA001CU46B.1|
gcctggagtggattgggtacatctattacagtgggagcacctactacaacccgtccctcaagagtcgagttaccatatcagtagacacgtctaagaaccagttctccctgaagctgagctctgtgactgccgcggacacggccgtgtattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA002Q5881.1|
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLM<_____________>CDEFGHIJKLMNOPQRSTUVWXYZ
@lcl|FLN1FA002Q5881.1|
cagtgggagcaccctttacaacccctccctcaagagtcgagtcaccatatcagaagacacgtccgagaaccagttctccctgaagctgagctctgtgaccgctgcggacacggccatatattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
>lcl|FLN1FA001CR02J.1|
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJK<_____________>ABCDEFGHIJKLMNOPQRSTUVWX
@lcl|FLN1FA001CR02J.1|
ggctggagtggattggggaaatcaatcatagtggaagcaccaactacaacccgtccctcaagagtcgagtcaccatatcagtagacacgtccaagaaccagttctccctgaagctgagctctgtgaccgccgcggacacggctgtgtattactgtgcgagagggacatggtataacgacggctggcctcactttgactactggggcccgggaaccctggtcaccgtctcctcag
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLM<_____________>CDEFGHIJKLMNOPQRSTUVWXYZ
@read1
CTCATACACCCAGGAGGTGGAGCTGGATATTGATACTACGAAATCTAATTGAAAATGATTCTGGGGTCTATTACTGTGCCACCTGGGCCTTATTATAAGAAACTCTTTGGCAGTGGAAC
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZAAAAA
@read2
CTCATACACCCAGGAGGTGGAGCTGGATATTGATACTACGAAATCTAATTGAAAATGATTCTGGGGTCTATTACTGTGCCACCTGGGCCTTATTATAAGAAACTCTTTGGCAGTGGAAC
+
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789ABCDEFGHIJKLMNOP0RSTUVWXYZAAAAA
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment