Commit e977c03c authored by Mikael Salson's avatar Mikael Salson Committed by Mathieu Giraud

read_score.{h,cpp}: ReadQualityScorer

A new score depending on quality and length

The percentile of the quality is computed multiplied by the read length
and is then divided by what we consider being a good quality.
This somehow corresponds to artificially diminishing the read length if
the quality is bad.

We increase the number of bins to make sure that the better quality
reads will be in a different bin than medium quality reads.
parent e9e141d5
#include "read_score.h"
#include <cstdlib>
KmerAffectReadScore::KmerAffectReadScore(IKmerStore<KmerAffect> &idx,
float unambiguous_score,
......@@ -65,3 +66,29 @@ ReadLengthScore::~ReadLengthScore(){}
float ReadLengthScore::getScore(const Sequence &sequence) const {
return sequence.sequence.size();
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////// ReadQualityScore ///////////////////////////////
////////////////////////////////////////////////////////////////////////////////
ReadQualityScore::ReadQualityScore(){}
ReadQualityScore::~ReadQualityScore(){}
float ReadQualityScore::getScore(const Sequence &sequence) const {
size_t *qualities = (size_t *)calloc(MAX_QUALITY, sizeof(size_t));
for (size_t i = 0; i < sequence.quality.size(); i++) {
qualities[(sequence.quality[i]) - '!']++;
}
int max_percentile = (int) round(sequence.quality.size()*1. / 100);
int percent_quality = 0;
// Computes the percentile of the quality
for (size_t i = 0; i < MAX_QUALITY; i++) {
max_percentile -= qualities[i];
if (max_percentile < 0) {
percent_quality = i;
break;
}
}
free(qualities);
return percent_quality * sequence.sequence.size() / GOOD_QUALITY;
}
......@@ -5,6 +5,9 @@
#include "kmerstore.h"
#include "kmeraffect.h"
#define MAX_QUALITY 50 /* Maximal value for the quality */
#define GOOD_QUALITY 30 /* Min value considered as a good quality */
/**
* This virtual class contains a single method that allows to compute a score
* for a string.
......@@ -71,4 +74,19 @@ class ReadLengthScore: public VirtualReadScore {
*/
float getScore(const Sequence &sequence) const;
};
/**
* A simple implementation of VirtualReadScore.
* The score is a trade-off between quality and length of the read
*/
class ReadQualityScore: public VirtualReadScore {
public:
ReadQualityScore();
~ReadQualityScore();
/**
* @return the sequence quality
*/
float getScore(const Sequence &sequence) const;
};
#endif
......@@ -94,7 +94,7 @@ void KmerRepresentativeComputer::compute() {
size_t max = sequences.size();
// Create a read chooser to have the sequences sorted by length
ReadLengthScore *rlc = new ReadLengthScore();
ReadQualityScore *rlc = new ReadQualityScore();
ReadChooser rc(sequences, *rlc);
delete rlc;
......
......@@ -23,7 +23,7 @@
#include "stats.h"
#include "../lib/json.hpp"
#define NB_BINS 15
#define NB_BINS 30
#define MAX_VALUE_BINS 500
using namespace std;
......@@ -40,7 +40,7 @@ class WindowsStorage {
list<pair <junction, size_t> > sort_all_windows;
map<junction, int> id_by_window;
size_t max_reads_per_window;
ReadLengthScore scorer;
ReadQualityScore scorer;
/* Parameters for the read storage */
size_t nb_bins;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment