Commit bf570c13 authored by Mathieu Giraud's avatar Mathieu Giraud

Merge branch 'feature-a/3354' into 'dev'

Fix #3354

Closes #3354

See merge request !246
parents 7066cb07 f06cd9cc
Pipeline #32763 canceled with stages
in 3 seconds
......@@ -835,22 +835,10 @@ bool comp_pair (pair<int,int> i,pair<int,int> j)
}
/**
* Align a read against a collection of sequences, maximizing the alignment 'score'
* @param read: the read
* @param rep: a collection of reference sequences
* @param reverse_ref: if true, reverse the reference sequences (VkVk)
* @param reverse_both: if true, reverse both the read and the reference sequences (J segment)
* @param local: if true, Local alignment (D segment), otherwise LocalEndWithSomeDeletions and onlyBottomTriangle (V and J segments)
* @param box: the AligBox to fill
* @param segment_cost: the cost used by the dynamic programing
* @param banded_dp: should we perform a banded DP?
* @post box is filled
*/
void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id,
bool reverse_ref, bool reverse_both, bool local,
AlignBox *box, Cost segment_cost, bool banded_dp)
AlignBox *box, Cost segment_cost, bool banded_dp,
double evalue_threshold)
{
int best_score = MINUS_INF ;
......@@ -920,11 +908,15 @@ void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id
length = min(length, (int) rep.sequence(box->ref_nb).size());
length += del_end;
// length is an estimation of the number of aligned nucleotides. It would be better with #2138
int score_with_limit_number_of_indels = (length - BOTTOM_TRIANGLE_SHIFT) * segment_cost.match + BOTTOM_TRIANGLE_SHIFT * segment_cost.insertion;
if (onlyBottomTriangle && best_score < score_with_limit_number_of_indels) {
int min_number_of_matches = min(int(length * FRACTION_ALIGNED_AT_WORST), length - BOTTOM_TRIANGLE_SHIFT); // Minimal number of matches we can have with a triangle
int max_number_of_insertions = length - min_number_of_matches;
int score_with_limit_number_of_indels = min_number_of_matches * segment_cost.match + max_number_of_insertions * segment_cost.insertion;
float evalue = sequence_or_rc.size() * rep.totalSize() * segment_cost.toPValue(best_score);
if (onlyBottomTriangle && (best_score < score_with_limit_number_of_indels
|| evalue > evalue_threshold)) {
// Too many indels/mismatches, let's do a full DP
align_against_collection(read, rep, forbidden_rep_id, reverse_ref, reverse_both,
local, box, segment_cost, false);
local, box, segment_cost, false,evalue_threshold);
return;
}
......@@ -1030,13 +1022,15 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
sequence_or_rc = revcomp(sequence, reversed); // sequence, possibly reversed
// the threshold is lowered by the number of independent tests made
double standardised_threshold_evalue = threshold / multiplier;
/* Read mapping */
if (germline->seg_method == SEG_METHOD_ONE)
{
align_against_collection(sequence_or_rc, germline->rep_4, NO_FORBIDDEN_ID, false, false,
true, // local
box_D, segment_cost);
box_D, segment_cost, false, standardised_threshold_evalue);
segmented = true ;
because = reversed ? SEG_MINUS : SEG_PLUS ;
......@@ -1058,13 +1052,13 @@ FineSegmenter::FineSegmenter(Sequence seq, Germline *germline, Cost segment_c,
FilterWithACAutomaton* f = germline->getFilter_5();
BioReader filtered = f->filterBioReaderWithACAutomaton(sequence_or_rc, kmer_threshold);
align_against_collection(sequence_or_rc, filtered, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
box_V, segment_cost);
box_V, segment_cost, false, standardised_threshold_evalue);
}else{
align_against_collection(sequence_or_rc, germline->rep_5, NO_FORBIDDEN_ID, reverse_V, reverse_V, false,
box_V, segment_cost);
box_V, segment_cost, false, standardised_threshold_evalue);
}
align_against_collection(sequence_or_rc, germline->rep_3, NO_FORBIDDEN_ID, reverse_J, !reverse_J, false,
box_J, segment_cost);
box_J, segment_cost, false, standardised_threshold_evalue);
// J was run with '!reverseJ', we copy the box informations from right to left
// Should this directly be handled in align_against_collection() ?
box_J->start = box_J->end ;
......@@ -1144,7 +1138,7 @@ bool FineSegmenter::FineSegmentD(Germline *germline,
// Align
align_against_collection(str, germline->rep_4, forbidden_id, false, false, true,
box_DD, segment_cost);
box_DD, segment_cost, false, evalue_standardised_threshold_evalue);
box_DD->start += l ;
box_DD->end += l ;
......
......@@ -54,7 +54,7 @@
will trigger UNSEG_TOO_SHORT_FOR_WINDOW.
- Margins above the half of the window length may produce shortened or shifted windows
*/
#define FRACTION_ALIGNED_AT_WORST .5 /* Fraction of the sequence that should be aligned before deactivating the heuristics */
using namespace std;
using json = nlohmann::json;
......@@ -366,6 +366,7 @@ class FineSegmenter : public Segmenter
* Build a fineSegmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param germline: germline used
* @param threshold: threshold of randomly expected segmentation
* @param kmer_threshold: This threshold is used while filtering the V
* BioReader in Germline. If this value is 0, every K-mer from getMultiResults
* is used for the filtering. Otherwise if N > 0, the N best K-mers are used
......@@ -403,8 +404,23 @@ class FineSegmenter : public Segmenter
};
/**
* Align a read against a collection of sequences, maximizing the alignment 'score'
* @param read: the read
* @param rep: a collection of reference sequences
* @param reverse_ref: if true, reverse the reference sequences (VkVk)
* @param reverse_both: if true, reverse both the read and the reference sequences (J segment)
* @param local: if true, Local alignment (D segment), otherwise LocalEndWithSomeDeletions and onlyBottomTriangle (V and J segments)
* @param box: the AligBox to fill
* @param segment_cost: the cost used by the dynamic programing
* @param banded_dp: Should we use banded dynamic programming?
* @param evalue_threshold: threshold for randomly expected segmentation (evalue) to relaunch a full DP without banded_dp
* @post box is filled
*/
void align_against_collection(string &read, BioReader &rep, int forbidden_rep_id,
bool reverse_ref, bool reverse_both, bool local,
AlignBox *box, Cost segment_cost, bool banded_dp=true);
AlignBox *box, Cost segment_cost, bool banded_dp=true,
double evalue_threshold=1.);
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment