Commit 37c28d85 authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/2643-minimize' into 'dev'

Feature a/2643 minimize

See merge request !76
parents 1b59c773 ff54d77b
Pipeline #15183 passed with stages
in 20 seconds
......@@ -59,6 +59,34 @@ int KmerAffectAnalyser::count(const KmerAffect &affect) const{
}
int KmerAffectAnalyser::minimize(const KmerAffect &affect, int margin, int width) const {
int i = margin ;
uint64_t val_max = 0 ;
int i_max = NO_MINIMIZING_POSITION ;
for (vector<KmerAffect>::const_iterator it = affectations.begin() + margin;
it < affectations.end() - margin && i <= seq.length() - width;
it++, i++) {
if (*it == affect)
{
uint64_t val = dna_to_hash(&seq[i], width) ;
if (val > val_max) {
val_max = val ;
i_max = i ;
}
}
}
if (i_max == NO_MINIMIZING_POSITION)
return i_max ;
return i_max + (seq.length() - affectations.size() + 1) / 2;
}
const KmerAffect&KmerAffectAnalyser::getAffectation(int i) const{
assert(i >= 0 && i < count());
return affectations[i];
......
......@@ -11,6 +11,8 @@
#include <iostream>
#include <iomanip>
#define NO_MINIMIZING_POSITION -1
// Define two constant affectations: ambiguous and unknown.
/* Declaration of types */
......@@ -148,6 +150,15 @@ class KmerAffectAnalyser: public AffectAnalyser {
int count(const KmerAffect &affect) const;
/**
* @return a minimizing position on the matching affectations (taking into account the next affectations),
* or NO_MINIMIZING_POSITION if there is no such position
* @param affect: affectation to match
* @param margin: number of positions at both ends in which one does not take into account the matching affectation
* @param width: number of affectations to consider, starting from the matching affectation, for the minimization
*/
int minimize(const KmerAffect &affect, int margin, int width) const;
const KmerAffect &getAffectation(int i) const;
vector<KmerAffect> getAllAffectations(affect_options_t options) const;
......
......@@ -477,11 +477,17 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
return ;
}
int pos = kaa.minimize(kmer, DEFAULT_MINIMIZE_ONE_MARGIN, DEFAULT_MINIMIZE_WIDTH);
if (pos == NO_MINIMIZING_POSITION)
{
because = UNSEG_TOO_SHORT_FOR_WINDOW;
return ;
}
segmented = true ;
because = reversed ? SEG_MINUS : SEG_PLUS ;
int pos = sequence.size() / 2 ;
info = "=" + string_of_int(c) + " @" + string_of_int(pos) ;
// getJunction() will be centered on pos
......
......@@ -46,6 +46,14 @@
be overriden by the command line by
providing a shorter -w*/
#define DEFAULT_MINIMIZE_WIDTH 12 /* Number of nucleotides on which the minimization integer is computed */
#define DEFAULT_MINIMIZE_ONE_MARGIN 20 /* Number of nucleotides on ends of the reads excluded from the minimization in SEG_METHOD_ONE.
- A read smaller than (DEFAULT_MINIMIZE_ONE_MARGIN*2 + max(k, DEFAULT_MINIMIZE_WIDTH)), that is 52,
will trigger UNSEG_TOO_SHORT_FOR_WINDOW.
- Margins above the half of the window length may produce shortened or shifted windows
*/
using namespace std;
using json = nlohmann::json;
......
......@@ -137,6 +137,17 @@ int dna_to_int(const string &word, int size) {
return index_word;
}
uint64_t dna_to_hash(const string &word, int size) {
// djb2-xor hash
// no collision on 12-char strings on "ACGT" (nor on 8-char strings on "ACGTacgt")
// min/max for 12-char strings on "AGCT" are "AAGAACCAACCC"/"TGAACCAACCAA"
uint64_t hash = 5381;
for(int i = 0 ; i < size ; i++){
hash = ((hash << 5) + hash) ^ word[i];
}
return hash;
}
string nuc_to_aa(const string &word) {
string aa;
int index_word = 0;
......
......@@ -136,9 +136,10 @@ inline int nuc_to_int(char nuc) {
}
/**
* Convert size nucleotides from a DNA string to an integer.
* Convert size nucleotides from a DNA string to an integer or to an hash.
*/
int dna_to_int(const string &, int size);
uint64_t dna_to_hash(const string &, int size);
#define GENETIC_CODE \
"KNKN" "TTTT" "RSRS" "IIMI" \
......
>cd19-1-100
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
# Shortened reads
>cd19-1-95
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGG
>cd19-1-90
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGA
>cd19-1-85
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTA
>cd19-1-80
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGT
>cd19-1-75
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGA
>cd19-1-70
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
>cd19-1-65
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCC
>cd19-1-60
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGA
>cd19-1-55
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAG
>cd19-1-50
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGC
>cd19-6-100
CCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-11-100
GTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-16-100
CCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-21-100
AGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-26-100
CCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-31-100
GTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-36-100
TGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-41-100
AGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-46-100
AGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-51-100
TGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-6-95
CCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGG
>cd19-11-90
GTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGA
>cd19-16-85
CCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTA
>cd19-21-80
AGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGT
>cd19-26-75
CCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGA
......@@ -2,15 +2,16 @@
>read-CD4-exact-1
GTAGTAGCCCCTCAGTGCAATGTAGGAGTCCAAGGGGTAAAAACATACAGGGGGGGAAGACCCTCTCCGT
GTCTCAGCTGGAGCTCCAGGATAGTGGCACCTGGACATGCACTGTCTTGCAGAACCAGAAGAAGGTGGAG
>read-CD4-exact-2
GTAGTAGCCCCTCAGTGCAATGTAGGAGTCCAAGGGGTAAAAACATACAGGGGGGGAAGACCCTCTCCGT
>read-CD4-exact-trimmed
CCCCTCAGTGCAATGTAGGAGTCCAAGGGGTAAAAACATACAGGGGGGGAAGACCCTCTCCGT
GTCTCAGCTGGAGCTCCAGGATAGTGGCACCTGGACATGCACTGTCTTGCAGAACCAGAAGAAGGTGGAG
>read-CD19-exact-1
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
AGAGATATGTGGGTAATGGAGACGGGTCTGTTGTTGCCCCGGGCCACAGCTCAAGACGCTGGAAAGTATT
>read-CD19-exact-1
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
>read-CD19-exact-1-trimmed
CCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
AGAGATATGTGGGTAATGGAGACGGGTCTGTTGTTGCCCCGGGCCACAGCTCAAGACGCTGGAAAGTATT
......
!LAUNCH: $VIDJIL_DIR/$EXEC -a -g $VIDJIL_DIR/germline/homo-sapiens-cd.g -A $VIDJIL_DATA/cd-19-trimmed.fa
$ Load CD-sorting.fa
1:homo-sapiens/CD-sorting.fa .* 28 sequences
$ KmerSegmenter, do not map reads < 52bp
1: UNSEG too short w .* 3 .* 50.0
$ KmerSegmenter, map reads and gather some reads, including a large clone
1: found 9 50-windows in 23 reads
1:Clone .* 13 reads
$ FineSegmenter, find 9 clones with CD19
9:clone-.* CD19 .* SEG
!LAUNCH: $VIDJIL_DIR/$EXEC -g $VIDJIL_DIR/germline/homo-sapiens-cd.g -A $VIDJIL_DATA/cd-4-19.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -K -g $VIDJIL_DIR/germline/homo-sapiens-cd.g -A $VIDJIL_DATA/cd-4-19.fa ; grep 'seed' out/cd-4-19.affects
$ Load CD-sorting.fa
1:homo-sapiens/CD-sorting.fa .* 28 sequences
......@@ -6,6 +6,13 @@ $ Load CD-sorting.fa
$ KmerSegmenter, map reads
1: found 6 50-windows in 8 reads
$ KmerSegmenter, cluster lightly trimmed or mutated reads with original ones
2:Clone .* 2 reads
$ KmerSegmenter, the above clusterisation comes from coherent minimizing positions
1:read-CD4-exact-1 .* @85
1:read-CD4-exact-trimmed .* @78
$ FineSegmenter, find 3 clones with CD4 and 3 with CD19
3:clone-00.* CD4 .* SEG
3:clone-00.* CD19 .* SEG
......
......@@ -23,6 +23,14 @@ void testAffectAnalyser1() {
TAP_TEST_EQUAL(ckaa.getAllowedOverlap(), k-1, TEST_COUNT_AA_GET_OVERLAP, "");
TAP_TEST_EQUAL(ckaa.getSequence(), "AAAACCCCCGGGGG", TEST_AA_GET_SEQUENCE, "actual: " << ckaa.getSequence());
int shift = (kaa.getSequence().length() - kaa.getAllAffectations(AO_NONE).size() + 1) / 2;
TAP_TEST_EQUAL(kaa.minimize(KmerAffect::getAmbiguous(), 0, 4), 0+shift, TEST_AA_MINIMIZE, ""); // first k-mer, AAAA is ambiguous
TAP_TEST_EQUAL(kaa.minimize(KmerAffect::getAmbiguous(), 1, 4), NO_MINIMIZING_POSITION, TEST_AA_MINIMIZE, ""); // too large margin (left side)
TAP_TEST_EQUAL(kaa.minimize(KmerAffect("A", 1, k), 0, 3), NO_MINIMIZING_POSITION, TEST_AA_MINIMIZE, ""); // no non-ambiguous AAAA
TAP_TEST_EQUAL(kaa.minimize(KmerAffect("C", 1, k), 3, 4), 4+shift, TEST_AA_MINIMIZE, ""); // margin = 3, does not affect C
TAP_TEST_EQUAL(kaa.minimize(KmerAffect("C", 1, k), 5, 4), 5+shift, TEST_AA_MINIMIZE, ""); // margin = 5, second k-mer C exactly fits between both margins
TAP_TEST_EQUAL(kaa.minimize(KmerAffect("G", 1, k), 5, 4), NO_MINIMIZING_POSITION, TEST_AA_MINIMIZE, ""); // too large margin (right side)
for (int i = 2; i < nb_seq-1; i++) {
// i starts at 2 because AAAA is not found: there is an ambiguity with
// AAAA coming from AAAACAAAACAAAAC or AAAAAAAAAAAAAAA
......
......@@ -280,6 +280,9 @@ void testDNAToInt() {
TAP_TEST_EQUAL(dna_to_int("TTTT", 4), 255, TEST_DNA_TO_INT, "");
}
void testDNAToHash() {
TAP_TEST_EQUAL(dna_to_hash("ACGT", 4), 6383640340, TEST_DNA_TO_HASH, "");
}
void testNucToAA() {
cout << GENETIC_CODE << endl;
......@@ -482,6 +485,7 @@ void testTools() {
testCreateSequence();
testNucToInt();
testDNAToInt();
testDNAToHash();
testNucToAA();
testRevcompInt();
testExtendedNucleotides();
......
......@@ -30,6 +30,7 @@ enum {
TEST_SEQUENCE_OUT,
TEST_NUC_TO_INT,
TEST_DNA_TO_INT,
TEST_DNA_TO_HASH,
TEST_NUC_TO_AA,
TEST_REVCOMP_INT,
TEST_EXTENDED_NUCL,
......@@ -100,6 +101,7 @@ enum {
TEST_COUNT_AA_MAX,
TEST_COUNT_AA_MAX12,
TEST_AA_SORT_LEFT_RIGHT,
TEST_AA_MINIMIZE,
/* Cluster */
TEST_CLUSTER,
......@@ -210,6 +212,7 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_SEQUENCE_OUT, "Test operator<< for Sequence");
RECORD_TAP_TEST(TEST_NUC_TO_INT, "nuc_to_int()");
RECORD_TAP_TEST(TEST_DNA_TO_INT, "dna_to_int()");
RECORD_TAP_TEST(TEST_DNA_TO_HASH, "dna_to_hash()");
RECORD_TAP_TEST(TEST_REVCOMP_INT, "revcomp_int()");
RECORD_TAP_TEST(TEST_EXTRACT_BASENAME, "extractBasename()");
RECORD_TAP_TEST(TEST_N_CHOOSE_K, "test nChooseK()");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment