Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
vidjil
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
1,696
Issues
1,696
List
Boards
Labels
Service Desk
Milestones
Merge Requests
88
Merge Requests
88
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
vidjil
vidjil
Commits
37c28d85
Commit
37c28d85
authored
Jan 29, 2018
by
Mikaël Salson
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feature-a/2643-minimize' into 'dev'
Feature a/2643 minimize See merge request
!76
parents
1b59c773
ff54d77b
Pipeline
#15183
passed with stages
in 20 seconds
Changes
13
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
166 additions
and
8 deletions
+166
-8
algo/core/affectanalyser.cpp
algo/core/affectanalyser.cpp
+28
-0
algo/core/affectanalyser.h
algo/core/affectanalyser.h
+11
-0
algo/core/segment.cpp
algo/core/segment.cpp
+8
-2
algo/core/segment.h
algo/core/segment.h
+8
-0
algo/core/tools.cpp
algo/core/tools.cpp
+11
-0
algo/core/tools.h
algo/core/tools.h
+2
-1
algo/tests/data/cd-19-trimmed.fa
algo/tests/data/cd-19-trimmed.fa
+54
-0
algo/tests/data/cd-4-19.fa
algo/tests/data/cd-4-19.fa
+5
-4
algo/tests/should-get-tests/cd-trimmed.should-get
algo/tests/should-get-tests/cd-trimmed.should-get
+16
-0
algo/tests/should-get-tests/cd.should-get
algo/tests/should-get-tests/cd.should-get
+8
-1
algo/tests/unit-tests/testAffectAnalyser.cpp
algo/tests/unit-tests/testAffectAnalyser.cpp
+8
-0
algo/tests/unit-tests/testTools.cpp
algo/tests/unit-tests/testTools.cpp
+4
-0
algo/tests/unit-tests/tests.h
algo/tests/unit-tests/tests.h
+3
-0
No files found.
algo/core/affectanalyser.cpp
View file @
37c28d85
...
...
@@ -59,6 +59,34 @@ int KmerAffectAnalyser::count(const KmerAffect &affect) const{
}
int
KmerAffectAnalyser
::
minimize
(
const
KmerAffect
&
affect
,
int
margin
,
int
width
)
const
{
int
i
=
margin
;
uint64_t
val_max
=
0
;
int
i_max
=
NO_MINIMIZING_POSITION
;
for
(
vector
<
KmerAffect
>::
const_iterator
it
=
affectations
.
begin
()
+
margin
;
it
<
affectations
.
end
()
-
margin
&&
i
<=
seq
.
length
()
-
width
;
it
++
,
i
++
)
{
if
(
*
it
==
affect
)
{
uint64_t
val
=
dna_to_hash
(
&
seq
[
i
],
width
)
;
if
(
val
>
val_max
)
{
val_max
=
val
;
i_max
=
i
;
}
}
}
if
(
i_max
==
NO_MINIMIZING_POSITION
)
return
i_max
;
return
i_max
+
(
seq
.
length
()
-
affectations
.
size
()
+
1
)
/
2
;
}
const
KmerAffect
&
KmerAffectAnalyser
::
getAffectation
(
int
i
)
const
{
assert
(
i
>=
0
&&
i
<
count
());
return
affectations
[
i
];
...
...
algo/core/affectanalyser.h
View file @
37c28d85
...
...
@@ -11,6 +11,8 @@
#include <iostream>
#include <iomanip>
#define NO_MINIMIZING_POSITION -1
// Define two constant affectations: ambiguous and unknown.
/* Declaration of types */
...
...
@@ -148,6 +150,15 @@ class KmerAffectAnalyser: public AffectAnalyser {
int
count
(
const
KmerAffect
&
affect
)
const
;
/**
* @return a minimizing position on the matching affectations (taking into account the next affectations),
* or NO_MINIMIZING_POSITION if there is no such position
* @param affect: affectation to match
* @param margin: number of positions at both ends in which one does not take into account the matching affectation
* @param width: number of affectations to consider, starting from the matching affectation, for the minimization
*/
int
minimize
(
const
KmerAffect
&
affect
,
int
margin
,
int
width
)
const
;
const
KmerAffect
&
getAffectation
(
int
i
)
const
;
vector
<
KmerAffect
>
getAllAffectations
(
affect_options_t
options
)
const
;
...
...
algo/core/segment.cpp
View file @
37c28d85
...
...
@@ -477,11 +477,17 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
return
;
}
int
pos
=
kaa
.
minimize
(
kmer
,
DEFAULT_MINIMIZE_ONE_MARGIN
,
DEFAULT_MINIMIZE_WIDTH
);
if
(
pos
==
NO_MINIMIZING_POSITION
)
{
because
=
UNSEG_TOO_SHORT_FOR_WINDOW
;
return
;
}
segmented
=
true
;
because
=
reversed
?
SEG_MINUS
:
SEG_PLUS
;
int
pos
=
sequence
.
size
()
/
2
;
info
=
"="
+
string_of_int
(
c
)
+
" @"
+
string_of_int
(
pos
)
;
// getJunction() will be centered on pos
...
...
algo/core/segment.h
View file @
37c28d85
...
...
@@ -46,6 +46,14 @@
be overriden by the command line by
providing a shorter -w*/
#define DEFAULT_MINIMIZE_WIDTH 12
/* Number of nucleotides on which the minimization integer is computed */
#define DEFAULT_MINIMIZE_ONE_MARGIN 20
/* Number of nucleotides on ends of the reads excluded from the minimization in SEG_METHOD_ONE.
- A read smaller than (DEFAULT_MINIMIZE_ONE_MARGIN*2 + max(k, DEFAULT_MINIMIZE_WIDTH)), that is 52,
will trigger UNSEG_TOO_SHORT_FOR_WINDOW.
- Margins above the half of the window length may produce shortened or shifted windows
*/
using
namespace
std
;
using
json
=
nlohmann
::
json
;
...
...
algo/core/tools.cpp
View file @
37c28d85
...
...
@@ -137,6 +137,17 @@ int dna_to_int(const string &word, int size) {
return
index_word
;
}
uint64_t
dna_to_hash
(
const
string
&
word
,
int
size
)
{
// djb2-xor hash
// no collision on 12-char strings on "ACGT" (nor on 8-char strings on "ACGTacgt")
// min/max for 12-char strings on "AGCT" are "AAGAACCAACCC"/"TGAACCAACCAA"
uint64_t
hash
=
5381
;
for
(
int
i
=
0
;
i
<
size
;
i
++
){
hash
=
((
hash
<<
5
)
+
hash
)
^
word
[
i
];
}
return
hash
;
}
string
nuc_to_aa
(
const
string
&
word
)
{
string
aa
;
int
index_word
=
0
;
...
...
algo/core/tools.h
View file @
37c28d85
...
...
@@ -136,9 +136,10 @@ inline int nuc_to_int(char nuc) {
}
/**
* Convert size nucleotides from a DNA string to an integer.
* Convert size nucleotides from a DNA string to an integer
or to an hash
.
*/
int
dna_to_int
(
const
string
&
,
int
size
);
uint64_t
dna_to_hash
(
const
string
&
,
int
size
);
#define GENETIC_CODE \
"KNKN" "TTTT" "RSRS" "IIMI" \
...
...
algo/tests/data/cd-19-trimmed.fa
0 → 100644
View file @
37c28d85
>cd19-1-100
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
# Shortened reads
>cd19-1-95
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGG
>cd19-1-90
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGA
>cd19-1-85
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTA
>cd19-1-80
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGT
>cd19-1-75
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGA
>cd19-1-70
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
>cd19-1-65
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCC
>cd19-1-60
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGA
>cd19-1-55
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAG
>cd19-1-50
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGC
>cd19-6-100
CCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-11-100
GTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-16-100
CCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-21-100
AGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-26-100
CCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-31-100
GTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-36-100
TGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-41-100
AGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-46-100
AGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-51-100
TGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGGGTCTG
>cd19-6-95
CCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGAGACGG
>cd19-11-90
GTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTAATGGA
>cd19-16-85
CCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGTGGGTA
>cd19-21-80
AGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGATATGT
>cd19-26-75
CCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCCAGAGA
algo/tests/data/cd-4-19.fa
View file @
37c28d85
...
...
@@ -2,15 +2,16 @@
>read-CD4-exact-1
GTAGTAGCCCCTCAGTGCAATGTAGGAGTCCAAGGGGTAAAAACATACAGGGGGGGAAGACCCTCTCCGT
GTCTCAGCTGGAGCTCCAGGATAGTGGCACCTGGACATGCACTGTCTTGCAGAACCAGAAGAAGGTGGAG
>read-CD4-exact-
2
GTAGTAG
CCCCTCAGTGCAATGTAGGAGTCCAAGGGGTAAAAACATACAGGGGGGGAAGACCCTCTCCGT
>read-CD4-exact-
trimmed
CCCCTCAGTGCAATGTAGGAGTCCAAGGGGTAAAAACATACAGGGGGGGAAGACCCTCTCCGT
GTCTCAGCTGGAGCTCCAGGATAGTGGCACCTGGACATGCACTGTCTTGCAGAACCAGAAGAAGGTGGAG
>read-CD19-exact-1
CTGGACCCATGTGCACCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
AGAGATATGTGGGTAATGGAGACGGGTCTGTTGTTGCCCCGGGCCACAGCTCAAGACGCTGGAAAGTATT
>read-CD19-exact-1
C
TGGACCCATGTGCAC
CCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
>read-CD19-exact-1
-trimmed
CCCCAAGGGGCCTAAGTCATTGCTGAGCCTAGAGCTGAAGGACGATCGCCCGGCC
AGAGATATGTGGGTAATGGAGACGGGTCTGTTGTTGCCCCGGGCCACAGCTCAAGACGCTGGAAAGTATT
...
...
algo/tests/should-get-tests/cd-trimmed.should-get
0 → 100644
View file @
37c28d85
!LAUNCH: $VIDJIL_DIR/$EXEC -a -g $VIDJIL_DIR/germline/homo-sapiens-cd.g -A $VIDJIL_DATA/cd-19-trimmed.fa
$ Load CD-sorting.fa
1:homo-sapiens/CD-sorting.fa .* 28 sequences
$ KmerSegmenter, do not map reads < 52bp
1: UNSEG too short w .* 3 .* 50.0
$ KmerSegmenter, map reads and gather some reads, including a large clone
1: found 9 50-windows in 23 reads
1:Clone .* 13 reads
$ FineSegmenter, find 9 clones with CD19
9:clone-.* CD19 .* SEG
algo/tests/should-get-tests/cd.should-get
View file @
37c28d85
!LAUNCH: $VIDJIL_DIR/$EXEC -
g $VIDJIL_DIR/germline/homo-sapiens-cd.g -A $VIDJIL_DATA/cd-4-19.fa
!LAUNCH: $VIDJIL_DIR/$EXEC -
K -g $VIDJIL_DIR/germline/homo-sapiens-cd.g -A $VIDJIL_DATA/cd-4-19.fa ; grep 'seed' out/cd-4-19.affects
$ Load CD-sorting.fa
1:homo-sapiens/CD-sorting.fa .* 28 sequences
...
...
@@ -6,6 +6,13 @@ $ Load CD-sorting.fa
$ KmerSegmenter, map reads
1: found 6 50-windows in 8 reads
$ KmerSegmenter, cluster lightly trimmed or mutated reads with original ones
2:Clone .* 2 reads
$ KmerSegmenter, the above clusterisation comes from coherent minimizing positions
1:read-CD4-exact-1 .* @85
1:read-CD4-exact-trimmed .* @78
$ FineSegmenter, find 3 clones with CD4 and 3 with CD19
3:clone-00.* CD4 .* SEG
3:clone-00.* CD19 .* SEG
...
...
algo/tests/unit-tests/testAffectAnalyser.cpp
View file @
37c28d85
...
...
@@ -23,6 +23,14 @@ void testAffectAnalyser1() {
TAP_TEST_EQUAL
(
ckaa
.
getAllowedOverlap
(),
k
-
1
,
TEST_COUNT_AA_GET_OVERLAP
,
""
);
TAP_TEST_EQUAL
(
ckaa
.
getSequence
(),
"AAAACCCCCGGGGG"
,
TEST_AA_GET_SEQUENCE
,
"actual: "
<<
ckaa
.
getSequence
());
int
shift
=
(
kaa
.
getSequence
().
length
()
-
kaa
.
getAllAffectations
(
AO_NONE
).
size
()
+
1
)
/
2
;
TAP_TEST_EQUAL
(
kaa
.
minimize
(
KmerAffect
::
getAmbiguous
(),
0
,
4
),
0
+
shift
,
TEST_AA_MINIMIZE
,
""
);
// first k-mer, AAAA is ambiguous
TAP_TEST_EQUAL
(
kaa
.
minimize
(
KmerAffect
::
getAmbiguous
(),
1
,
4
),
NO_MINIMIZING_POSITION
,
TEST_AA_MINIMIZE
,
""
);
// too large margin (left side)
TAP_TEST_EQUAL
(
kaa
.
minimize
(
KmerAffect
(
"A"
,
1
,
k
),
0
,
3
),
NO_MINIMIZING_POSITION
,
TEST_AA_MINIMIZE
,
""
);
// no non-ambiguous AAAA
TAP_TEST_EQUAL
(
kaa
.
minimize
(
KmerAffect
(
"C"
,
1
,
k
),
3
,
4
),
4
+
shift
,
TEST_AA_MINIMIZE
,
""
);
// margin = 3, does not affect C
TAP_TEST_EQUAL
(
kaa
.
minimize
(
KmerAffect
(
"C"
,
1
,
k
),
5
,
4
),
5
+
shift
,
TEST_AA_MINIMIZE
,
""
);
// margin = 5, second k-mer C exactly fits between both margins
TAP_TEST_EQUAL
(
kaa
.
minimize
(
KmerAffect
(
"G"
,
1
,
k
),
5
,
4
),
NO_MINIMIZING_POSITION
,
TEST_AA_MINIMIZE
,
""
);
// too large margin (right side)
for
(
int
i
=
2
;
i
<
nb_seq
-
1
;
i
++
)
{
// i starts at 2 because AAAA is not found: there is an ambiguity with
// AAAA coming from AAAACAAAACAAAAC or AAAAAAAAAAAAAAA
...
...
algo/tests/unit-tests/testTools.cpp
View file @
37c28d85
...
...
@@ -280,6 +280,9 @@ void testDNAToInt() {
TAP_TEST_EQUAL
(
dna_to_int
(
"TTTT"
,
4
),
255
,
TEST_DNA_TO_INT
,
""
);
}
void
testDNAToHash
()
{
TAP_TEST_EQUAL
(
dna_to_hash
(
"ACGT"
,
4
),
6383640340
,
TEST_DNA_TO_HASH
,
""
);
}
void
testNucToAA
()
{
cout
<<
GENETIC_CODE
<<
endl
;
...
...
@@ -482,6 +485,7 @@ void testTools() {
testCreateSequence
();
testNucToInt
();
testDNAToInt
();
testDNAToHash
();
testNucToAA
();
testRevcompInt
();
testExtendedNucleotides
();
...
...
algo/tests/unit-tests/tests.h
View file @
37c28d85
...
...
@@ -30,6 +30,7 @@ enum {
TEST_SEQUENCE_OUT
,
TEST_NUC_TO_INT
,
TEST_DNA_TO_INT
,
TEST_DNA_TO_HASH
,
TEST_NUC_TO_AA
,
TEST_REVCOMP_INT
,
TEST_EXTENDED_NUCL
,
...
...
@@ -100,6 +101,7 @@ enum {
TEST_COUNT_AA_MAX
,
TEST_COUNT_AA_MAX12
,
TEST_AA_SORT_LEFT_RIGHT
,
TEST_AA_MINIMIZE
,
/* Cluster */
TEST_CLUSTER
,
...
...
@@ -210,6 +212,7 @@ inline void declare_tests() {
RECORD_TAP_TEST
(
TEST_SEQUENCE_OUT
,
"Test operator<< for Sequence"
);
RECORD_TAP_TEST
(
TEST_NUC_TO_INT
,
"nuc_to_int()"
);
RECORD_TAP_TEST
(
TEST_DNA_TO_INT
,
"dna_to_int()"
);
RECORD_TAP_TEST
(
TEST_DNA_TO_HASH
,
"dna_to_hash()"
);
RECORD_TAP_TEST
(
TEST_REVCOMP_INT
,
"revcomp_int()"
);
RECORD_TAP_TEST
(
TEST_EXTRACT_BASENAME
,
"extractBasename()"
);
RECORD_TAP_TEST
(
TEST_N_CHOOSE_K
,
"test nChooseK()"
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment