Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
vidjil
vidjil
Commits
c15efbd6
Commit
c15efbd6
authored
Apr 08, 2015
by
Marc Duez
Browse files
Merge branch 'master' of
git+ssh://scm.gforge.inria.fr//gitroot/vidjil/vidjil
parents
dd249f3b
40c08830
Changes
13
Hide whitespace changes
Inline
Side-by-side
algo/core/fasta.cpp
View file @
c15efbd6
...
...
@@ -31,6 +31,14 @@
#include
"../lib/gzstream.h"
// http://stackoverflow.com/a/5840160/4475279
unsigned
long
long
filesize
(
const
char
*
filename
)
{
std
::
ifstream
in
(
filename
,
std
::
ifstream
::
ate
|
std
::
ifstream
::
binary
);
return
in
.
tellg
();
}
void
Fasta
::
init
(
int
extract_field
,
string
extract_separator
)
{
this
->
extract_field
=
extract_field
;
...
...
@@ -133,11 +141,16 @@ OnlineFasta::~OnlineFasta() {
}
void
OnlineFasta
::
init
()
{
char_nb
=
0
;
line_nb
=
0
;
line
=
getInterestingLine
();
current
.
seq
=
NULL
;
}
unsigned
long
long
OnlineFasta
::
getPos
()
{
return
char_nb
;
}
size_t
OnlineFasta
::
getLineNb
()
{
return
line_nb
;
}
...
...
@@ -226,6 +239,7 @@ string OnlineFasta::getInterestingLine(int state) {
string
line
;
while
(
line
.
length
()
==
0
&&
hasNext
()
&&
getline
(
*
input
,
line
))
{
line_nb
++
;
char_nb
+=
line
.
length
()
+
1
;
remove_trailing_whitespaces
(
line
);
if
(
line
.
length
()
&&
line
[
0
]
==
'#'
&&
state
!=
FASTX_FASTQ_SEP
)
...
...
@@ -274,8 +288,11 @@ ostream &operator<<(ostream &out, const Sequence &seq) {
return
out
;
}
int
nb_sequences_in_fasta
(
string
f
)
int
nb_sequences_in_fasta
(
string
f
,
bool
approx
)
{
if
(
approx
)
return
approx_nb_sequences_in_fasta
(
f
);
OnlineFasta
*
sequences
=
new
OnlineFasta
(
f
,
1
,
" "
);
int
nb_sequences
=
0
;
...
...
@@ -288,3 +305,33 @@ int nb_sequences_in_fasta(string f)
cout
<<
" ==> "
<<
nb_sequences
<<
" sequences"
<<
endl
;
return
nb_sequences
;
}
#define SAMPLE_APPROX_NB_SEQUENCES 200
int
approx_nb_sequences_in_fasta
(
string
f
)
{
OnlineFasta
*
sequences
=
new
OnlineFasta
(
f
,
1
,
" "
);
int
nb_sequences
=
0
;
while
(
nb_sequences
<
SAMPLE_APPROX_NB_SEQUENCES
&&
sequences
->
hasNext
())
{
sequences
->
next
();
nb_sequences
++
;
}
cout
<<
" ==> "
;
if
(
sequences
->
hasNext
())
{
cout
<<
"approx. "
;
float
ratio
=
(
float
)
filesize
(
f
.
c_str
())
/
(
float
)
sequences
->
getPos
();
nb_sequences
=
(
int
)
(
ratio
*
nb_sequences
);
}
cout
<<
nb_sequences
<<
" sequences"
<<
endl
;
delete
sequences
;
return
nb_sequences
;
}
algo/core/fasta.h
View file @
c15efbd6
...
...
@@ -27,6 +27,8 @@ typedef enum {
#include
"tools.h"
unsigned
long
long
filesize
(
const
char
*
filename
);
class
Fasta
{
void
init
(
int
extract_field
,
string
extract_separator
);
...
...
@@ -91,6 +93,7 @@ class OnlineFasta {
string
line
;
bool
input_allocated
;
size_t
line_nb
;
unsigned
long
long
char_nb
;
public:
...
...
@@ -113,7 +116,12 @@ class OnlineFasta {
int
extract_field
=
0
,
string
extract_separator
=
"|"
);
~
OnlineFasta
();
/**
* @return the position in the file
*/
unsigned
long
long
getPos
();
/**
* @return the current line number
*/
...
...
@@ -166,6 +174,7 @@ ostream &operator<<(ostream &out, const Sequence &seq);
* Count the number of sequences in a Fasta file
* @return the number of sequences
*/
int
nb_sequences_in_fasta
(
string
f
);
int
nb_sequences_in_fasta
(
string
f
,
bool
approx
=
false
);
int
approx_nb_sequences_in_fasta
(
string
f
);
#endif
algo/core/segment.cpp
View file @
c15efbd6
...
...
@@ -79,6 +79,10 @@ bool Segmenter::isDSegmented() const {
return
dSegmented
;
}
bool
KmerSegmenter
::
isDetected
()
const
{
return
detected
;
}
// Chevauchement
string
Segmenter
::
removeChevauchement
()
...
...
@@ -241,9 +245,18 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
CountKmerAffectAnalyser
ckaa
(
*
(
germline
->
index
),
sequence
);
pair
<
KmerAffect
,
KmerAffect
>
max12
=
ckaa
.
sortLeftRight
(
ckaa
.
max12
(
forbidden
));
if
(
max12
.
first
.
isUnknown
()
||
max12
.
second
.
isUnknown
())
{
because
=
UNSEG_TOO_FEW_ZERO
;
return
;
}
strand
=
nb_strand
[
0
]
>
nb_strand
[
1
]
?
-
1
:
1
;
computeSegmentation
(
strand
,
max12
.
first
,
max12
.
second
);
if
(
!
detected
)
because
=
UNSEG_TOO_FEW_ZERO
;
// The pseudo-germline should never take precedence over the regular germlines
score
=
1
;
}
...
...
@@ -313,7 +326,8 @@ KmerSegmenter::~KmerSegmenter() {
delete
kaa
;
}
KmerMultiSegmenter
::
KmerMultiSegmenter
(
Sequence
seq
,
MultiGermline
*
multigermline
,
ostream
*
out_unsegmented
,
double
threshold
)
KmerMultiSegmenter
::
KmerMultiSegmenter
(
Sequence
seq
,
MultiGermline
*
multigermline
,
ostream
*
out_unsegmented
,
double
threshold
,
int
nb_reads_for_evalue
)
{
int
best_score_seg
=
0
;
// Best score, segmented sequences
int
best_score_unseg
=
0
;
// Best score, unsegmented sequences
...
...
@@ -383,9 +397,9 @@ KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermlin
if
(
threshold_nb_expected
>
NO_LIMIT_VALUE
)
if
(
the_kseg
->
isSegmented
())
{
// the_kseg->evalue also depends on the number of germlines from the *Multi*KmerSegmenter
the_kseg
->
evalue
=
getNbExpected
();
the_kseg
->
evalue
=
getNbExpected
(
nb_reads_for_evalue
);
pair
<
double
,
double
>
p
=
getNbExpectedLeftRight
();
pair
<
double
,
double
>
p
=
getNbExpectedLeftRight
(
nb_reads_for_evalue
);
the_kseg
->
evalue_left
=
p
.
first
;
the_kseg
->
evalue_right
=
p
.
second
;
...
...
@@ -401,14 +415,14 @@ KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermlin
}
}
double
KmerMultiSegmenter
::
getNbExpected
()
const
{
pair
<
double
,
double
>
p
=
getNbExpectedLeftRight
();
double
KmerMultiSegmenter
::
getNbExpected
(
int
multiplier
)
const
{
pair
<
double
,
double
>
p
=
getNbExpectedLeftRight
(
multiplier
);
return
(
p
.
first
+
p
.
second
);
}
pair
<
double
,
double
>
KmerMultiSegmenter
::
getNbExpectedLeftRight
()
const
{
pair
<
double
,
double
>
KmerMultiSegmenter
::
getNbExpectedLeftRight
(
int
multiplier
)
const
{
pair
<
double
,
double
>
p
=
the_kseg
->
getKmerAffectAnalyser
()
->
getLeftRightProbabilityAtLeastOrAbove
();
return
pair
<
double
,
double
>
(
p
.
first
*
multi_germline
->
germlines
.
size
(),
p
.
second
*
multi_germline
->
germlines
.
size
());
return
pair
<
double
,
double
>
(
p
.
first
*
multiplier
*
multi_germline
->
germlines
.
size
(),
p
.
second
*
multiplier
*
multi_germline
->
germlines
.
size
());
}
KmerMultiSegmenter
::~
KmerMultiSegmenter
()
{
...
...
@@ -425,12 +439,10 @@ void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffec
max
=
kaa
->
getMaximum
(
before
,
after
);
// We labeled it detected if there were both enough affect_5 and enough affect_3
bool
detected
=
(
max
.
nb_before_left
+
max
.
nb_before_right
>=
DETECT_THRESHOLD
)
detected
=
(
max
.
nb_before_left
+
max
.
nb_before_right
>=
DETECT_THRESHOLD
)
&&
(
max
.
nb_after_left
+
max
.
nb_after_right
>=
DETECT_THRESHOLD
);
if
(
!
max
.
max_found
)
{
// ----------------------------
// This code is only here to set the UNSEG cause. It could be streamlined.
if
((
strand
==
1
&&
max
.
nb_before_left
==
0
)
||
(
strand
==
-
1
&&
max
.
nb_after_right
==
0
))
because
=
detected
?
UNSEG_AMBIGUOUS
:
UNSEG_TOO_FEW_V
;
...
...
@@ -440,7 +452,6 @@ void KmerSegmenter::computeSegmentation(int strand, KmerAffect before, KmerAffec
because
=
detected
?
UNSEG_AMBIGUOUS
:
UNSEG_TOO_FEW_J
;
}
else
because
=
UNSEG_AMBIGUOUS
;
// ----------------------------
}
else
{
Vend
=
max
.
first_pos_max
;
Jstart
=
max
.
last_pos_max
+
1
;
...
...
algo/core/segment.h
View file @
c15efbd6
...
...
@@ -169,6 +169,7 @@ class KmerSegmenter : public Segmenter
string
affects
;
public:
bool
isDetected
()
const
;
int
score
;
KmerSegmenter
();
...
...
@@ -206,18 +207,18 @@ class KmerMultiSegmenter
* @param threshold: threshold of randomly expected segmentation
*/
KmerMultiSegmenter
(
Sequence
seq
,
MultiGermline
*
multigermline
,
ostream
*
out_unsegmented
,
double
threshold
=
THRESHOLD_NB_EXPECTED
);
double
threshold
=
THRESHOLD_NB_EXPECTED
,
int
nb_reads_for_evalue
=
1
);
/**
* @return expected number of Segmenter that would have yield the maximum score by chance
*/
double
getNbExpected
()
const
;
double
getNbExpected
(
int
multiplier
)
const
;
/**
* @return expected number of Segmenter that would have yield the maximum score by chance
* on the left part of the read and on the right part of the read respectively.
*/
pair
<
double
,
double
>
getNbExpectedLeftRight
()
const
;
pair
<
double
,
double
>
getNbExpectedLeftRight
(
int
multiplier
)
const
;
~
KmerMultiSegmenter
();
...
...
algo/core/windowExtractor.cpp
View file @
c15efbd6
...
...
@@ -11,7 +11,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
size_t
w
,
map
<
string
,
string
>
&
windows_labels
,
int
stop_after
,
int
only_nth_read
,
bool
keep_unsegmented_as_clone
,
double
nb_expected
)
{
double
nb_expected
,
int
nb_reads_for_evalue
)
{
init_stats
();
WindowsStorage
*
windowsStorage
=
new
WindowsStorage
(
windows_labels
);
...
...
@@ -38,7 +38,7 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
*
out_affects
<<
reads
->
getSequence
();
}
KmerMultiSegmenter
kmseg
(
reads
->
getSequence
(),
multigermline
,
out_affects
,
nb_expected
);
KmerMultiSegmenter
kmseg
(
reads
->
getSequence
(),
multigermline
,
out_affects
,
nb_expected
,
nb_reads_for_evalue
);
KmerSegmenter
*
seg
=
kmseg
.
the_kseg
;
int
read_length
=
seg
->
getSequence
().
sequence
.
length
();
...
...
algo/core/windowExtractor.h
View file @
c15efbd6
...
...
@@ -40,6 +40,7 @@ class WindowExtractor {
* @param w: length of the window
* @param windows_labels: Windows that must be kept and registered as such.
* @param nb_expected: maximal e-value of the segmentation
* @param nb_reads_for_evalue: number of reads, used for e-value computation. Can be approximate or faked.
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
...
...
@@ -49,7 +50,7 @@ class WindowExtractor {
size_t
w
,
map
<
string
,
string
>
&
windows_labels
,
int
stop_after
=-
1
,
int
only_nth_reads
=
1
,
bool
keep_unsegmented_as_clone
=
false
,
double
nb_expected
=
THRESHOLD_NB_EXPECTED
);
double
nb_expected
=
THRESHOLD_NB_EXPECTED
,
int
nb_reads_for_evalue
=
1
);
/**
* @return the average length of sequences whose segmentation has been classified as seg
...
...
algo/tests/testTools.cpp
View file @
c15efbd6
...
...
@@ -26,6 +26,18 @@ void testOnlineFasta1() {
TAP_TEST
(
nb_seq
==
5
,
TEST_O_FASTA_HAS_NEXT
,
""
);
}
void
testFastaNbSequences
()
{
TAP_TEST
(
nb_sequences_in_fasta
(
"../../germline/IGHV.fa"
)
==
344
,
TEST_FASTA_NB_SEQUENCES
,
"ccc"
);
int
a1
=
approx_nb_sequences_in_fasta
(
"../../germline/IGHV.fa"
);
TAP_TEST
(
a1
>=
340
&&
a1
<=
348
,
TEST_FASTA_NB_SEQUENCES
,
""
);
int
a2
=
nb_sequences_in_fasta
(
"../../data/Stanford_S22.fasta"
,
true
);
TAP_TEST
(
a2
>=
13100
&&
a2
<=
13200
,
TEST_FASTA_NB_SEQUENCES
,
""
);
}
void
testFasta1
()
{
Fasta
fa
(
"../../data/test1.fa"
);
Fasta
fq
(
"../../data/test1.fq"
);
...
...
@@ -284,6 +296,7 @@ void testNChooseK() {
void
testTools
()
{
testOnlineFasta1
();
testFastaNbSequences
();
testFasta1
();
testFastaAdd
();
testFastaAddThrows
();
...
...
algo/tests/tests.h
View file @
c15efbd6
...
...
@@ -14,6 +14,7 @@ enum {
TEST_FASTA_ADD
,
TEST_FASTA_INVALID_FILE
,
TEST_FASTA_OUT
,
TEST_FASTA_NB_SEQUENCES
,
TEST_CREATE_SEQUENCE_LABEL_FULL
,
TEST_CREATE_SEQUENCE_LABEL
,
TEST_CREATE_SEQUENCE_SEQUENCE
,
...
...
@@ -167,6 +168,7 @@ inline void declare_tests() {
RECORD_TAP_TEST
(
TEST_FASTA_ADD
,
"Fasta add() method"
);
RECORD_TAP_TEST
(
TEST_FASTA_INVALID_FILE
,
"Fasta with invalid file"
);
RECORD_TAP_TEST
(
TEST_FASTA_OUT
,
"Test operator<< with Fasta"
);
RECORD_TAP_TEST
(
TEST_FASTA_NB_SEQUENCES
,
"Nb sequences in Fasta"
);
RECORD_TAP_TEST
(
TEST_CREATE_SEQUENCE_LABEL_FULL
,
"create_sequence: label_full field"
);
RECORD_TAP_TEST
(
TEST_CREATE_SEQUENCE_LABEL
,
"create_sequence: label field"
);
RECORD_TAP_TEST
(
TEST_CREATE_SEQUENCE_SEQUENCE
,
"create_sequence: sequence field"
);
...
...
algo/tests/tools.cpp
deleted
100644 → 0
View file @
dd249f3b
#include
<core/tools.h>
#include
<iostream>
#include
<cassert>
#include
<core/fasta.h>
#include
<core/kmerstore.h>
#include
<core/kmeraffect.h>
#include
<core/affectanalyser.h>
#include
<core/segment.h>
#include
"tests.h"
using
namespace
std
;
const
string
seq
[]
=
{
"CCCCCCCCCCCCCCCCCCCC"
,
"C lots of"
,
"GGGGGGGGGGGGGGGGGGGG"
,
"G lots of"
,
"AAAAAAAAAAAAAAAAAAAA"
,
"A lots of"
,
"TTTTTTTTTTTTTTTTTTTT"
,
"T lots of"
,
"TTTTATTTTATTTTATTTTA"
,
"U : TTTA lots of"
,
"AAAACAAAACAAAACAAAAC"
,
"V : AAAC lots of"
};
const
int
nb_seq
=
6
;
template
<
typename
Index
>
Index
*
createIndex
(
int
k
,
bool
revcomp
)
{
Index
*
index
=
new
Index
(
k
,
revcomp
);
for
(
int
i
=
0
;
i
<
nb_seq
;
i
++
)
index
->
insert
(
seq
[
2
*
i
],
seq
[
2
*
i
+
1
]);
return
index
;
}
void
testFasta1
()
{
Fasta
fa
(
"data/test1.fa"
);
Fasta
fq
(
"data/test1.fq"
);
TAP_TEST
(
fa
.
size
()
==
fq
.
size
(),
TEST_FASTA_SIZE
,
""
);
for
(
int
i
=
0
;
i
<
fa
.
size
();
i
++
)
{
TAP_TEST
(
fa
.
label
(
i
)
==
fq
.
label
(
i
),
TEST_FASTA_LABEL
,
""
);
TAP_TEST
(
fa
.
label_full
(
i
)
==
fq
.
label_full
(
i
),
TEST_FASTA_LABEL_FULL
,
""
);
TAP_TEST
(
fa
.
sequence
(
i
)
==
fq
.
sequence
(
i
),
TEST_FASTA_SEQUENCE
,
""
);
}
}
template
<
template
<
class
>
class
T
>
void
testKmerStoreWithKmer
(
int
k
,
bool
revcomp
,
int
test_id
)
{
T
<
Kmer
>
*
index
=
createIndex
<
T
<
Kmer
>
>
(
k
,
revcomp
);
for
(
int
i
=
0
;
i
<
nb_seq
-
2
;
i
++
)
{
string
tmp
=
seq
[
2
*
i
].
substr
(
0
,
k
);
if
(
revcomp
)
{
TAP_TEST
((
*
index
)[
tmp
].
count
==
(
seq
[
2
*
i
].
length
()
-
k
+
1
)
*
2
,
test_id
,
""
);
}
else
{
TAP_TEST
((
*
index
)[
tmp
].
count
==
(
seq
[
2
*
i
].
length
()
-
k
+
1
),
test_id
,
""
);
}
}
delete
index
;
}
template
<
template
<
class
>
class
T
,
class
KAffect
>
void
testAffectAnalyser1
()
{
const
int
k
=
4
;
const
bool
revcomp
=
false
;
T
<
KAffect
>
*
index
=
createIndex
<
T
<
KAffect
>
>
(
k
,
revcomp
);
KmerAffectAnalyser
<
KAffect
>
kaa
(
*
index
,
"AAAACCCCCGGGGG"
);
for
(
int
i
=
2
;
i
<
nb_seq
-
1
;
i
++
)
{
KAffect
current_affect
(
""
,
seq
[
2
*
i
+
1
],
1
);
TAP_TEST
(
kaa
.
count
(
current_affect
)
==
0
,
TEST_AA_COUNT
,
""
);
TAP_TEST
(
kaa
.
first
(
current_affect
)
==
(
int
)
string
::
npos
,
TEST_AA_FIRST
,
""
);
TAP_TEST
(
kaa
.
last
(
current_affect
)
==
(
int
)
string
::
npos
,
TEST_AA_LAST
,
""
);
}
for
(
int
i
=
0
;
i
<
2
;
i
++
)
{
KAffect
current_affect
(
""
,
seq
[
2
*
i
+
1
],
1
);
TAP_TEST
(
kaa
.
count
(
current_affect
)
==
2
,
TEST_AA_COUNT
,
""
);
TAP_TEST
(
kaa
.
getAffectation
(
kaa
.
first
(
current_affect
))
==
current_affect
,
TEST_AA_GET_AFFECT
,
""
);
TAP_TEST
(
kaa
.
getAffectation
(
kaa
.
last
(
current_affect
))
==
current_affect
,
TEST_AA_GET_AFFECT
,
""
);
}
TAP_TEST
(
kaa
.
count
(
KAffect
(
""
,
seq
[
2
*
(
nb_seq
-
1
)
+
1
],
1
))
==
1
,
TEST_AA_COUNT
,
""
);
TAP_TEST
((
kaa
.
first
(
KAffect
(
""
,
seq
[
2
*
(
nb_seq
-
1
)
+
1
],
1
))
==
kaa
.
last
(
KAffect
(
""
,
seq
[
2
*
(
nb_seq
-
1
)
+
1
],
1
)))
==
1
,
TEST_AA_FIRST
,
""
);
TAP_TEST
(
kaa
.
getAffectation
(
3
).
isUnknown
(),
TEST_AA_PREDICATES
,
""
);
TAP_TEST
(
kaa
.
getAffectation
(
8
).
isUnknown
(),
TEST_AA_PREDICATES
,
""
);
TAP_TEST
(
kaa
.
getAffectation
(
0
).
isAmbiguous
(),
TEST_AA_PREDICATES
,
""
);
TAP_TEST
(
kaa
.
getDistinctAffectations
().
size
()
==
5
,
TEST_AA_GET_DISTINCT_AFFECT
,
""
);
delete
index
;
}
template
<
template
<
class
>
class
T
,
class
KAffect
>
void
testAffectAnalyser2
()
{
const
int
k
=
5
;
const
bool
revcomp
=
true
;
T
<
KAffect
>
*
index
=
createIndex
<
T
<
KAffect
>
>
(
k
,
revcomp
);
KmerAffectAnalyser
<
KAffect
>
kaa
(
*
index
,
"TTTTTGGGGG"
);
TAP_TEST
(
kaa
.
getAffectation
(
1
)
==
KAffect
(
""
,
seq
[
2
*
(
nb_seq
-
1
)
+
1
],
-
1
),
TEST_AA_GET_AFFECT
,
""
);
TAP_TEST
(
kaa
.
count
(
kaa
.
getAffectation
(
1
))
==
1
,
TEST_AA_GET_AFFECT
,
""
);
TAP_TEST
(
kaa
.
getAffectation
(
0
)
==
kaa
.
getAffectation
(
10
-
k
),
TEST_AA_GET_AFFECT
,
""
);
for
(
int
i
=
2
;
i
<
10
-
k
;
i
++
)
TAP_TEST
(
kaa
.
getAffectation
(
i
).
isUnknown
(),
TEST_AA_PREDICATES
,
""
);
TAP_TEST
(
kaa
.
getDistinctAffectations
().
size
()
==
3
,
TEST_AA_GET_DISTINCT_AFFECT
,
""
);
delete
index
;
}
void
testSegmentationBug1
(
int
delta_min
,
int
delta_max
)
{
string
buggy_sequences
=
"bugs/kmersegment.fa"
;
int
k
=
14
;
bool
rc
=
true
;
Fasta
seqV
(
"data/Repertoire/TRGV.fa"
);
Fasta
seqJ
(
"data/Repertoire/TRGJ.fa"
);
IKmerStore
<
KmerAffect
>
*
index
=
new
ArrayKmerStore
<
KmerAffect
>
(
k
,
rc
);
index
->
insert
(
seqV
,
"V"
);
index
->
insert
(
seqJ
,
"J"
);
OnlineFasta
input
(
buggy_sequences
);
while
(
input
.
hasNext
())
{
input
.
next
();
KmerAffectAnalyser
<
KmerAffect
>
*
kaa
=
new
KmerAffectAnalyser
<
KmerAffect
>
(
*
index
,
input
.
getSequence
().
sequence
);
set
<
KmerAffect
>
distinct_a
=
kaa
->
getDistinctAffectations
();
int
strand
=
0
;
for
(
set
<
KmerAffect
>::
iterator
it
=
distinct_a
.
begin
();
it
!=
distinct_a
.
end
()
&&
strand
!=
2
;
it
++
)
{
if
(
!
it
->
isAmbiguous
()
&&
!
it
->
isUnknown
())
{
if
(
strand
==
0
)
strand
=
affect_strand
(
it
->
affect
);
else
if
((
strand
==
1
&&
affect_strand
(
it
->
affect
)
==
-
1
)
||
(
strand
==
-
1
&&
affect_strand
(
it
->
affect
)
==
1
))
strand
=
2
;
}
}
int
stats
[
STATS_SIZE
];
Segmenter
*
segment
=
new
KmerSegmenter
(
input
.
getSequence
(),
index
,
delta_min
,
delta_max
,
stats
);
if
(
strand
==
2
||
(
strand
==
1
&&
(
kaa
->
last
(
AFFECT_V
)
==
(
int
)
string
::
npos
||
kaa
->
first
(
AFFECT_J
)
==
(
int
)
string
::
npos
))
||
(
strand
==
-
1
&&
(
kaa
->
first
(
AFFECT_V
)
==
(
int
)
string
::
npos
||
kaa
->
last
(
AFFECT_J
)
==
(
int
)
string
::
npos
)))
TAP_TEST
(
!
segment
->
isSegmented
(),
TEST_BUG_SEGMENTATION
,
""
);
delete
segment
;
delete
kaa
;
}
delete
index
;
}
int
main
(
void
)
{
TAP_START
(
NB_TESTS
);
declare_tests
();
TAP_TEST
(
complement
(
"AATCAGactgactagATCGAn"
)
==
"TTAGTCTGACTGATCTAGCTN"
,
TEST_REVCOMP
,
""
);
TAP_TEST
(
revcomp
(
"AATCAGactgactagATCGAn"
)
==
"NTCGATCTAGTCAGTCTGATT"
,
TEST_REVCOMP
,
""
);
TAP_TEST
(
revcomp
(
""
)
==
""
,
TEST_REVCOMP
,
""
);
TAP_TEST
(
revcomp
(
"aaaaaa"
)
==
"TTTTTT"
,
TEST_REVCOMP
,
""
);
testFasta1
();
testKmerStoreWithKmer
<
ArrayKmerStore
>
(
5
,
false
,
TEST_ARRAY_KMERSTORE
);
testKmerStoreWithKmer
<
ArrayKmerStore
>
(
5
,
true
,
TEST_ARRAY_KMERSTORE_RC
);
testKmerStoreWithKmer
<
MapKmerStore
>
(
5
,
false
,
TEST_MAP_KMERSTORE
);
testKmerStoreWithKmer
<
MapKmerStore
>
(
5
,
true
,
TEST_MAP_KMERSTORE_RC
);
testAffectAnalyser1
<
ArrayKmerStore
,
KmerAffect
>
();
testAffectAnalyser2
<
ArrayKmerStore
,
KmerAffect
>
();
testAffectAnalyser1
<
ArrayKmerStore
,
KmerStringAffect
>
();
testAffectAnalyser2
<
ArrayKmerStore
,
KmerStringAffect
>
();
// Bug with one sequence on segmentation
testSegmentationBug1
(
-
10
,
15
);
TAP_END_TEST_EXIT
}
algo/vidjil.cpp
View file @
c15efbd6
...
...
@@ -811,6 +811,10 @@ int main (int argc, char **argv)
cout
<<
"Germlines loaded"
<<
endl
;
cout
<<
*
multigermline
;
cout
<<
endl
;
// Number of reads for e-value computation
int
nb_reads_for_evalue
=
(
expected_value
>
NO_LIMIT_VALUE
)
?
nb_sequences_in_fasta
(
f_reads
,
true
)
:
1
;
//////////////////////////////////
//$$ Read sequence files
...
...
@@ -959,7 +963,8 @@ int main (int argc, char **argv)
we
.
setAffectsOutput
(
out_affects
);
}
WindowsStorage
*
windowsStorage
=
we
.
extract
(
reads
,
multigermline
,
w
,
windows_labels
,
max_reads_processed
,
only_nth_read
,
keep_unsegmented_as_clone
,
expected_value
);
WindowsStorage
*
windowsStorage
=
we
.
extract
(
reads
,
multigermline
,
w
,
windows_labels
,
max_reads_processed
,
only_nth_read
,
keep_unsegmented_as_clone
,
expected_value
,
nb_reads_for_evalue
);
windowsStorage
->
setIdToAll
();
size_t
nb_total_reads
=
we
.
getNbReads
();
...
...
@@ -1255,7 +1260,7 @@ int main (int argc, char **argv)
seg
.
toJsonList
(
&
json_seg
);
// Re-launch also a KmerMultiSegmenter, for control purposes (affectations, evalue)
KmerMultiSegmenter
kmseg
(
seg
.
getSequence
(),
multigermline
,
0
,
expected_value
);
KmerMultiSegmenter
kmseg
(
seg
.
getSequence
(),
multigermline
,
0
,
expected_value
,
nb_reads_for_evalue
);
KmerSegmenter
*
kseg
=
kmseg
.
the_kseg
;
kseg
->
toJsonList
(
&
json_seg
);
if
(
verbose
)
...
...
doc/algo.org
View file @
c15efbd6
...
...
@@ -181,8 +181,10 @@ different clones. Setting =-w= to lower values is not recommended.
The =-e= option sets the maximal e-value accepted for segmenting a sequence.
If a segmentation has a higher e-value, it will not be segmented.
The e-value computation takes into account both the number of reads in the
input sequence and the number of locus searched for.
The default value is 'all', but the recommended value is to use something
like 1
e-6 for datasets with a billion of reads
.
like 1
, 1e-2 or 1
.
Further help on this point will come in next releases.
** Threshold on clone output
...
...
germline/fasta.py
View file @
c15efbd6
...
...
@@ -24,7 +24,13 @@ def parse(fasta, endline=''):
if
header
or
sequence
:
yield
(
header
,
sequence
)
def
extract_field_if_exists
(
str
,
separator
,
field_number
):
fields
=
str
.
split
(
separator
)
if
len
(
fields
)
>
field_number
:
return
fields
[
field_number
]
return
str
def
parse_as_Fasta
(
fasta
):
for
(
header
,
sequence
)
in
parse
(
fasta
):
yield
Fasta
(
header
,
sequence
)
...
...
@@ -38,11 +44,11 @@ class Fasta():
@
property
def
name
(
self
):
return
self
.
header
.
split
(
'|'
)[
1
]
return
extract_field_if_exists
(
self
.
header
,
'|'
,
1
)
@
property
def
species
(
self
):
return
self
.
header
.
split
(
'|'
)[
2
]
return
extract_field_if_exists
(
self
.
header
,
'|'
,
2
)
def
__len__
(
self
):
return
len
(
self
.
seq
)
...
...
germline/generate-recombinations.py
View file @
c15efbd6
...
...
@@ -7,8 +7,8 @@ import random
random
.
seed
(
33328778554
)
def
recombine_VJ
(
seq5
,
remove5
,
N
,
remove3
,
seq3
,
code
):
name
=
"%s %d/%s/%d %s
%s
"
%
(
seq5
.
name
,
remove5
,
N
,
remove3
,
seq3
.
name
,
code
)
def
recombine_VJ
(
seq5
,
remove5
,
N
,
remove3
,
seq3
):
name
=
"%s %d/%s/%d %s"
%
(
seq5
.
name
,
remove5
,
N
,
remove3
,
seq3
.
name
)
seq
=
seq5
.
seq
[:
len
(
seq5
)
-
remove5
]
+
'
\n
'
+
N
+
'
\n
'
+
seq3
.
seq
[
remove3
:]
return
fasta
.
Fasta
(
name
,
seq
)
...
...
@@ -17,7 +17,7 @@ def recombine_VJ(seq5, remove5, N, remove3, seq3, code):
def
random_sequence
(
characters
,
length
):
return
''
.
join
([
random
.
choice
(
characters
)
for
x
in
range
(
length
)])
def
recombine_VJ_with_removes
(
seq5
,
remove5
,
Nlength
,
remove3
,
seq3
,
code
):
def
recombine_VJ_with_removes
(
seq5
,
remove5
,
Nlength
,
remove3
,
seq3
):
'''Recombine V and J with a random N, ensuring that the bases in N are not the same that what was ultimately removed from V or J'''
assert
(
Nlength
>=
1
)
...
...
@@ -29,36 +29,53 @@ def recombine_VJ_with_removes(seq5, remove5, Nlength, remove3, seq3, code):
if
c
in
available
:
available
.
remove
(
c
)
return
recombine_VJ
(
seq5
,
remove5
,
random_sequence
(
available
,
Nlength
),
remove3
,
seq3
,
code
)
return
recombine_VJ
(
seq5
,
remove5
,
random_sequence
(
available
,
Nlength
),
remove3
,
seq3
)