Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
vidjil
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
1,712
Issues
1,712
List
Boards
Labels
Service Desk
Milestones
Merge Requests
87
Merge Requests
87
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
vidjil
vidjil
Commits
ba7c8b9e
Commit
ba7c8b9e
authored
Jun 20, 2017
by
Mikaël Salson
Committed by
Mathieu Giraud
Jul 07, 2017
1
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
algo/: Count number of sequences independently of input format
Use OnlineBioReader rather than a Fasta reader
parent
3d123f2e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
76 additions
and
72 deletions
+76
-72
algo/core/bioreader.cpp
algo/core/bioreader.cpp
+54
-0
algo/core/bioreader.hpp
algo/core/bioreader.hpp
+17
-0
algo/core/fasta.cpp
algo/core/fasta.cpp
+0
-55
algo/core/fasta.h
algo/core/fasta.h
+0
-12
algo/tests/unit-tests/testTools.cpp
algo/tests/unit-tests/testTools.cpp
+3
-3
algo/vidjil.cpp
algo/vidjil.cpp
+2
-2
No files found.
algo/core/bioreader.cpp
View file @
ba7c8b9e
...
...
@@ -237,3 +237,57 @@ OnlineBioReader *OnlineBioReaderFactory::create(const string &filename,
int
nb_sequences_max
,
int
only_nth_sequence
)
{
return
new
OnlineFasta
(
filename
,
extract_field
,
extract_separator
,
nb_sequences_max
,
only_nth_sequence
);
}
// http://stackoverflow.com/a/5840160/4475279
unsigned
long
long
filesize
(
const
char
*
filename
)
{
std
::
ifstream
in
(
filename
,
std
::
ifstream
::
ate
|
std
::
ifstream
::
binary
);
return
in
.
tellg
();
}
unsigned
long
long
nb_sequences_in_file
(
string
f
,
bool
approx
)
{
if
(
approx
)
return
approx_nb_sequences_in_file
(
f
);
OnlineBioReader
*
sequences
=
OnlineBioReaderFactory
::
create
(
f
,
1
,
" "
);
unsigned
long
long
nb_sequences
=
0
;
while
(
sequences
->
hasNext
())
{
sequences
->
next
();
nb_sequences
++
;
}
cout
<<
" ==> "
<<
nb_sequences
<<
" sequences"
<<
endl
;
delete
sequences
;
return
nb_sequences
;
}
unsigned
long
long
approx_nb_sequences_in_file
(
string
f
)
{
OnlineBioReader
*
sequences
=
OnlineBioReaderFactory
::
create
(
f
,
1
,
" "
);
int
nb_sequences
=
0
;
while
(
nb_sequences
<
SAMPLE_APPROX_NB_SEQUENCES
&&
sequences
->
hasNext
())
{
sequences
->
next
();
nb_sequences
++
;
}
cout
<<
" ==> "
;
if
(
sequences
->
hasNext
())
{
cout
<<
"approx. "
;
float
ratio
=
(
float
)
filesize
(
f
.
c_str
())
/
(
float
)
sequences
->
getPos
();
nb_sequences
=
(
unsigned
long
long
)
(
ratio
*
nb_sequences
);
}
cout
<<
nb_sequences
<<
" sequences"
<<
endl
;
delete
sequences
;
return
nb_sequences
;
}
algo/core/bioreader.hpp
View file @
ba7c8b9e
...
...
@@ -28,6 +28,8 @@
#include <vector>
#include <list>
#define SAMPLE_APPROX_NB_SEQUENCES 200
using
namespace
std
;
typedef
string
seqtype
;
...
...
@@ -223,6 +225,21 @@ ostream &operator<<(ostream &out, const Sequence &seq);
const
BioReader
BIOREADER_UNKNOWN
=
BioReader
(
true
,
"_"
);
const
BioReader
BIOREADER_AMBIGUOUS
=
BioReader
(
true
,
"?"
);
unsigned
long
long
filesize
(
const
char
*
filename
);
/**
* Count the number of sequences in a Fasta file.
* @complexity Linear in the file size
* @return the number of sequences
*/
unsigned
long
long
nb_sequences_in_file
(
string
f
,
bool
approx
=
false
);
/**
* Give an approximate count of the number of sequences in the file
* @complexity linear in SAMPLE_APPROX_NB_SEQUENCES
* @return the approximate number of sequences
*/
unsigned
long
long
approx_nb_sequences_in_file
(
string
f
);
class
OnlineBioReaderFactory
{
public:
...
...
algo/core/fasta.cpp
View file @
ba7c8b9e
...
...
@@ -32,13 +32,6 @@
#include "../lib/gzstream.h"
// http://stackoverflow.com/a/5840160/4475279
unsigned
long
long
filesize
(
const
char
*
filename
)
{
std
::
ifstream
in
(
filename
,
std
::
ifstream
::
ate
|
std
::
ifstream
::
binary
);
return
in
.
tellg
();
}
// OnlineFasta
OnlineFasta
::
OnlineFasta
(
int
extract_field
,
string
extract_separator
,
...
...
@@ -175,52 +168,4 @@ void OnlineFasta::unexpectedEOF() {
throw
invalid_argument
(
"Unexpected EOF while reading FASTA/FASTQ file"
);
}
unsigned
long
long
nb_sequences_in_fasta
(
string
f
,
bool
approx
)
{
if
(
approx
)
return
approx_nb_sequences_in_fasta
(
f
);
OnlineFasta
*
sequences
=
new
OnlineFasta
(
f
,
1
,
" "
);
unsigned
long
long
nb_sequences
=
0
;
while
(
sequences
->
hasNext
())
{
sequences
->
next
();
nb_sequences
++
;
}
cout
<<
" ==> "
<<
nb_sequences
<<
" sequences"
<<
endl
;
delete
sequences
;
return
nb_sequences
;
}
#define SAMPLE_APPROX_NB_SEQUENCES 200
unsigned
long
long
approx_nb_sequences_in_fasta
(
string
f
)
{
OnlineFasta
*
sequences
=
new
OnlineFasta
(
f
,
1
,
" "
);
int
nb_sequences
=
0
;
while
(
nb_sequences
<
SAMPLE_APPROX_NB_SEQUENCES
&&
sequences
->
hasNext
())
{
sequences
->
next
();
nb_sequences
++
;
}
cout
<<
" ==> "
;
if
(
sequences
->
hasNext
())
{
cout
<<
"approx. "
;
float
ratio
=
(
float
)
filesize
(
f
.
c_str
())
/
(
float
)
sequences
->
getPos
();
nb_sequences
=
(
unsigned
long
long
)
(
ratio
*
nb_sequences
);
}
cout
<<
nb_sequences
<<
" sequences"
<<
endl
;
delete
sequences
;
return
nb_sequences
;
}
algo/core/fasta.h
View file @
ba7c8b9e
...
...
@@ -10,9 +10,6 @@ using namespace std;
#include "bioreader.hpp"
unsigned
long
long
filesize
(
const
char
*
filename
);
/**
* Read a FASTA/FASTQ file.
* Space complexity: constant. Only one sequence is stored at most in memory.
...
...
@@ -78,13 +75,4 @@ protected:
*/
string
getInterestingLine
(
int
state
=
FASTX_UNINIT
);
};
/**
* Count the number of sequences in a Fasta file
* @return the number of sequences
*/
unsigned
long
long
nb_sequences_in_fasta
(
string
f
,
bool
approx
=
false
);
unsigned
long
long
approx_nb_sequences_in_fasta
(
string
f
);
#endif
algo/tests/unit-tests/testTools.cpp
View file @
ba7c8b9e
...
...
@@ -50,12 +50,12 @@ void testOnlineBioReaderMaxNth() {
void
testFastaNbSequences
()
{
TAP_TEST
(
nb_sequences_in_f
asta
(
"../../germline/homo-sapiens/IGHV.fa"
)
==
349
,
TEST_FASTA_NB_SEQUENCES
,
"ccc"
);
TAP_TEST
(
nb_sequences_in_f
ile
(
"../../germline/homo-sapiens/IGHV.fa"
)
==
349
,
TEST_FASTA_NB_SEQUENCES
,
"ccc"
);
int
a1
=
approx_nb_sequences_in_f
asta
(
"../../germline/homo-sapiens/IGHV.fa"
);
int
a1
=
approx_nb_sequences_in_f
ile
(
"../../germline/homo-sapiens/IGHV.fa"
);
TAP_TEST
(
a1
>=
345
&&
a1
<=
355
,
TEST_FASTA_NB_SEQUENCES
,
""
);
int
a2
=
nb_sequences_in_f
asta
(
"../../data/Stanford_S22.fasta"
,
true
);
int
a2
=
nb_sequences_in_f
ile
(
"../../data/Stanford_S22.fasta"
,
true
);
TAP_TEST
(
a2
>=
13100
&&
a2
<=
13200
,
TEST_FASTA_NB_SEQUENCES
,
""
);
}
...
...
algo/vidjil.cpp
View file @
ba7c8b9e
...
...
@@ -916,7 +916,7 @@ int main (int argc, char **argv)
cout
<<
endl
;
// Number of reads for e-value computation
unsigned
long
long
nb_reads_for_evalue
=
(
expected_value
>
NO_LIMIT_VALUE
)
?
nb_sequences_in_f
asta
(
f_reads
,
true
)
:
1
;
unsigned
long
long
nb_reads_for_evalue
=
(
expected_value
>
NO_LIMIT_VALUE
)
?
nb_sequences_in_f
ile
(
f_reads
,
true
)
:
1
;
//////////////////////////////////
...
...
@@ -925,7 +925,7 @@ int main (int argc, char **argv)
int
only_nth_read
=
1
;
if
(
max_reads_processed_sample
!=
NO_LIMIT_VALUE
)
{
only_nth_read
=
nb_sequences_in_f
asta
(
f_reads
)
/
max_reads_processed_sample
;
only_nth_read
=
nb_sequences_in_f
ile
(
f_reads
)
/
max_reads_processed_sample
;
if
(
only_nth_read
==
0
)
only_nth_read
=
1
;
...
...
Mathieu Giraud
@magiraud
mentioned in merge request
!887 (merged)
·
Jan 15, 2021
mentioned in merge request
!887 (merged)
mentioned in merge request !887
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment