Commit ba7c8b9e authored by Mikaël Salson's avatar Mikaël Salson Committed by Mathieu Giraud

algo/: Count number of sequences independently of input format

Use OnlineBioReader rather than a Fasta reader
parent 3d123f2e
......@@ -237,3 +237,57 @@ OnlineBioReader *OnlineBioReaderFactory::create(const string &filename,
int nb_sequences_max, int only_nth_sequence) {
return new OnlineFasta(filename, extract_field, extract_separator, nb_sequences_max, only_nth_sequence);
}
// http://stackoverflow.com/a/5840160/4475279
unsigned long long filesize(const char* filename)
{
std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary);
return in.tellg();
}
unsigned long long nb_sequences_in_file(string f, bool approx)
{
if (approx)
return approx_nb_sequences_in_file(f);
OnlineBioReader *sequences = OnlineBioReaderFactory::create(f, 1, " ");
unsigned long long nb_sequences = 0 ;
while (sequences->hasNext())
{
sequences->next();
nb_sequences++ ;
}
cout << " ==> " << nb_sequences << " sequences" << endl;
delete sequences ;
return nb_sequences ;
}
unsigned long long approx_nb_sequences_in_file(string f)
{
OnlineBioReader *sequences = OnlineBioReaderFactory::create(f, 1, " ");
int nb_sequences = 0 ;
while (nb_sequences < SAMPLE_APPROX_NB_SEQUENCES && sequences->hasNext())
{
sequences->next();
nb_sequences++ ;
}
cout << " ==> " ;
if (sequences->hasNext())
{
cout << "approx. " ;
float ratio = (float) filesize(f.c_str()) / (float) sequences->getPos();
nb_sequences = (unsigned long long) (ratio * nb_sequences);
}
cout << nb_sequences << " sequences" << endl;
delete sequences ;
return nb_sequences ;
}
......@@ -28,6 +28,8 @@
#include <vector>
#include <list>
#define SAMPLE_APPROX_NB_SEQUENCES 200
using namespace std;
typedef string seqtype ;
......@@ -223,6 +225,21 @@ ostream &operator<<(ostream &out, const Sequence &seq);
const BioReader BIOREADER_UNKNOWN = BioReader(true, "_");
const BioReader BIOREADER_AMBIGUOUS = BioReader(true, "?");
unsigned long long filesize(const char* filename);
/**
* Count the number of sequences in a Fasta file.
* @complexity Linear in the file size
* @return the number of sequences
*/
unsigned long long nb_sequences_in_file(string f, bool approx = false);
/**
* Give an approximate count of the number of sequences in the file
* @complexity linear in SAMPLE_APPROX_NB_SEQUENCES
* @return the approximate number of sequences
*/
unsigned long long approx_nb_sequences_in_file(string f);
class OnlineBioReaderFactory {
public:
......
......@@ -32,13 +32,6 @@
#include "../lib/gzstream.h"
// http://stackoverflow.com/a/5840160/4475279
unsigned long long filesize(const char* filename)
{
std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary);
return in.tellg();
}
// OnlineFasta
OnlineFasta::OnlineFasta(int extract_field, string extract_separator,
......@@ -175,52 +168,4 @@ void OnlineFasta::unexpectedEOF() {
throw invalid_argument("Unexpected EOF while reading FASTA/FASTQ file");
}
unsigned long long nb_sequences_in_fasta(string f, bool approx)
{
if (approx)
return approx_nb_sequences_in_fasta(f);
OnlineFasta *sequences = new OnlineFasta(f, 1, " ");
unsigned long long nb_sequences = 0 ;
while (sequences->hasNext())
{
sequences->next();
nb_sequences++ ;
}
cout << " ==> " << nb_sequences << " sequences" << endl;
delete sequences ;
return nb_sequences ;
}
#define SAMPLE_APPROX_NB_SEQUENCES 200
unsigned long long approx_nb_sequences_in_fasta(string f)
{
OnlineFasta *sequences = new OnlineFasta(f, 1, " ");
int nb_sequences = 0 ;
while (nb_sequences < SAMPLE_APPROX_NB_SEQUENCES && sequences->hasNext())
{
sequences->next();
nb_sequences++ ;
}
cout << " ==> " ;
if (sequences->hasNext())
{
cout << "approx. " ;
float ratio = (float) filesize(f.c_str()) / (float) sequences->getPos();
nb_sequences = (unsigned long long) (ratio * nb_sequences);
}
cout << nb_sequences << " sequences" << endl;
delete sequences ;
return nb_sequences ;
}
......@@ -10,9 +10,6 @@ using namespace std;
#include "bioreader.hpp"
unsigned long long filesize(const char* filename);
/**
* Read a FASTA/FASTQ file.
* Space complexity: constant. Only one sequence is stored at most in memory.
......@@ -78,13 +75,4 @@ protected:
*/
string getInterestingLine(int state = FASTX_UNINIT);
};
/**
* Count the number of sequences in a Fasta file
* @return the number of sequences
*/
unsigned long long nb_sequences_in_fasta(string f, bool approx = false);
unsigned long long approx_nb_sequences_in_fasta(string f);
#endif
......@@ -50,12 +50,12 @@ void testOnlineBioReaderMaxNth() {
void testFastaNbSequences() {
TAP_TEST(nb_sequences_in_fasta("../../germline/homo-sapiens/IGHV.fa") == 349, TEST_FASTA_NB_SEQUENCES, "ccc");
TAP_TEST(nb_sequences_in_file("../../germline/homo-sapiens/IGHV.fa") == 349, TEST_FASTA_NB_SEQUENCES, "ccc");
int a1 = approx_nb_sequences_in_fasta("../../germline/homo-sapiens/IGHV.fa");
int a1 = approx_nb_sequences_in_file("../../germline/homo-sapiens/IGHV.fa");
TAP_TEST(a1 >= 345 && a1 <= 355, TEST_FASTA_NB_SEQUENCES, "");
int a2 = nb_sequences_in_fasta("../../data/Stanford_S22.fasta", true);
int a2 = nb_sequences_in_file("../../data/Stanford_S22.fasta", true);
TAP_TEST(a2 >= 13100 && a2 <= 13200, TEST_FASTA_NB_SEQUENCES, "");
}
......
......@@ -916,7 +916,7 @@ int main (int argc, char **argv)
cout << endl ;
// Number of reads for e-value computation
unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_fasta(f_reads, true) : 1 ;
unsigned long long nb_reads_for_evalue = (expected_value > NO_LIMIT_VALUE) ? nb_sequences_in_file(f_reads, true) : 1 ;
//////////////////////////////////
......@@ -925,7 +925,7 @@ int main (int argc, char **argv)
int only_nth_read = 1 ;
if (max_reads_processed_sample != NO_LIMIT_VALUE)
{
only_nth_read = nb_sequences_in_fasta(f_reads) / max_reads_processed_sample;
only_nth_read = nb_sequences_in_file(f_reads) / max_reads_processed_sample;
if (only_nth_read == 0)
only_nth_read = 1 ;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment