Commit f8a4008b authored by Mathieu Giraud

core/fasta.{h,cpp}: approx_nb_sequences_in_fasta

Get an approximation of the number of sequences in a fasta file
by extrapolating from the size of the first 200 sequences.
parent afca1d86
......@@ -31,6 +31,14 @@
#include "../lib/gzstream.h"
// http://stackoverflow.com/a/5840160/4475279
/**
 * Size of a file, in bytes.
 *
 * The file is opened in binary mode, positioned at its end, and the
 * resulting stream position is taken as the size.
 *
 * @param filename  path of the file to measure
 * @return the size in bytes, or 0 when the file cannot be opened or its
 *         end position cannot be determined
 */
unsigned long long filesize(const char* filename)
{
  std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary);

  if (!in.is_open())
    return 0;

  // tellg() reports failure as -1; converting that straight to an unsigned
  // type would yield a gigantic bogus size instead of an error value.
  const std::streamoff end = in.tellg();
  if (end < 0)
    return 0;

  return static_cast<unsigned long long>(end);
}
void Fasta::init(int extract_field, string extract_separator)
{
this -> extract_field = extract_field ;
......@@ -280,8 +288,11 @@ ostream &operator<<(ostream &out, const Sequence &seq) {
return out;
}
int nb_sequences_in_fasta(string f)
int nb_sequences_in_fasta(string f, bool approx)
{
if (approx)
return approx_nb_sequences_in_fasta(f);
OnlineFasta *sequences = new OnlineFasta(f, 1, " ");
int nb_sequences = 0 ;
......@@ -294,3 +305,31 @@ int nb_sequences_in_fasta(string f)
cout << " ==> " << nb_sequences << " sequences" << endl;
return nb_sequences ;
}
#define SAMPLE_APPROX_NB_SEQUENCES 200
/**
 * Estimate the number of sequences in a Fasta file without reading all of it.
 *
 * At most SAMPLE_APPROX_NB_SEQUENCES sequences are read. When the file ends
 * within that sample, the count is exact. Otherwise the sampled count is
 * scaled by filesize(f) divided by the reader position (getPos()) reached
 * after the sample, and the printed result is prefixed with "approx. ".
 * The count is always printed on standard output.
 *
 * @param f  path of the Fasta file
 * @return the exact count for short files, an extrapolated count otherwise
 */
int approx_nb_sequences_in_fasta(string f)
{
  // Automatic storage: the reader is destroyed when the function returns,
  // so no OnlineFasta is leaked per call.
  OnlineFasta sequences(f, 1, " ");
  int nb_sequences = 0 ;

  while (nb_sequences < SAMPLE_APPROX_NB_SEQUENCES && sequences.hasNext())
    {
      sequences.next();
      nb_sequences++ ;
    }

  cout << " ==> " ;

  // Sequences remain after the sample: extrapolate from the fraction of the
  // file consumed so far.
  if (sequences.hasNext())
    {
      cout << "approx. " ;
      float ratio = (float) filesize(f.c_str()) / (float) sequences.getPos();
      nb_sequences = (int) (ratio * nb_sequences);
    }

  cout << nb_sequences << " sequences" << endl;
  return nb_sequences ;
}
......@@ -27,6 +27,8 @@ typedef enum {
#include "tools.h"
/**
 * Size of the file `filename`, taken as the stream position at the end of
 * the file opened in binary mode.
 */
unsigned long long filesize(const char* filename);
class Fasta
{
void init(int extract_field, string extract_separator);
......@@ -172,6 +174,7 @@ ostream &operator<<(ostream &out, const Sequence &seq);
* Count the number of sequences in a Fasta file
* @return the number of sequences
*/
int nb_sequences_in_fasta(string f);
int nb_sequences_in_fasta(string f, bool approx = false);
/**
 * Approximate the number of sequences in a Fasta file by reading only a
 * sample of its first sequences and scaling by the file size
 * @return the number of sequences (exact when the file ends within the sample)
 */
int approx_nb_sequences_in_fasta(string f);
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment