Commit 3d8ff242 authored by Mikaël Salson's avatar Mikaël Salson Committed by Mathieu Giraud

algo/: Refactor Fasta to BioReader

We now have an abstract class to deal with biological sequence files. This
will allow to more easily manage different file types.

This commit only reorganizes the code so that we will be able to add a BAM
reader easily. Functionnally the code should be equivalent to its previous
version.

Some functions that were not used have been removed.

The operator>> has been removed as it was only used in unit testing. This
operator is not convenient as having the filename may be useful to reopen the
file or to know its extension, to guess the filetype.

See #2016
parent 9ab1ef81
......@@ -4,7 +4,7 @@
#include <string.h>
#include <cstdlib>
#include "../core/dynprog.h"
#include "../core/fasta.h"
#include "../core/bioreader.hpp"
#include "../core/lazy_msa.h"
#include "../lib/json.hpp"
......@@ -83,7 +83,7 @@ int main(int argc, char* argv[])
if (!cgi_mode) cout <<endl;
if (! error) {
Fasta fa(fdata, 1, " ", !cgi_mode);
BioReader fa(fdata, 1, " ", 0, !cgi_mode);
string seq0 = fa.sequence(0);
......
......@@ -5,7 +5,7 @@
#include <cstdlib>
#include <unistd.h>
#include "../core/dynprog.h"
#include "../core/fasta.h"
#include "../core/bioreader.hpp"
#include "../core/lazy_msa.h"
#include "../core/similarityMatrix.h"
#include "../core/compare-all.h"
......@@ -111,7 +111,7 @@ int main(int argc, char* argv[])
if (!cgi_mode) cout <<endl;
if (! error) {
Fasta fa(fdata, 1, " ", !output_json);
BioReader fa(fdata, 1, " ", 0, !output_json);
list<Sequence> reads;
reads = fa.getAll();
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
Mikaël Salson <mikael.salson@vidjil.org>
Marc Duez <marc.duez@vidjil.org>
"Vidjil" is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
"Vidjil" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
#include "bioreader.hpp"
#include "fasta.h"
OnlineBioReader::OnlineBioReader(int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence):
filename(""), extract_field(extract_field),
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence){}
OnlineBioReader::OnlineBioReader(const string &input_filename,
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence):
filename(input_filename), extract_field(extract_field),
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
{
input_allocated = true;
init();
}
OnlineBioReader::~OnlineBioReader() {
if (current.seq)
delete [] current.seq;
}
void OnlineBioReader::init() {
mark_pos = 0;
nb_sequences_parsed = 0;
nb_sequences_returned = 0;
char_nb = 0;
current.seq = NULL;
current.marked_pos = 0;
current_gaps = 0;
}
unsigned long long OnlineBioReader::getPos() {
return char_nb;
}
void OnlineBioReader::setMarkPos(int mark_pos) {
this -> mark_pos = mark_pos;
}
Sequence OnlineBioReader::getSequence() {
return current;
}
void OnlineBioReader::skipToNthSequence() {
// Possibly skip some reads, when only_nth_sequence > 1
while (hasNextData())
if (nb_sequences_parsed % only_nth_sequence)
{
nb_sequences_returned--;
this->next();
continue ;
}
else
return ;
}
void OnlineBioReader::addLineToCurrentSequence(string line)
{
for (char& c : line)
{
if (c == ' ')
continue ;
if (c == '.') {
current_gaps++;
continue ;
}
current.sequence += c;
if (mark_pos) {
if ((int) current.sequence.length() + current_gaps == mark_pos)
current.marked_pos = current.sequence.length();
}
}
}
void OnlineBioReader::unexpectedEOF() {
throw invalid_argument("Unexpected EOF while reading sequence file");
}
//// BioReader
void BioReader::init(int extract_field, string extract_separator, size_t mark_pos)
{
this -> extract_field = extract_field ;
this -> extract_separator = extract_separator ;
this -> mark_pos = mark_pos;
total_size = 0;
name = "";
basename = "";
}
BioReader::BioReader(bool virtualfasta, string name)
{
UNUSED(virtualfasta);
init(0, "");
this -> name = name;
basename = extract_basename(name);
}
BioReader::BioReader(int extract_field, string extract_separator, int mark_pos)
{
init(extract_field, extract_separator, mark_pos);
}
BioReader::BioReader(const string &input,
int extract_field, string extract_separator,
int mark_pos, bool verbose)
{
init(extract_field, extract_separator, mark_pos);
if (!input.size()) // Do not open empty filenames (D germline if not segmentD)
return ;
add(input, verbose);
}
void BioReader::add(const string &filename, bool verbose) {
OnlineBioReader *reader = OnlineBioReaderFactory::create(filename, extract_field,
extract_separator);
if (name.size())
name += " ";
if (basename.size())
basename += " ";
name += filename;
basename += extract_basename(filename);
if (verbose)
cout << " <== " << filename ;
add(*reader);
delete reader;
if (verbose)
cout << "\t" << setw(6) << total_size << " bp in " << setw(3) << size() << " sequences" << endl ;
}
void BioReader::add(OnlineBioReader &reader) {
string line;
Sequence read;
reader.setMarkPos(mark_pos);
while (reader.hasNext()) {
reader.next();
add(reader.getSequence());
}
}
void BioReader::add(Sequence seq) {
reads.push_back(seq);
total_size += seq.sequence.size();
}
int BioReader::size() const{ return (int)reads.size(); }
size_t BioReader::totalSize() const { return total_size ; }
list<Sequence> BioReader::getAll() const {
list<Sequence> reads;
for (int i=0; i < size(); i++) {
reads.push_back(read(i));
}
return reads;
}
const string& BioReader::label(int index) const{ return reads[index].label; }
const string& BioReader::label_full(int index) const{ return reads[index].label_full; }
const Sequence& BioReader::read(int index) const {return reads[index];}
const string& BioReader::sequence(int index) const{ return reads[index].sequence; }
// Operators
ostream& operator<<(ostream& out, BioReader& fasta){
for(int i = 0 ; i < fasta.size() ; i++){
out << fasta.read(i);
}
return out;
}
ostream &operator<<(ostream &out, const Sequence &seq) {
bool is_fastq=false;
if (seq.quality.length() > 0) {
is_fastq = true;
out << "@";
} else
out << ">";
out << seq.label;
if (seq.marked_pos) {
out << " !@" << seq.marked_pos ;
}
out << endl;
out << seq.sequence << endl;
if (is_fastq) {
out << "+" << endl << seq.quality << endl;
}
return out;
}
OnlineBioReader *OnlineBioReaderFactory::create(const string &filename,
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence) {
return new OnlineFasta(filename, extract_field, extract_separator, nb_sequences_max, only_nth_sequence);
}
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
Mikaël Salson <mikael.salson@vidjil.org>
Marc Duez <marc.duez@vidjil.org>
"Vidjil" is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
"Vidjil" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
#ifndef BIO_READER_HPP
#define BIO_READER_HPP
#include <string>
#include <vector>
#include <list>
using namespace std;
typedef string seqtype ;
typedef struct read_t
{
string label_full;
string label;
string sequence; // Sequence: original string representation
string quality;
int* seq; // Sequence: seq representation
size_t marked_pos; // Some marked position in the sequence
} Sequence;
typedef enum {
FASTX_UNINIT, FASTX_FASTA,
FASTX_FASTQ_ID, FASTX_FASTQ_SEQ,
FASTX_FASTQ_SEP, FASTX_FASTQ_QUAL
} fasta_state;
#include "tools.h"
class OnlineBioReader {
protected:
string filename;
Sequence current;
int current_gaps;
int extract_field;
string extract_separator;
string line;
bool input_allocated;
unsigned long long char_nb;
int mark_pos;
int nb_sequences_parsed;
int nb_sequences_returned;
int nb_sequences_max;
int only_nth_sequence;
public:
/**
* Default constructor
*/
OnlineBioReader(int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
/**
* Open the file. No sequence is read at first.
* @param input: filename
* @param extract_field: the field number to extract from the header
* (0: keep the header full)
* @param extract_separator: the string used as a separator
* @param nb_sequences_max: maximal number of sequences to read
* (default: NO_LIMIT_VALUE)
* @param only_nth_sequence: modulo of the sequence to retrieve
* (default: 1 (ie. all))
* @post getSequence() does not return the first sequence yet.
* next() must be called first.
* @throws invalid_argument if file cannot be opened or is not
* well-formed
*/
OnlineBioReader(const string &input_filename,
int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
virtual ~OnlineBioReader();
/**
* sets a position to be followed in gapped sequences
*/
void setMarkPos(int mark_pos);
/**
* @return the position in the file
*/
unsigned long long getPos();
/**
* @return the current sequence or an undetermined sequence if the end
* of the file is reached
*/
Sequence getSequence();
/**
* @return true iff we did not reach yet the end of the file.
*/
virtual bool hasNextData() = 0;
/**
* @return true iff we did not reach yet both the end of the file and the maximal number of returned sequences
*/
virtual bool hasNext() = 0;
/**
* Go to the next sequence in the file.
* @post hasNext() ==> getSequence() returns the following sequence in the file.
* @throws invalid_argument if the file is not well formated
*/
virtual void next() = 0;
protected:
/**
* Initialisation of the object.
* Must be redefined in child classes for instance to open the file…
* This function is called from the controller.
* So because of C++ limits, the base controller won't be able to call the derived init().
* Therefore a derived init() must be called in the derived constructors
*/
virtual void init();
/**
* Parse the read line and keep the valuable characters in memory. It
* discards the . (and consider them as being gaps eg. for the IMGT gapped
* sequences). It also discards spaces
*/
virtual void addLineToCurrentSequence(string line);
/**
* Skip to the next sequence that is a multiple of 'only_nth_sequence'
*/
void skipToNthSequence();
/**
* Called when we have an unexcepted EOF.
* @throws exception
*/
virtual void unexpectedEOF();
};
class BioReader
{
void init(int extract_field, string extract_separator, size_t mark_pos=0);
size_t total_size;
int extract_field;
int mark_pos;
string extract_separator;
vector<Sequence> reads;
// ostream *oout ;
public:
BioReader(int extract_field=0, string extract_separator="|", int mark_pos=0);
/**
* Read all the sequences in the input filename and record them in the object.
*
* @throws invalid_argument if filename or file content is not
* valid
*/
BioReader(const string &input,
int extract_field=0, string extract_separator="|",
int mark_pos = 0, bool verbose=true);
BioReader(bool virtualfasta, const string name); // virtualfasta unused
string name;
string basename;
int size() const;
size_t totalSize() const;
/**
* Get all the sequences from the FASTA file
* @return a list of sequences in the same order as in the input file
*/
list<Sequence> getAll() const;
const string& label(int index) const;
const string& label_full(int index) const;
const Sequence &read(int index) const;
const string& sequence(int index) const;
/**
* Add the content of the file to the current object
* @throws invalid_argument if the file cannot be opened or
* if the content is not valid
*/
void add(const string &filename, bool verbose=true);
/**
* Add the content of an OnlineBioReader to the current object
*/
void add(OnlineBioReader& reader);
void add(const Sequence sequence);
};
istream& operator>>(istream& in, BioReader& reader);
ostream& operator<<(ostream& out, BioReader& fasta);
ostream &operator<<(ostream &out, const Sequence &seq);
const BioReader BIOREADER_UNKNOWN = BioReader(true, "_");
const BioReader BIOREADER_AMBIGUOUS = BioReader(true, "?");
class OnlineBioReaderFactory {
public:
/**
* Instantiates an OnlineBioReader.
* @throws invalid_argument if the file could not be opened
*/
static OnlineBioReader *create(const string &filename,int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
};
#endif
......@@ -4,7 +4,7 @@
#include <iomanip>
#include "dynprog.h"
#include "fasta.h"
#include "bioreader.hpp"
#include "similarityMatrix.h"
#include "windows.h"
......
......@@ -39,166 +39,37 @@ unsigned long long filesize(const char* filename)
return in.tellg();
}
void Fasta::init(int extract_field, string extract_separator, size_t mark_pos)
{
this -> extract_field = extract_field ;
this -> extract_separator = extract_separator ;
this -> mark_pos = mark_pos;
total_size = 0;
name = "";
basename = "";
}
Fasta::Fasta(bool virtualfasta, string name)
{
UNUSED(virtualfasta);
init(0, "");
this -> name = name;
basename = extract_basename(name);
}
Fasta::Fasta(int extract_field, string extract_separator, int mark_pos)
{
init(extract_field, extract_separator, mark_pos);
}
Fasta::Fasta(const string &input,
int extract_field, string extract_separator,
bool verbose)
{
init(extract_field, extract_separator);
if (!input.size()) // Do not open empty filenames (D germline if not segmentD)
return ;
add(input, verbose);
}
void Fasta::add(istream &in, bool verbose) {
in >> *this;
if (verbose)
cout << "\t" << setw(6) << total_size << " bp in " << setw(3) << size() << " sequences" << endl ;
}
void Fasta::add(const string &filename, bool verbose) {
igzstream is(filename.c_str());
if (is.fail())
{
throw invalid_argument(" !! Error in opening file: "+ filename);
}
if (name.size())
name += " ";
if (basename.size())
basename += " ";
name += filename;
basename += extract_basename(filename);
if (verbose)
cout << " <== " << filename ;
add(is, verbose);
is.close();
}
void Fasta::add(Sequence seq) {
reads.push_back(seq);
total_size += seq.sequence.size();
}
int Fasta::size() const{ return (int)reads.size(); }
size_t Fasta::totalSize() const { return total_size ; }
list<Sequence> Fasta::getAll() const {
list<Sequence> reads;
for (int i=0; i < size(); i++) {
reads.push_back(read(i));
}
return reads;
}
const string& Fasta::label(int index) const{ return reads[index].label; }
const string& Fasta::label_full(int index) const{ return reads[index].label_full; }
const Sequence& Fasta::read(int index) const {return reads[index];}
const string& Fasta::sequence(int index) const{ return reads[index].sequence; }
// OnlineFasta
OnlineFasta::OnlineFasta(int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence):
input(NULL),
extract_field(extract_field), extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence){}
OnlineFasta::OnlineFasta(const string &input,
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence)
:input(new igzstream(input.c_str())),
extract_field(extract_field),
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
{
if (this->input->fail()) {
delete this->input;
throw invalid_argument("!! Error in opening file "+input);
}
:OnlineBioReader(extract_field, extract_separator, nb_sequences_max, only_nth_sequence) {}
cout << " <== " << input << endl ;
input_allocated = true;
init();
}
OnlineFasta::OnlineFasta(istream &input,
OnlineFasta::OnlineFasta(const string &input_filename,
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence)
:input(&input), extract_field(extract_field),
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
{
input_allocated = false;
init();
}
:OnlineBioReader(input_filename, extract_field, extract_separator, nb_sequences_max, only_nth_sequence) {this->init();}
OnlineFasta::~OnlineFasta() {
if (input_allocated)
delete input;
if (current.seq)
delete [] current.seq;
}
void OnlineFasta::init() {
mark_pos = 0;
nb_sequences_parsed = 0;
nb_sequences_returned = 0;
char_nb = 0;
line_nb = 0;
current.seq = NULL;
current.marked_pos = 0;
current_gaps = 0;
if (! filename.empty()) {
input = new igzstream(filename.c_str());
line = getInterestingLine();
}
unsigned long long OnlineFasta::getPos() {
return char_nb;
}
void OnlineFasta::setMarkPos(int mark_pos) {
this -> mark_pos = mark_pos;
}
if (this->input->fail()) {
delete this->input;
throw invalid_argument("!! Error in opening file "+filename);
}
size_t OnlineFasta::getLineNb() {
return line_nb;
}
cout << " <== " << filename << endl ;
}
Sequence OnlineFasta::getSequence() {
return current;
line = getInterestingLine();
}
bool OnlineFasta::hasNextData() {
return ((!input->eof()) || line.length() > 0);
}
......@@ -208,40 +79,6 @@ bool OnlineFasta::hasNext() {
&& ((nb_sequences_max == NO_LIMIT_VALUE) || (nb_sequences_returned < nb_sequences_max));
}
void OnlineFasta::skipToNthSequence() {
// Possibly skip some reads, when only_nth_sequence > 1
while (hasNextData())
if (nb_sequences_parsed % only_nth_sequence)
{
nb_sequences_returned--;
next();
continue ;
}
else
return ;
}
void OnlineFasta::addLineToCurrentSequence(string line)
{
for (char& c : line)
{
if (c == ' ')
continue ;
if (c == '.') {
current_gaps++;
continue ;
}
current.sequence += c;
if (mark_pos) {