fasta.h 5.16 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2 3 4 5 6
#ifndef FASTA_H
#define FASTA_H

#include <istream>
#include <string>
#include <vector>
Mathieu Giraud's avatar
Mathieu Giraud committed
7
#include <list>
Mikaël Salson's avatar
Mikaël Salson committed
8 9 10

using namespace std;

11 12
// Alias of string. NOTE(review): presumably the type meant for raw
// sequences; it is not referenced anywhere else in this header.
using seqtype = string;

Mikaël Salson's avatar
Mikaël Salson committed
13 14 15 16
/**
 * One read, as stored by Fasta (vector<Sequence>) and OnlineFasta
 * (current sequence).
 *
 * This is a plain aggregate: it declares no constructor and no destructor,
 * so `seq` is not initialised and not released by the struct itself.
 * NOTE(review): who allocates and frees `seq` is not visible here — confirm
 * in the code that fills the struct.
 */
typedef struct read_t
{
  string label_full; // NOTE(review): presumably the complete header line — confirm
  string label;      // NOTE(review): presumably a field extracted from label_full — confirm
  string sequence;   // Sequence: original string representation
  string quality;    // NOTE(review): presumably the FASTQ quality string — confirm
  int*   seq;        // Sequence: seq representation
} Sequence;

/**
 * Unscoped enumeration with values 0..5 in declaration order.
 * FASTX_UNINIT is the default `state` argument of
 * OnlineFasta::getInterestingLine().
 * NOTE(review): the names suggest the part of a FASTA/FASTQ record being
 * read (identifier, sequence, separator, quality) — confirm in the parser.
 */
enum fasta_state {
  FASTX_UNINIT,
  FASTX_FASTA,
  FASTX_FASTQ_ID,
  FASTX_FASTQ_SEQ,
  FASTX_FASTQ_SEP,
  FASTX_FASTQ_QUAL
};

Mathieu Giraud's avatar
Mathieu Giraud committed
28 29
#include "tools.h"

30 31
/**
 * Declared here, defined elsewhere.
 * NOTE(review): presumably returns the size of the file `filename` in
 * bytes — confirm the unit and the behaviour on a missing file in the
 * definition.
 */
unsigned long long filesize(const char* filename);

Mikaël Salson's avatar
Mikaël Salson committed
32 33
/**
 * Collection of reads (Sequence objects) kept in memory.
 *
 * Reads are added through the file-name constructor, the add() overloads
 * or operator>>, and are accessed by index or all at once with getAll().
 *
 * NOTE(review): extract_field / extract_separator presumably control how
 * the short `label` is derived from `label_full` — confirm in fasta.cpp.
 */
class Fasta
{
        /**
         * Initialisation helper taking the same extraction settings as the
         * constructors.
         */
        void init(int extract_field, string extract_separator);

        int total_size;           // NOTE(review): presumably the value returned by totalSize()
        int extract_field;
        string extract_separator;

        vector<Sequence> reads;   // the stored reads
        // ostream *oout ;

public:
        /**
         * Build an object without reading any input.
         */
        Fasta(int extract_field=0, string extract_separator="|");

        /**
         * Read all the sequences in the input filename and record them in the object.
         *
         * @throws invalid_argument if filename or file content is not
         *         valid
         */
        Fasta(const string &input,
              int extract_field=0, string extract_separator="|",
              bool verbose=true);

        Fasta(bool virtualfasta, const string name); // virtualfasta unused

        string name;
        string basename;

        /**
         * NOTE(review): presumably the number of stored reads — confirm.
         */
        int size() const;

        /**
         * NOTE(review): presumably the cumulated length of the stored
         * sequences (see total_size) — confirm.
         */
        int totalSize() const;

        /**
         * Get all the sequences from the FASTA file
         * @return a list of sequences in the same order as in the input file
         */
        list<Sequence> getAll() const;

        // Per-read accessors. NOTE(review): whether `index` is range-checked
        // is not visible from this header — confirm before relying on it.
        const string& label(int index) const;
        const string& label_full(int index) const;
        const Sequence &read(int index) const;
        const string& sequence(int index) const;

        /**
         * Add the content of the stream to the current object
         */
        void add(istream &in, bool verbose=true);

        /**
         * Add the content of the file to the current object
         * @throws invalid_argument if the file cannot be opened or
         *         if the content is not valid
         */
        void add(const string &filename, bool verbose=true);

        /**
         * Add a single, already built sequence to the current object.
         */
        void add(const Sequence sequence);

        friend istream& operator>>(istream&, Fasta&);
};

/**
 * Read a FASTA/FASTQ file.
 * Space complexity: constant. Only one sequence is stored at most in memory.
 * The file is read online meaning that we cannot access a random sequence.
 */
class OnlineFasta {
 private:
  Sequence current;
  istream *input;
  int extract_field;
  string extract_separator;
  string line;
  bool input_allocated;
  size_t line_nb;
102
  unsigned long long char_nb;
Mikaël Salson's avatar
Mikaël Salson committed
103

104
  int nb_sequences_parsed;
105
  int nb_sequences_returned;
106 107 108
  int nb_sequences_max;
  int only_nth_sequence;

Mikaël Salson's avatar
Mikaël Salson committed
109 110 111 112 113
 public:

  /**
   * Default constructor
   */
114 115
  OnlineFasta(int extract_field=0, string extract_separator="|",
              int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
Mikaël Salson's avatar
Mikaël Salson committed
116 117

  /**
Vidjil Team's avatar
Vidjil Team committed
118
   * Open the file. No sequence is read at first.
Mikaël Salson's avatar
Mikaël Salson committed
119 120
   * @post getSequence() does not return the first sequence yet. 
   *       next() must be called first.
121 122
   * @throws invalid_argument if file cannot be opened or is not
   *         well-formed
Mikaël Salson's avatar
Mikaël Salson committed
123 124
   */
  OnlineFasta(const string &input, 
125 126
              int extract_field=0, string extract_separator="|",
              int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
Mikaël Salson's avatar
Mikaël Salson committed
127 128

  OnlineFasta(istream &input, 
129 130
              int extract_field=0, string extract_separator="|",
              int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
Mikaël Salson's avatar
Mikaël Salson committed
131 132

  ~OnlineFasta();
133 134 135 136 137 138

  /**
   * @return the position in the file
   */
  unsigned long long getPos();

Mikaël Salson's avatar
Mikaël Salson committed
139 140 141 142 143 144 145 146 147 148 149 150 151 152
  /**
   * @return the current line number
   */
  size_t getLineNb();

  /**
   * @return the current sequence or an undetermined sequence if the end
   * of the file is reached
   */
  Sequence getSequence();

  /**
   * @return true iff we did not reach yet the end of the file.
   */
153 154 155 156 157
  bool hasNextData();

  /**
   * @return true iff we did not reach yet both the end of the file and the maximal number of returned sequences
   */
Mikaël Salson's avatar
Mikaël Salson committed
158 159 160 161 162
  bool hasNext();

  /**
   * Go to the next sequence in the file.
   * @post hasNext() ==> getSequence() returns the following sequence in the file.
163
   * @throws invalid_argument if the file is not well formated
Mikaël Salson's avatar
Mikaël Salson committed
164 165 166 167 168 169 170 171 172 173
   */
  void next();

 private:

  /**
   * Initialisation of the object
   */
  void init();

174 175 176 177 178
  /**
   * Skip to the next sequence that is a multiple of 'only_nth_sequence'
   */
  void skipToNthSequence();

Mikaël Salson's avatar
Mikaël Salson committed
179 180 181 182 183
  /**
   * Reads line in the input stream until we have a line with at least one
   * non-whitespace character.
   * @return A non-empty string whose trailing whitespaces have been removed
   */
184
  string getInterestingLine(int state = FASTX_UNINIT);
Mikaël Salson's avatar
Mikaël Salson committed
185 186 187 188 189 190 191 192 193 194 195 196

  /**
   * Called when we have an unexcepted EOF.
   * @throws exception
   */
  void unexpectedEOF();
};

/**
 * Stream operators for Fasta and Sequence, defined elsewhere.
 *
 * operator>> is the function also named as a friend inside class Fasta.
 * NOTE(review): presumably it adds the sequences read from `in` to `fasta`
 * — confirm in the definition.
 * The Fasta output operator takes a non-const reference, so it cannot be
 * called on a const Fasta or on a temporary.
 */
istream& operator>>(istream& in, Fasta& fasta);
ostream& operator<<(ostream& out, Fasta& fasta);
ostream &operator<<(ostream &out, const Sequence &seq);

197 198 199
// Two constant Fasta objects built with the (bool, name) constructor,
// using "_" and "?" as name arguments.
// Being const at namespace scope, they have internal linkage: every
// translation unit that includes this header gets its own copy.
const Fasta FASTA_UNKNOWN(true, "_");
const Fasta FASTA_AMBIGUOUS(true, "?");

200 201 202 203
/**
 * Count the number of sequences in a Fasta file
 * @param f      NOTE(review): presumably the path of the file — confirm
 * @param approx defaults to false; NOTE(review): presumably requests an
 *               estimate instead of an exact count
 *               (cf. approx_nb_sequences_in_fasta) — confirm
 * @return the number of sequences
 */
int nb_sequences_in_fasta(string f, bool approx = false);

/**
 * NOTE(review): presumably an estimate of the number of sequences in the
 * file `f`; how the estimate is computed is not visible from this header.
 */
int approx_nb_sequences_in_fasta(string f);
206

Mikaël Salson's avatar
Mikaël Salson committed
207
#endif