fasta.h 4.22 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2 3 4 5 6
#ifndef FASTA_H
#define FASTA_H

#include <istream>
#include <string>
#include <vector>
Mathieu Giraud's avatar
Mathieu Giraud committed
7
#include <list>
Mikaël Salson's avatar
Mikaël Salson committed
8 9 10

using namespace std;

11 12
// Alias for the standard string type (presumably used to hold sequence text
// elsewhere in the project -- no use is visible in this header).
typedef std::string seqtype;

Mikaël Salson's avatar
Mikaël Salson committed
13 14 15 16
/**
 * One record as handled by Fasta and OnlineFasta.
 *
 * All fields are public and none is initialised by this struct itself;
 * in particular `seq` holds an indeterminate value until it is assigned.
 */
typedef struct read_t
{
  string label_full; // NOTE(review): presumably the complete header line of the record -- confirm in the implementation
  string label;      // NOTE(review): presumably the part of label_full selected through extract_field/extract_separator -- confirm
  string sequence;   // Sequence: original string representation
  string quality;    // NOTE(review): presumably the FASTQ quality string (empty for FASTA input) -- confirm
  int*   seq;        // Sequence: seq representation (raw pointer; ownership is not visible in this header)
} Sequence;

/**
 * State values accepted by OnlineFasta::getInterestingLine().
 * NOTE(review): the names suggest the kind of line currently expected while
 * reading (FASTA record, or the id / sequence / separator / quality line of
 * a FASTQ record) -- confirm in the implementation.
 * The explicit values are the ones the compiler assigned implicitly before.
 */
typedef enum {
  FASTX_UNINIT     = 0,
  FASTX_FASTA      = 1,
  FASTX_FASTQ_ID   = 2,
  FASTX_FASTQ_SEQ  = 3,
  FASTX_FASTQ_SEP  = 4,
  FASTX_FASTQ_QUAL = 5
} fasta_state;

Mathieu Giraud's avatar
Mathieu Giraud committed
28 29
#include "tools.h"

Mikaël Salson's avatar
Mikaël Salson committed
30 31
/**
 * Collection of sequences that can be filled from a file, from an input
 * stream or one sequence at a time, and then queried by index.
 */
class Fasta
{
private:
        // NOTE(review): presumably the set-up shared by the constructors --
        // the definition is not in this header.
        void init(int extract_field, string extract_separator);

        // NOTE(review): presumably the cumulated length of the stored
        // sequences -- confirm in the implementation.
        int total_size;
        // NOTE(review): extract_field / extract_separator presumably select
        // which separator-delimited field of a full label becomes the short
        // label -- confirm in the implementation.
        int extract_field;
        string extract_separator;

        vector<Sequence> reads;
        // ostream *oout ;

public:
        /**
         * Build an object without reading any input.
         * @param extract_field see the note on the extract_field member
         * @param extract_separator see the note on the extract_separator member
         */
        Fasta(int extract_field=0, string extract_separator="|");

        /**
         * Read all the sequences in the input filename and record them in the object.
         *
         * @param input name of the file to read
         * @param extract_field see the note on the extract_field member
         * @param extract_separator see the note on the extract_separator member
         * @param verbose NOTE(review): presumably enables progress/diagnostic
         *        output -- confirm in the implementation
         * @throws invalid_argument if filename or file content is not
         *         valid
         */
        Fasta(const string &input,
              int extract_field=0, string extract_separator="|",
              bool verbose=true);

        /**
         * NOTE(review): presumably the number of stored sequences -- confirm
         * in the implementation (a separate total_size member also exists).
         */
        int size() const;

        /**
         * Get all the sequences from the FASTA file
         * @return a list of sequences in the same order as in the input file
         */
        list<Sequence> getAll() const;

        /**
         * Accessors by position.
         * @param index position of the requested sequence
         *        (NOTE(review): bounds handling is not visible in this header)
         * @return a reference into the object; it is tied to the lifetime of
         *         this Fasta instance
         */
        const string& label(int index) const;
        const string& label_full(int index) const;
        const Sequence &read(int index) const;
        const string& sequence(int index) const;

        /**
         * Add the content of the stream to the current object
         */
        void add(istream &in, bool verbose=true);

        /**
         * Add the content of the file to the current object
         * @throws invalid_argument if the file cannot be opened or
         *         if the content is not valid
         */
        void add(const string &filename, bool verbose=true);

        /**
         * Add a single sequence to the current object.
         * The parameter is taken by value; the signature is kept as is
         * because the definition lives outside this header.
         */
        void add(const Sequence sequence);

        friend istream& operator>>(istream&, Fasta&);
};

/**
 * Read a FASTA/FASTQ file.
 * Space complexity: constant. Only one sequence is stored at most in memory.
 * The file is read online meaning that we cannot access a random sequence.
 */
class OnlineFasta {
 private:
  Sequence current;
  istream *input;
  int extract_field;
  string extract_separator;
  string line;
  bool input_allocated;
  size_t line_nb;
94
  unsigned long long char_nb;
Mikaël Salson's avatar
Mikaël Salson committed
95 96 97 98 99 100 101 102 103 104 105 106

 public:

  /**
   * Default constructor
   */
  OnlineFasta(int extract_field=0, string extract_separator="|");

  /**
   * Open the file and read the first sequence.
   * @post getSequence() does not return the first sequence yet. 
   *       next() must be called first.
107 108
   * @throws invalid_argument if file cannot be opened or is not
   *         well-formed
Mikaël Salson's avatar
Mikaël Salson committed
109 110 111 112 113 114 115 116
   */
  OnlineFasta(const string &input, 
              int extract_field=0, string extract_separator="|");

  OnlineFasta(istream &input, 
              int extract_field=0, string extract_separator="|");

  ~OnlineFasta();
117 118 119 120 121 122

  /**
   * @return the position in the file
   */
  unsigned long long getPos();

Mikaël Salson's avatar
Mikaël Salson committed
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
  /**
   * @return the current line number
   */
  size_t getLineNb();

  /**
   * @return the current sequence or an undetermined sequence if the end
   * of the file is reached
   */
  Sequence getSequence();

  /**
   * @return true iff we did not reach yet the end of the file.
   */
  bool hasNext();

  /**
   * Go to the next sequence in the file.
   * @post hasNext() ==> getSequence() returns the following sequence in the file.
142
   * @throws invalid_argument if the file is not well formated
Mikaël Salson's avatar
Mikaël Salson committed
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
   */
  void next();

 private:

  /**
   * Initialisation of the object
   */
  void init();

  /**
   * Reads line in the input stream until we have a line with at least one
   * non-whitespace character.
   * @return A non-empty string whose trailing whitespaces have been removed
   */
158
  string getInterestingLine(int state = FASTX_UNINIT);
Mikaël Salson's avatar
Mikaël Salson committed
159 160 161 162 163 164 165 166 167 168 169 170

  /**
   * Called when we have an unexcepted EOF.
   * @throws exception
   */
  void unexpectedEOF();
};

// Stream operators for Fasta and Sequence; their definitions are not in this
// header.
// operator>> is also declared as a friend inside class Fasta.
// NOTE(review): presumably it adds the content of `in` to `fasta`, like
// Fasta::add(istream&) -- confirm in the implementation.
istream& operator>>(istream& in, Fasta& fasta);
// Writes `fasta` to `out`. The Fasta argument is taken by non-const reference.
ostream& operator<<(ostream& out, Fasta& fasta);
// Writes a single Sequence to `out` (output format not visible in this header).
ostream &operator<<(ostream &out, const Sequence &seq);

171 172 173 174 175 176
/**
 * Count the number of sequences in a Fasta file
 * @param f the Fasta file to examine (NOTE(review): presumably its file
 *        name/path -- confirm in the implementation)
 * @return the number of sequences
 */
int nb_sequences_in_fasta(string f);

Mikaël Salson's avatar
Mikaël Salson committed
177
#endif