#ifndef FASTA_H
#define FASTA_H

#include <cstddef>
#include <istream>
#include <list>
#include <string>
#include <vector>

using namespace std;

// Alias of string, named for values that hold a sequence.
typedef string seqtype ;

/**
 * One sequence together with its labels and quality string.
 *
 * Deliberately kept as a plain aggregate (no constructor, no default member
 * initializers) so that any brace-initialization elsewhere keeps compiling.
 * As a consequence `seq` and `marked_pos` are NOT initialized automatically:
 * whoever creates a Sequence must set them.
 *
 * Member types are std::-qualified so the struct does not depend on the
 * file-level using-directive.
 */
typedef struct read_t
{
  std::string label_full;  // NOTE(review): presumably the complete header line - confirm
  std::string label;       // NOTE(review): presumably the part of label_full kept after field extraction - confirm
  std::string sequence;    // Sequence: original string representation
  std::string quality;     // NOTE(review): presumably the FASTQ quality line, empty for FASTA - confirm
  int*        seq;         // Sequence: seq representation (ownership is not visible from this header)
  std::size_t marked_pos;  // Some marked position in the sequence
} Sequence;

/**
 * States named after the FASTA format and the four kinds of FASTQ lines.
 * FASTX_UNINIT is the default `state` argument of
 * OnlineFasta::getInterestingLine().
 * NOTE(review): presumably these are the parser states of the reader - confirm
 * against the implementation.
 */
enum fasta_state {
  FASTX_UNINIT     = 0,
  FASTX_FASTA      = 1,
  FASTX_FASTQ_ID   = 2,
  FASTX_FASTQ_SEQ  = 3,
  FASTX_FASTQ_SEP  = 4,
  FASTX_FASTQ_QUAL = 5
};

#include "tools.h"

// NOTE(review): presumably returns the size of the file named `filename`
// (unit and error behaviour are not visible from this header) - confirm
// against the definition.
unsigned long long filesize(const char* filename);

/**
 * Collection of sequences kept in memory (see `reads`), filled from a file,
 * from a stream or one Sequence at a time.
 */
class Fasta
{
  // Everything up to `public:` is private (default access of a class).

  // NOTE(review): presumably the initialisation shared by the constructors -
  // confirm in the implementation. Note that mark_pos is a size_t here
  // whereas the member and the public constructor use an int.
  void init(int extract_field, string extract_separator, size_t mark_pos=0);

  size_t total_size;         // NOTE(review): presumably what totalSize() returns - confirm
  int extract_field;
  int mark_pos;
  string extract_separator;

  vector<Sequence> reads;    // The recorded sequences
  // ostream *oout ;

public:
  Fasta(int extract_field=0, string extract_separator="|", int mark_pos=0);

  /**
   * Read all the sequences in the input filename and record them in the object.
   *
   * @throws invalid_argument if filename or file content is not
   *         valid
   */
  Fasta(const string &input,
        int extract_field=0, string extract_separator="|",
        bool verbose=true);

  Fasta(bool virtualfasta, const string name); // virtualfasta unused

  string name;
  string basename;

  // NOTE(review): presumably the number of recorded sequences - confirm
  int size() const;
  size_t totalSize() const;

  /**
   * Get all the sequences from the FASTA file
   * @return a list of sequences in the same order as in the input file
   */
  list<Sequence> getAll() const;

  // Accessors by index.
  // NOTE(review): presumably `index` is a 0-based position in `reads`;
  // whether it is bounds-checked is not visible here - confirm.
  const string& label(int index) const;
  const string& label_full(int index) const;
  const Sequence &read(int index) const;
  const string& sequence(int index) const;

  /**
   * Add the content of the stream to the current object
   */
  void add(istream &in, bool verbose=true);

  /**
   * Add the content of the file to the current object
   * @throws invalid_argument if the file cannot be opened or
   *         if the content is not valid
   */
  void add(const string &filename, bool verbose=true);

  // Add a single sequence (passed by value) to the current object
  void add(const Sequence sequence);

  friend istream& operator>>(istream&, Fasta&);
};

/**
 * Read a FASTA/FASTQ file.
 * Space complexity: constant. Only one sequence is stored at most in memory.
 * The file is read online meaning that we cannot access a random sequence.
 */
class OnlineFasta {
 private:
  Sequence current;
98 99
  int current_gaps;

Mikaël Salson's avatar
Mikaël Salson committed
100 101 102 103 104 105
  istream *input;
  int extract_field;
  string extract_separator;
  string line;
  bool input_allocated;
  size_t line_nb;
106
  unsigned long long char_nb;
Mikaël Salson's avatar
Mikaël Salson committed
107

108
  int mark_pos;
109 110
  void addLineToCurrentSequence(string line);

111
  int nb_sequences_parsed;
112
  int nb_sequences_returned;
113 114 115
  int nb_sequences_max;
  int only_nth_sequence;

Mikaël Salson's avatar
Mikaël Salson committed
116 117 118 119 120
 public:

  /**
   * Default constructor
   */
121 122
  OnlineFasta(int extract_field=0, string extract_separator="|",
              int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
Mikaël Salson's avatar
Mikaël Salson committed
123 124

  /**
Vidjil Team's avatar
Vidjil Team committed
125
   * Open the file. No sequence is read at first.
Mikaël Salson's avatar
Mikaël Salson committed
126 127
   * @post getSequence() does not return the first sequence yet. 
   *       next() must be called first.
128 129
   * @throws invalid_argument if file cannot be opened or is not
   *         well-formed
Mikaël Salson's avatar
Mikaël Salson committed
130 131
   */
  OnlineFasta(const string &input, 
132 133
              int extract_field=0, string extract_separator="|",
              int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
Mikaël Salson's avatar
Mikaël Salson committed
134 135

  OnlineFasta(istream &input, 
136 137
              int extract_field=0, string extract_separator="|",
              int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
Mikaël Salson's avatar
Mikaël Salson committed
138 139

  ~OnlineFasta();
140

141 142 143 144 145
  /**
   * sets a position to be followed in gapped sequences
   */
  void setMarkPos(int mark_pos);

146 147 148 149 150
  /**
   * @return the position in the file
   */
  unsigned long long getPos();

Mikaël Salson's avatar
Mikaël Salson committed
151 152 153 154 155 156 157 158 159 160 161 162 163 164
  /**
   * @return the current line number
   */
  size_t getLineNb();

  /**
   * @return the current sequence or an undetermined sequence if the end
   * of the file is reached
   */
  Sequence getSequence();

  /**
   * @return true iff we did not reach yet the end of the file.
   */
165 166 167 168 169
  bool hasNextData();

  /**
   * @return true iff we did not reach yet both the end of the file and the maximal number of returned sequences
   */
Mikaël Salson's avatar
Mikaël Salson committed
170 171 172 173 174
  bool hasNext();

  /**
   * Go to the next sequence in the file.
   * @post hasNext() ==> getSequence() returns the following sequence in the file.
175
   * @throws invalid_argument if the file is not well formated
Mikaël Salson's avatar
Mikaël Salson committed
176 177 178 179 180 181 182 183 184 185
   */
  void next();

 private:

  /**
   * Initialisation of the object
   */
  void init();

186 187 188 189 190
  /**
   * Skip to the next sequence that is a multiple of 'only_nth_sequence'
   */
  void skipToNthSequence();

Mikaël Salson's avatar
Mikaël Salson committed
191 192 193 194 195
  /**
   * Reads line in the input stream until we have a line with at least one
   * non-whitespace character.
   * @return A non-empty string whose trailing whitespaces have been removed
   */
196
  string getInterestingLine(int state = FASTX_UNINIT);
Mikaël Salson's avatar
Mikaël Salson committed
197 198 199 200 201 202 203 204 205 206 207 208

  /**
   * Called when we have an unexcepted EOF.
   * @throws exception
   */
  void unexpectedEOF();
};

// Stream operators for Fasta and Sequence; their definitions are not in this
// header. operator>> is also declared as a friend inside class Fasta.
istream& operator>>(istream& in, Fasta& fasta);
// Note: takes a non-const Fasta&.
ostream& operator<<(ostream& out, Fasta& fasta);
ostream &operator<<(ostream &out, const Sequence &seq);

// Two placeholder Fasta objects built with the (bool, name) constructor,
// whose first argument is unused; they are named "_" and "?" respectively.
const Fasta FASTA_UNKNOWN(true, "_");
const Fasta FASTA_AMBIGUOUS(true, "?");

212 213 214 215
/**
 * Count the number of sequences in a Fasta file
 * @param f      NOTE(review): presumably the name of the file - confirm
 * @param approx NOTE(review): presumably asks for the estimate computed by
 *               approx_nb_sequences_in_fasta() instead of an exact count - confirm
 * @return the number of sequences
 */
int nb_sequences_in_fasta(string f, bool approx = false);
int approx_nb_sequences_in_fasta(string f);

#endif