fasta.h 4.34 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2 3 4 5 6
#ifndef FASTA_H
#define FASTA_H

#include <istream>
#include <string>
#include <vector>
Mathieu Giraud's avatar
Mathieu Giraud committed
7
#include <list>
Mikaël Salson's avatar
Mikaël Salson committed
8 9 10

using namespace std;

11 12
typedef string seqtype ;

Mikaël Salson's avatar
Mikaël Salson committed
13 14 15 16
/**
 * One read (record) taken from the sequence input.
 *
 * The struct is deliberately kept a plain aggregate: it declares no
 * constructor and no default member initializer, so brace initialization
 * in member order keeps working. As a consequence a default-initialized
 * Sequence has an indeterminate `seq` pointer until the caller sets it.
 *
 * Members are spelled with std:: so that the struct does not depend on a
 * using-directive being in effect.
 */
typedef struct read_t
{
  std::string label_full; // NOTE(review): presumably the complete header line -- confirm in fasta.cpp
  std::string label;      // NOTE(review): presumably the short label extracted from label_full -- confirm
  std::string sequence;   // Sequence: original string representation
  std::string quality;    // NOTE(review): presumably the FASTQ quality string -- confirm
  int*        seq;        // Sequence: seq representation (ownership is not visible in this header)
} Sequence;

/**
 * Named states of type fasta_state.
 *
 * FASTX_UNINIT is the default `state` argument of
 * OnlineFasta::getInterestingLine().
 * NOTE(review): the FASTX_FASTQ_* names suggest the four line kinds of a
 * FASTQ record (identifier, sequence, separator, quality) -- confirm
 * against the parser implementation.
 *
 * The enumerators keep their implicit values (0 to 5, in declaration order).
 */
typedef enum {
  FASTX_UNINIT,
  FASTX_FASTA,
  FASTX_FASTQ_ID,
  FASTX_FASTQ_SEQ,
  FASTX_FASTQ_SEP,
  FASTX_FASTQ_QUAL
} fasta_state;

Mathieu Giraud's avatar
Mathieu Giraud committed
28 29
#include "tools.h"

30 31
/**
 * Declared here; the definition is not part of this header.
 * NOTE(review): presumably returns the size of the file named `filename`
 * (unit and error behaviour are not visible here) -- confirm in the
 * implementation.
 */
unsigned long long filesize(const char* filename);

Mikaël Salson's avatar
Mikaël Salson committed
32 33
class Fasta
{
34 35
        void init(int extract_field, string extract_separator);

Mikaël Salson's avatar
Mikaël Salson committed
36 37 38 39 40 41 42 43
        int total_size;
        int extract_field;
	string extract_separator;
	
	vector<Sequence> reads;
	// ostream *oout ;

public:
44
        Fasta(int extract_field=0, string extract_separator="|");
Mikaël Salson's avatar
Mikaël Salson committed
45 46 47
        /**
         * Read all the sequences in the input filename and record them in the object.
         *
48 49 50
         * @throws invalid_argument if filename or file content is not
         *         valid
         */
Mikaël Salson's avatar
Mikaël Salson committed
51
	Fasta(const string &input, 
52 53
	      int extract_field=0, string extract_separator="|",
              bool verbose=true);
Mikaël Salson's avatar
Mikaël Salson committed
54 55
	
	int size() const;
Mathieu Giraud's avatar
Mathieu Giraud committed
56 57 58 59 60
        /**
         * Get all the sequences from the FASTA file
         * @return a list of sequences in the same order as in the input file
         */
        list<Sequence> getAll() const;
Mikaël Salson's avatar
Mikaël Salson committed
61 62 63 64
	const string& label(int index) const;
	const string& label_full(int index) const;
        const Sequence &read(int index) const;
	const string& sequence(int index) const;
65

66 67 68
        /**
         * Add the content of the stream to the current object
         */
69
        void add(istream &in, bool verbose=true);
70 71 72 73 74
        /**
         * Add the content of the file to the current object
         * @throws invalid_argument if the file cannot be opened or
         *         if the content is not valid
         */
75
        void add(const string &filename, bool verbose=true);
76 77

        void add(const Sequence sequence);
Mikaël Salson's avatar
Mikaël Salson committed
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
	
	friend istream& operator>>(istream&, Fasta&);
};

/**
 * Read a FASTA/FASTQ file.
 * Space complexity: constant. Only one sequence is stored at most in memory.
 * The file is read online meaning that we cannot access a random sequence.
 */
class OnlineFasta {
 private:
  Sequence current;
  istream *input;
  int extract_field;
  string extract_separator;
  string line;
  bool input_allocated;
  size_t line_nb;
96
  unsigned long long char_nb;
Mikaël Salson's avatar
Mikaël Salson committed
97 98 99 100 101 102 103 104 105

 public:

  /**
   * Default constructor
   */
  OnlineFasta(int extract_field=0, string extract_separator="|");

  /**
Vidjil Team's avatar
Vidjil Team committed
106
   * Open the file. No sequence is read at first.
Mikaël Salson's avatar
Mikaël Salson committed
107 108
   * @post getSequence() does not return the first sequence yet. 
   *       next() must be called first.
109 110
   * @throws invalid_argument if file cannot be opened or is not
   *         well-formed
Mikaël Salson's avatar
Mikaël Salson committed
111 112 113 114 115 116 117 118
   */
  OnlineFasta(const string &input, 
              int extract_field=0, string extract_separator="|");

  OnlineFasta(istream &input, 
              int extract_field=0, string extract_separator="|");

  ~OnlineFasta();
119 120 121 122 123 124

  /**
   * @return the position in the file
   */
  unsigned long long getPos();

Mikaël Salson's avatar
Mikaël Salson committed
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
  /**
   * @return the current line number
   */
  size_t getLineNb();

  /**
   * @return the current sequence or an undetermined sequence if the end
   * of the file is reached
   */
  Sequence getSequence();

  /**
   * @return true iff we did not reach yet the end of the file.
   */
  bool hasNext();

  /**
   * Go to the next sequence in the file.
   * @post hasNext() ==> getSequence() returns the following sequence in the file.
144
   * @throws invalid_argument if the file is not well formated
Mikaël Salson's avatar
Mikaël Salson committed
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
   */
  void next();

 private:

  /**
   * Initialisation of the object
   */
  void init();

  /**
   * Reads line in the input stream until we have a line with at least one
   * non-whitespace character.
   * @return A non-empty string whose trailing whitespaces have been removed
   */
160
  string getInterestingLine(int state = FASTX_UNINIT);
Mikaël Salson's avatar
Mikaël Salson committed
161 162 163 164 165 166 167 168 169 170 171 172

  /**
   * Called when we have an unexcepted EOF.
   * @throws exception
   */
  void unexpectedEOF();
};

/**
 * Stream operators for Fasta and Sequence. They are only declared in this
 * header; the definitions live elsewhere.
 * operator>> on Fasta is also declared as a friend inside class Fasta.
 * NOTE(review): operator<< takes the Fasta by non-const reference although
 * the Sequence overload takes a const reference -- check whether the
 * definition really needs to modify the object.
 */
istream& operator>>(istream& in, Fasta& fasta);
ostream& operator<<(ostream& out, Fasta& fasta);
ostream &operator<<(ostream &out, const Sequence &seq);

173 174 175 176
/**
 * Count the number of sequences in a Fasta file
 * @return the number of sequences
 */
177 178
// `f` is taken by value; `approx` defaults to false.
// NOTE(review): `f` is presumably the path of the file to scan, and
// `approx` presumably selects an estimated count instead of an exact one
// -- confirm in the implementation.
int nb_sequences_in_fasta(string f, bool approx = false);
// NOTE(review): presumably the estimating variant of the function above
// -- confirm in the implementation.
int approx_nb_sequences_in_fasta(string f);
179

Mikaël Salson's avatar
Mikaël Salson committed
180
#endif