fasta.cpp 10.7 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1
/*
2 3 4 5 6 7 8
  This file is part of Vidjil <http://www.vidjil.org>
  Copyright (C) 2011, 2012, 2013, 2014, 2015 by Bonsai bioinformatics 
  at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
  Contributors: 
      Mathieu Giraud <mathieu.giraud@vidjil.org>
      Mikaël Salson <mikael.salson@vidjil.org>
      Marc Duez <marc.duez@vidjil.org>
Mikaël Salson's avatar
Mikaël Salson committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include <fstream>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include "fasta.h"

32
#include "../lib/gzstream.h"
33

34 35 36 37 38 39 40 41

// http://stackoverflow.com/a/5840160/4475279
unsigned long long filesize(const char* filename)
{
    std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary);
    return in.tellg();
}

42
void Fasta::init(int extract_field, string extract_separator, size_t mark_pos)
43 44 45
{
  this -> extract_field = extract_field ;
  this -> extract_separator = extract_separator ; 
46
  this -> mark_pos = mark_pos;
47
  total_size = 0;
48
  name = "";
49
  basename = "";
50 51 52 53 54 55
}

Fasta::Fasta(bool virtualfasta, string name)
{
  init(0, "");
  this -> name = name;
56
  basename = extract_basename(name);
57 58
}

59
Fasta::Fasta(int extract_field, string extract_separator, int mark_pos)
60
{
61
  init(extract_field, extract_separator, mark_pos);
Mikaël Salson's avatar
Mikaël Salson committed
62 63 64
}

Fasta::Fasta(const string &input, 
65 66
	     int extract_field, string extract_separator,
             bool verbose) 
Mikaël Salson's avatar
Mikaël Salson committed
67
{
68 69
  init(extract_field, extract_separator);

70
  if (!input.size()) // Do not open empty filenames (D germline if not segmentD)
Mikaël Salson's avatar
Mikaël Salson committed
71 72
    return ;

73
  add(input, verbose);  
74 75
}

76
void Fasta::add(istream &in, bool verbose) {
77
  in >> *this;
78

79
  if (verbose)
80
  cout << "\t" << setw(6) << total_size << " bp in " << setw(3) << size() << " sequences" << endl ;
81 82
}

83
void Fasta::add(const string &filename, bool verbose) {
84
  igzstream is(filename.c_str());
Mikaël Salson's avatar
Mikaël Salson committed
85 86
  if (is.fail())
    {
87
      throw invalid_argument(" !! Error in opening file: "+ filename);
Mikaël Salson's avatar
Mikaël Salson committed
88
    }
89

90 91 92 93 94 95 96 97
  if (name.size())
    name += " ";

  if (basename.size())
    basename += " ";

  name += filename;
  basename += extract_basename(filename);
98

99
  if (verbose)
100
  cout << " <== " << filename ;
101
  add(is, verbose);
Mikaël Salson's avatar
Mikaël Salson committed
102 103 104
  is.close();
}

105 106 107 108 109
void Fasta::add(Sequence seq) {
  reads.push_back(seq);
  total_size += seq.sequence.size();
}

Mikaël Salson's avatar
Mikaël Salson committed
110
int Fasta::size() const{ return (int)reads.size(); }
111
size_t Fasta::totalSize() const { return total_size ; }
112

Mathieu Giraud's avatar
Mathieu Giraud committed
113 114 115 116 117 118 119 120 121
list<Sequence> Fasta::getAll() const {
  list<Sequence> reads;

  for (int i=0; i < size(); i++) {
    reads.push_back(read(i));
  }

  return reads;
}
Mikaël Salson's avatar
Mikaël Salson committed
122 123 124 125 126 127 128
const string& Fasta::label(int index) const{ return reads[index].label; }
const string& Fasta::label_full(int index) const{ return reads[index].label_full; }
const Sequence& Fasta::read(int index) const {return reads[index];}
const string& Fasta::sequence(int index) const{ return reads[index].sequence; }

// OnlineFasta

129 130 131 132 133
OnlineFasta::OnlineFasta(int extract_field, string extract_separator,
                         int nb_sequences_max, int only_nth_sequence):
  input(NULL),
  extract_field(extract_field), extract_separator(extract_separator),
  nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence){}
Mikaël Salson's avatar
Mikaël Salson committed
134 135

OnlineFasta::OnlineFasta(const string &input, 
136 137
                         int extract_field, string extract_separator,
                         int nb_sequences_max, int only_nth_sequence)
138
  :input(new igzstream(input.c_str())),
Mikaël Salson's avatar
Mikaël Salson committed
139
  extract_field(extract_field), 
140 141
   extract_separator(extract_separator),
   nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
Mikaël Salson's avatar
Mikaël Salson committed
142 143
{
  if (this->input->fail()) {
144
    delete this->input;
145
    throw invalid_argument("!! Error in opening file "+input);
Mikaël Salson's avatar
Mikaël Salson committed
146
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
147 148

  cout << "  <== " << input << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
149 150 151 152 153
  input_allocated = true;
  init();
}

OnlineFasta::OnlineFasta(istream &input, 
154 155
                         int extract_field, string extract_separator,
                         int nb_sequences_max, int only_nth_sequence)
156
  :input(&input), extract_field(extract_field),
157 158
   extract_separator(extract_separator),
   nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
Mikaël Salson's avatar
Mikaël Salson committed
159 160 161 162 163 164 165 166
{
  input_allocated = false;
  init();
}

OnlineFasta::~OnlineFasta() {
  if (input_allocated)
    delete input;
167 168
  if (current.seq)
    delete [] current.seq;
Mikaël Salson's avatar
Mikaël Salson committed
169 170 171
}

void OnlineFasta::init() {
172
  mark_pos = 0;
173
  nb_sequences_parsed = 0;
174
  nb_sequences_returned = 0;
175
  char_nb = 0;
Mikaël Salson's avatar
Mikaël Salson committed
176
  line_nb = 0;
177
  current.seq = NULL;
178
  current.marked_pos = 0;
179
  current_gaps = 0;
180 181

  line = getInterestingLine();
Mikaël Salson's avatar
Mikaël Salson committed
182 183
}

184 185 186 187
unsigned long long OnlineFasta::getPos() {
  return char_nb;
}

188 189 190 191
void OnlineFasta::setMarkPos(int mark_pos) {
  this -> mark_pos = mark_pos;
}

Mikaël Salson's avatar
Mikaël Salson committed
192 193 194 195 196 197 198 199
size_t OnlineFasta::getLineNb() {
  return line_nb;
}

Sequence OnlineFasta::getSequence() {
  return current;
}

200 201 202 203 204

bool OnlineFasta::hasNextData() {
  return ((!input->eof()) || line.length() > 0);
}

Mikaël Salson's avatar
Mikaël Salson committed
205
bool OnlineFasta::hasNext() {
206
  return hasNextData()
207
    && ((nb_sequences_max == NO_LIMIT_VALUE) || (nb_sequences_returned < nb_sequences_max));
Mikaël Salson's avatar
Mikaël Salson committed
208 209
}

210 211
void OnlineFasta::skipToNthSequence() {
  // Possibly skip some reads, when only_nth_sequence > 1
212
  while (hasNextData())
213 214 215 216 217 218 219 220 221 222
    if (nb_sequences_parsed % only_nth_sequence)
      {
        nb_sequences_returned--;
        next();
        continue ;
      }
    else
      return  ;
}

223 224
void OnlineFasta::addLineToCurrentSequence(string line)
{
225 226 227 228 229 230 231 232 233 234 235
  for (char& c : line)
    {
      if (c == ' ')
        continue ;

      if (c == '.') {
        current_gaps++;
        continue ;
      }

      current.sequence += c;
236 237

      if (mark_pos) {
238
        if ((int) current.sequence.length() + current_gaps == mark_pos)
239 240
          current.marked_pos = current.sequence.length();
      }
241
    }
242
}
243

Mikaël Salson's avatar
Mikaël Salson committed
244 245 246 247 248 249 250 251
void OnlineFasta::next() {
  fasta_state state = FASTX_UNINIT;

  // Reinit the Sequence object
  current.label_full.erase();
  current.label.erase();
  current.sequence.erase();
  current.quality.erase();
252
  if (current.seq) {
253
    delete [] current.seq;
254
    current.seq = NULL;
255
    current.marked_pos = 0;
256
    current_gaps = 0;
257
  }
Mikaël Salson's avatar
Mikaël Salson committed
258

259
  if  (hasNextData()) {
Mikaël Salson's avatar
Mikaël Salson committed
260 261 262 263
    switch(line[0]) {
    case '>': state=FASTX_FASTA; break;
    case '@': state=FASTX_FASTQ_ID; break;
    default: 
264
      throw invalid_argument("The file seems to be malformed!");
Mikaël Salson's avatar
Mikaël Salson committed
265 266 267
    }
    
    // Identifier line
268
    nb_sequences_parsed++;
269
    nb_sequences_returned++;
Mikaël Salson's avatar
Mikaël Salson committed
270 271 272 273
    current.label_full = line.substr(1);
    current.label = extract_from_label(current.label_full, extract_field, extract_separator);

    line = getInterestingLine();
274
    while (hasNextData() && ((state != FASTX_FASTA || line[0] != '>')
Mikaël Salson's avatar
Mikaël Salson committed
275 276
                         && (state != FASTX_FASTQ_QUAL || line[0] != '@'))) {

277
      if (hasNextData()) {
Mikaël Salson's avatar
Mikaël Salson committed
278 279 280
        switch(state) {
        case FASTX_FASTA: case FASTX_FASTQ_ID:
          // Sequence
281
          addLineToCurrentSequence(line);
Mikaël Salson's avatar
Mikaël Salson committed
282 283 284
          break;
        case FASTX_FASTQ_SEQ:
          // FASTQ separator between sequence and quality
285 286
          if (line[0] != '+')
            throw invalid_argument("Expected line starting with + in FASTQ file");
Mikaël Salson's avatar
Mikaël Salson committed
287 288 289 290
          break;
        case FASTX_FASTQ_SEP:
          // Reading quality
          current.quality = line;
291 292
          if (current.quality.length() != current.sequence.length())
            throw invalid_argument("Quality and sequence don't have the same length ");
Mikaël Salson's avatar
Mikaël Salson committed
293 294
          break;
        default:
295
          throw invalid_argument("Unexpected state after reading identifiers line");
Mikaël Salson's avatar
Mikaël Salson committed
296 297 298 299 300 301
        }
        if (state >= FASTX_FASTQ_ID && state <= FASTX_FASTQ_SEP)
          state = (fasta_state)(((int)state) + 1);
      } else {
        unexpectedEOF();
      }
302
      line = getInterestingLine(state);
Mikaël Salson's avatar
Mikaël Salson committed
303 304 305 306 307 308 309
    }

    if (state >= FASTX_FASTQ_ID && state < FASTX_FASTQ_QUAL) 
      unexpectedEOF();

    // Sequence in uppercase
    transform(current.sequence.begin(), current.sequence.end(), current.sequence.begin(), (int (*)(int))toupper);
310 311 312

    // Compute seq
    current.seq = new int[current.sequence.length()];
313
    for (unsigned int i=0; i< current.sequence.length(); i++)
314
      {
315
	current.seq[i] = nuc_to_int(current.sequence[i]) ;
316 317
      }

Mikaël Salson's avatar
Mikaël Salson committed
318 319
  } else
    unexpectedEOF();
320 321

  skipToNthSequence();
Mikaël Salson's avatar
Mikaël Salson committed
322 323
}

324
string OnlineFasta::getInterestingLine(int state) {
Mikaël Salson's avatar
Mikaël Salson committed
325
  string line;
326
  while (line.length() == 0 && hasNextData() && getline(*input, line)) {
Mikaël Salson's avatar
Mikaël Salson committed
327
    line_nb++;
328
    char_nb += line.length() + 1;
Mikaël Salson's avatar
Mikaël Salson committed
329
    remove_trailing_whitespaces(line);
330 331 332

    if (line.length() && line[0] == '#' && state != FASTX_FASTQ_SEP)
      line = "" ;
Mikaël Salson's avatar
Mikaël Salson committed
333 334 335 336 337
  }
  return line;
}

void OnlineFasta::unexpectedEOF() {
338
  throw invalid_argument("Unexpected EOF while reading FASTA/FASTQ file");
Mikaël Salson's avatar
Mikaël Salson committed
339 340 341 342 343 344 345 346
}

// Operators

istream& operator>>(istream& in, Fasta& fasta){
	string line;
	Sequence read;
        OnlineFasta of(in, fasta.extract_field, fasta.extract_separator);
347
        of.setMarkPos(fasta.mark_pos);
Mikaël Salson's avatar
Mikaël Salson committed
348 349 350

        while (of.hasNext()) {
          of.next();
351
          fasta.add(of.getSequence());
Mikaël Salson's avatar
Mikaël Salson committed
352 353 354 355 356 357
        }
	return in;
}

ostream& operator<<(ostream& out, Fasta& fasta){
	for(int i = 0 ; i < fasta.size() ; i++){
358
          out << fasta.read(i);
Mikaël Salson's avatar
Mikaël Salson committed
359 360 361 362 363
	}
	return out;
}

ostream &operator<<(ostream &out, const Sequence &seq) {
364 365 366 367 368 369
  bool is_fastq=false;
  if (seq.quality.length() > 0) {
    is_fastq = true;
    out << "@";
  } else
    out << ">";
370 371 372 373 374 375 376 377
  out << seq.label;

  if (seq.marked_pos) {
    out << " !@" << seq.marked_pos ;
  }

  out << endl;

Mikaël Salson's avatar
Mikaël Salson committed
378
  out << seq.sequence << endl;
379 380 381
  if (is_fastq) {
    out << "+" << endl << seq.quality << endl;
  }
Mikaël Salson's avatar
Mikaël Salson committed
382 383
  return out;
}
384

385
int nb_sequences_in_fasta(string f, bool approx)
386
{
387 388 389
  if (approx)
    return approx_nb_sequences_in_fasta(f);

390 391 392 393 394 395 396 397 398 399
  OnlineFasta *sequences = new OnlineFasta(f, 1, " ");
  int nb_sequences = 0 ;

  while (sequences->hasNext())
    {
      sequences->next();
      nb_sequences++ ;
    }

  cout << "  ==> " << nb_sequences << " sequences" << endl;
400 401

  delete sequences ;
402 403
  return nb_sequences ;
}
404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429


#define SAMPLE_APPROX_NB_SEQUENCES 200

int approx_nb_sequences_in_fasta(string f)
{
  OnlineFasta *sequences = new OnlineFasta(f, 1, " ");
  int nb_sequences = 0 ;

  while (nb_sequences < SAMPLE_APPROX_NB_SEQUENCES && sequences->hasNext())
    {
      sequences->next();
      nb_sequences++ ;
    }

  cout << "  ==> " ;

  if (sequences->hasNext())
    {
      cout << "approx. " ;
      float ratio = (float) filesize(f.c_str()) / (float) sequences->getPos();
      nb_sequences = (int) (ratio * nb_sequences);
    }

  cout << nb_sequences << " sequences" << endl;

430
  delete sequences ;
431 432
  return nb_sequences ;
}