Mise à jour terminée. Pour connaître les apports de la version 13.8.4 par rapport à notre ancienne version vous pouvez lire les "Release Notes" suivantes :
https://about.gitlab.com/releases/2021/02/11/security-release-gitlab-13-8-4-released/
https://about.gitlab.com/releases/2021/02/05/gitlab-13-8-3-released/

fasta.cpp 7.05 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vid>
  Copyright (C) 2011, 2012, 2013 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, David Chatel

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include <fstream>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include "fasta.h"

28 29 30 31 32 33 34 35 36 37 38

void Fasta::init(int extract_field, string extract_separator)
{
  this -> extract_field = extract_field ;
  this -> extract_separator = extract_separator ; 
  total_size = 0;
}

Fasta::Fasta(int extract_field, string extract_separator)
{
  init(extract_field, extract_separator);
Mikaël Salson's avatar
Mikaël Salson committed
39 40 41
}

Fasta::Fasta(const string &input, 
42 43
	     int extract_field, string extract_separator,
             bool verbose) 
Mikaël Salson's avatar
Mikaël Salson committed
44
{
45 46
  init(extract_field, extract_separator);

47
  if (!input.size()) // Do not open empty filenames (D germline if not segmentD)
Mikaël Salson's avatar
Mikaël Salson committed
48 49
    return ;

50
  add(input, verbose);  
51 52
}

53
void Fasta::add(istream &in, bool verbose) {
54
  in >> *this;
55

56
  if (verbose)
57
  cout << "\t" << setw(6) << total_size << " bp in " << setw(3) << size() << " sequences" << endl ;
58 59
}

60
void Fasta::add(const string &filename, bool verbose) {
61
  ifstream is(filename.c_str());
Mikaël Salson's avatar
Mikaël Salson committed
62 63
  if (is.fail())
    {
64
      throw invalid_argument(" !! Error in opening file: "+ filename);
Mikaël Salson's avatar
Mikaël Salson committed
65
    }
66

67
  if (verbose)
68
  cout << " <== " << filename ;
69
  add(is, verbose);
Mikaël Salson's avatar
Mikaël Salson committed
70 71 72 73
  is.close();
}

int Fasta::size() const{ return (int)reads.size(); }
Mathieu Giraud's avatar
Mathieu Giraud committed
74 75 76 77 78 79 80 81 82
list<Sequence> Fasta::getAll() const {
  list<Sequence> reads;

  for (int i=0; i < size(); i++) {
    reads.push_back(read(i));
  }

  return reads;
}
Mikaël Salson's avatar
Mikaël Salson committed
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
const string& Fasta::label(int index) const{ return reads[index].label; }
const string& Fasta::label_full(int index) const{ return reads[index].label_full; }
const Sequence& Fasta::read(int index) const {return reads[index];}
const string& Fasta::sequence(int index) const{ return reads[index].sequence; }

// OnlineFasta

OnlineFasta::OnlineFasta(int extract_field, string extract_separator):
  input(NULL), extract_field(extract_field), extract_separator(extract_separator){}

OnlineFasta::OnlineFasta(const string &input, 
                         int extract_field, string extract_separator)
  :input(new ifstream(input.c_str())), 
  extract_field(extract_field), 
  extract_separator(extract_separator)
{
  if (this->input->fail()) {
100
    delete this->input;
101
    throw invalid_argument("!! Error in opening file "+input);
Mikaël Salson's avatar
Mikaël Salson committed
102
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
103 104

  cout << "  <== " << input << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
105 106 107 108 109 110
  input_allocated = true;
  init();
}

OnlineFasta::OnlineFasta(istream &input, 
                         int extract_field, string extract_separator)
111
  :input(&input), extract_field(extract_field),
Mikaël Salson's avatar
Mikaël Salson committed
112 113 114 115 116 117 118 119 120
  extract_separator(extract_separator)
{
  input_allocated = false;
  init();
}

OnlineFasta::~OnlineFasta() {
  if (input_allocated)
    delete input;
121 122
  if (current.seq)
    delete [] current.seq;
Mikaël Salson's avatar
Mikaël Salson committed
123 124 125 126 127
}

void OnlineFasta::init() {
  line_nb = 0;
  line = getInterestingLine();
128
  current.seq = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
}

size_t OnlineFasta::getLineNb() {
  return line_nb;
}

Sequence OnlineFasta::getSequence() {
  return current;
}

bool OnlineFasta::hasNext() {
  return (! input->eof()) || line.length() > 0;
}

void OnlineFasta::next() {
  fasta_state state = FASTX_UNINIT;

  // Reinit the Sequence object
  current.label_full.erase();
  current.label.erase();
  current.sequence.erase();
  current.quality.erase();
151 152
  if (current.seq)
    delete [] current.seq;
Mikaël Salson's avatar
Mikaël Salson committed
153 154 155 156 157 158

  if  (hasNext()) {
    switch(line[0]) {
    case '>': state=FASTX_FASTA; break;
    case '@': state=FASTX_FASTQ_ID; break;
    default: 
159
      throw invalid_argument("The file seems to be malformed!");
Mikaël Salson's avatar
Mikaël Salson committed
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
    }
    
    // Identifier line
    current.label_full = line.substr(1);
    current.label = extract_from_label(current.label_full, extract_field, extract_separator);

    line = getInterestingLine();
    while (hasNext() && ((state != FASTX_FASTA || line[0] != '>')
                         && (state != FASTX_FASTQ_QUAL || line[0] != '@'))) {

      if (hasNext()) {
        switch(state) {
        case FASTX_FASTA: case FASTX_FASTQ_ID:
          // Sequence
          current.sequence += line;
          break;
        case FASTX_FASTQ_SEQ:
          // FASTQ separator between sequence and quality
178 179
          if (line[0] != '+')
            throw invalid_argument("Expected line starting with + in FASTQ file");
Mikaël Salson's avatar
Mikaël Salson committed
180 181 182 183
          break;
        case FASTX_FASTQ_SEP:
          // Reading quality
          current.quality = line;
184 185
          if (current.quality.length() != current.sequence.length())
            throw invalid_argument("Quality and sequence don't have the same length ");
Mikaël Salson's avatar
Mikaël Salson committed
186 187
          break;
        default:
188
          throw invalid_argument("Unexpected state after reading identifiers line");
Mikaël Salson's avatar
Mikaël Salson committed
189 190 191 192 193 194 195 196 197 198 199 200 201 202
        }
        if (state >= FASTX_FASTQ_ID && state <= FASTX_FASTQ_SEP)
          state = (fasta_state)(((int)state) + 1);
      } else {
        unexpectedEOF();
      }
      line = getInterestingLine();
    }

    if (state >= FASTX_FASTQ_ID && state < FASTX_FASTQ_QUAL) 
      unexpectedEOF();

    // Sequence in uppercase
    transform(current.sequence.begin(), current.sequence.end(), current.sequence.begin(), (int (*)(int))toupper);
203 204 205

    // Compute seq
    current.seq = new int[current.sequence.length()];
206
    for (unsigned int i=0; i< current.sequence.length(); i++)
207
      {
208
	current.seq[i] = nuc_to_int(current.sequence[i]) ;
209 210
      }

Mikaël Salson's avatar
Mikaël Salson committed
211 212 213 214 215 216 217 218 219 220 221 222 223 224
  } else
    unexpectedEOF();
}

string OnlineFasta::getInterestingLine() {
  string line;
  while (line.length() == 0 && hasNext() && getline(*input, line)) {
    line_nb++;
    remove_trailing_whitespaces(line);
  }
  return line;
}

void OnlineFasta::unexpectedEOF() {
225
  throw invalid_argument("Unexpected EOF while reading FASTA/FASTQ file");
Mikaël Salson's avatar
Mikaël Salson committed
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
}

// Operators

istream& operator>>(istream& in, Fasta& fasta){
	string line;
	Sequence read;
        OnlineFasta of(in, fasta.extract_field, fasta.extract_separator);

        while (of.hasNext()) {
          of.next();
          fasta.reads.push_back(of.getSequence());
          fasta.total_size += of.getSequence().sequence.size();
        }
	return in;
}

ostream& operator<<(ostream& out, Fasta& fasta){
	for(int i = 0 ; i < fasta.size() ; i++){
245
          out << fasta.read(i);
Mikaël Salson's avatar
Mikaël Salson committed
246 247 248 249 250
	}
	return out;
}

ostream &operator<<(ostream &out, const Sequence &seq) {
251 252 253 254 255 256 257
  bool is_fastq=false;
  if (seq.quality.length() > 0) {
    is_fastq = true;
    out << "@";
  } else
    out << ">";
  out << seq.label << endl;
Mikaël Salson's avatar
Mikaël Salson committed
258
  out << seq.sequence << endl;
259 260 261
  if (is_fastq) {
    out << "+" << endl << seq.quality << endl;
  }
Mikaël Salson's avatar
Mikaël Salson committed
262 263
  return out;
}