Mise à jour terminée. Pour connaître les apports de la version 13.8.4 par rapport à notre ancienne version vous pouvez lire les "Release Notes" suivantes :
https://about.gitlab.com/releases/2021/02/11/security-release-gitlab-13-8-4-released/
https://about.gitlab.com/releases/2021/02/05/gitlab-13-8-3-released/

fasta.cpp 6.94 KB
Newer Older
Mikaël Salson's avatar
Mikaël Salson committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
/*
  This file is part of "Vidjil" <http://bioinfo.lifl.fr/vid>
  Copyright (C) 2011, 2012, 2013 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
  Contributors: Mathieu Giraud <mathieu.giraud@lifl.fr>, Mikaël Salson <mikael.salson@lifl.fr>, David Chatel

  "Vidjil" is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  "Vidjil" is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/

#include <fstream>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include "fasta.h"

28 29 30 31 32 33 34 35 36 37 38

void Fasta::init(int extract_field, string extract_separator)
{
  this -> extract_field = extract_field ;
  this -> extract_separator = extract_separator ; 
  total_size = 0;
}

Fasta::Fasta(int extract_field, string extract_separator)
{
  init(extract_field, extract_separator);
Mikaël Salson's avatar
Mikaël Salson committed
39 40 41
}

Fasta::Fasta(const string &input, 
42
	     int extract_field, string extract_separator) 
Mikaël Salson's avatar
Mikaël Salson committed
43
{
44 45
  init(extract_field, extract_separator);

46
  if (!input.size()) // Do not open empty filenames (D germline if not segmentD)
Mikaël Salson's avatar
Mikaël Salson committed
47 48
    return ;

49 50 51 52 53
  add(input);  
}

void Fasta::add(istream &in) {
  in >> *this;
54 55

  cout << "\t" << setw(6) << total_size << " bp in " << setw(3) << size() << " sequences" << endl ;
56 57 58 59
}

void Fasta::add(const string &filename) {
  ifstream is(filename.c_str());
Mikaël Salson's avatar
Mikaël Salson committed
60 61
  if (is.fail())
    {
62
      throw invalid_argument(" !! Error in opening file: "+ filename);
Mikaël Salson's avatar
Mikaël Salson committed
63
    }
64 65

  cout << " <== " << filename ;
66
  add(is);
Mikaël Salson's avatar
Mikaël Salson committed
67 68 69 70
  is.close();
}

int Fasta::size() const{ return (int)reads.size(); }
Mathieu Giraud's avatar
Mathieu Giraud committed
71 72 73 74 75 76 77 78 79
list<Sequence> Fasta::getAll() const {
  list<Sequence> reads;

  for (int i=0; i < size(); i++) {
    reads.push_back(read(i));
  }

  return reads;
}
Mikaël Salson's avatar
Mikaël Salson committed
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
const string& Fasta::label(int index) const{ return reads[index].label; }
const string& Fasta::label_full(int index) const{ return reads[index].label_full; }
const Sequence& Fasta::read(int index) const {return reads[index];}
const string& Fasta::sequence(int index) const{ return reads[index].sequence; }

// OnlineFasta

OnlineFasta::OnlineFasta(int extract_field, string extract_separator):
  input(NULL), extract_field(extract_field), extract_separator(extract_separator){}

OnlineFasta::OnlineFasta(const string &input, 
                         int extract_field, string extract_separator)
  :input(new ifstream(input.c_str())), 
  extract_field(extract_field), 
  extract_separator(extract_separator)
{
  if (this->input->fail()) {
97
    throw invalid_argument("!! Error in opening file "+input);
Mikaël Salson's avatar
Mikaël Salson committed
98
  }
Mathieu Giraud's avatar
Mathieu Giraud committed
99 100

  cout << "  <== " << input << endl ;
Mikaël Salson's avatar
Mikaël Salson committed
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
  input_allocated = true;
  init();
}

OnlineFasta::OnlineFasta(istream &input, 
                         int extract_field, string extract_separator)
  :extract_field(extract_field), 
  extract_separator(extract_separator)
{
  this->input = &input;
  input_allocated = false;
  init();
}

OnlineFasta::~OnlineFasta() {
  if (input_allocated)
    delete input;
118 119
  if (current.seq)
    delete [] current.seq;
Mikaël Salson's avatar
Mikaël Salson committed
120 121 122 123 124
}

void OnlineFasta::init() {
  line_nb = 0;
  line = getInterestingLine();
125
  current.seq = NULL;
Mikaël Salson's avatar
Mikaël Salson committed
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
}

size_t OnlineFasta::getLineNb() {
  return line_nb;
}

Sequence OnlineFasta::getSequence() {
  return current;
}

bool OnlineFasta::hasNext() {
  return (! input->eof()) || line.length() > 0;
}

void OnlineFasta::next() {
  fasta_state state = FASTX_UNINIT;

  // Reinit the Sequence object
  current.label_full.erase();
  current.label.erase();
  current.sequence.erase();
  current.quality.erase();
148 149
  if (current.seq)
    delete [] current.seq;
Mikaël Salson's avatar
Mikaël Salson committed
150 151 152 153 154 155

  if  (hasNext()) {
    switch(line[0]) {
    case '>': state=FASTX_FASTA; break;
    case '@': state=FASTX_FASTQ_ID; break;
    default: 
156
      throw invalid_argument("The file seems to be malformed!");
Mikaël Salson's avatar
Mikaël Salson committed
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
    }
    
    // Identifier line
    current.label_full = line.substr(1);
    current.label = extract_from_label(current.label_full, extract_field, extract_separator);

    line = getInterestingLine();
    while (hasNext() && ((state != FASTX_FASTA || line[0] != '>')
                         && (state != FASTX_FASTQ_QUAL || line[0] != '@'))) {

      if (hasNext()) {
        switch(state) {
        case FASTX_FASTA: case FASTX_FASTQ_ID:
          // Sequence
          current.sequence += line;
          break;
        case FASTX_FASTQ_SEQ:
          // FASTQ separator between sequence and quality
175 176
          if (line[0] != '+')
            throw invalid_argument("Expected line starting with + in FASTQ file");
Mikaël Salson's avatar
Mikaël Salson committed
177 178 179 180
          break;
        case FASTX_FASTQ_SEP:
          // Reading quality
          current.quality = line;
181 182
          if (current.quality.length() != current.sequence.length())
            throw invalid_argument("Quality and sequence don't have the same length ");
Mikaël Salson's avatar
Mikaël Salson committed
183 184
          break;
        default:
185
          throw invalid_argument("Unexpected state after reading identifiers line");
Mikaël Salson's avatar
Mikaël Salson committed
186 187 188 189 190 191 192 193 194 195 196 197 198 199
        }
        if (state >= FASTX_FASTQ_ID && state <= FASTX_FASTQ_SEP)
          state = (fasta_state)(((int)state) + 1);
      } else {
        unexpectedEOF();
      }
      line = getInterestingLine();
    }

    if (state >= FASTX_FASTQ_ID && state < FASTX_FASTQ_QUAL) 
      unexpectedEOF();

    // Sequence in uppercase
    transform(current.sequence.begin(), current.sequence.end(), current.sequence.begin(), (int (*)(int))toupper);
200 201 202

    // Compute seq
    current.seq = new int[current.sequence.length()];
203
    for (unsigned int i=0; i< current.sequence.length(); i++)
204
      {
205
	current.seq[i] = nuc_to_int(current.sequence[i]) ;
206 207
      }

Mikaël Salson's avatar
Mikaël Salson committed
208 209 210 211 212 213 214 215 216 217 218 219 220 221
  } else
    unexpectedEOF();
}

string OnlineFasta::getInterestingLine() {
  string line;
  while (line.length() == 0 && hasNext() && getline(*input, line)) {
    line_nb++;
    remove_trailing_whitespaces(line);
  }
  return line;
}

void OnlineFasta::unexpectedEOF() {
222
  throw invalid_argument("Unexpected EOF while reading FASTA/FASTQ file");
Mikaël Salson's avatar
Mikaël Salson committed
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
}

// Operators

istream& operator>>(istream& in, Fasta& fasta){
	string line;
	Sequence read;
        OnlineFasta of(in, fasta.extract_field, fasta.extract_separator);

        while (of.hasNext()) {
          of.next();
          fasta.reads.push_back(of.getSequence());
          fasta.total_size += of.getSequence().sequence.size();
        }
	return in;
}

ostream& operator<<(ostream& out, Fasta& fasta){
	for(int i = 0 ; i < fasta.size() ; i++){
          out << fasta.read(i) << endl;
	}
	return out;
}

ostream &operator<<(ostream &out, const Sequence &seq) {
248 249 250 251 252 253 254
  bool is_fastq=false;
  if (seq.quality.length() > 0) {
    is_fastq = true;
    out << "@";
  } else
    out << ">";
  out << seq.label << endl;
Mikaël Salson's avatar
Mikaël Salson committed
255
  out << seq.sequence << endl;
256 257 258
  if (is_fastq) {
    out << "+" << endl << seq.quality << endl;
  }
Mikaël Salson's avatar
Mikaël Salson committed
259 260
  return out;
}