Commit 487af082 authored by Thonier Florian's avatar Thonier Florian
Browse files

Merge branch 'prod-client' of gitlab.inria.fr:vidjil/vidjil into prod-client

parents 075e470b 0cb291e2
......@@ -155,9 +155,9 @@ cleanall: clean
RELEASE_TAG="notag"
RELEASE_H = $(VIDJIL_ALGO_SRC)/release.h
RELEASE_SOURCE = $(wildcard $(VIDJIL_ALGO_SRC)/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.hpp) $(wildcard $(VIDJIL_ALGO_SRC)/tests/unit-tests/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/tests/unit-tests/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/cgi/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/lib/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/lib/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/lib/*.hpp) $(wildcard tools/*.py)
RELEASE_SOURCE = $(wildcard $(VIDJIL_ALGO_SRC)/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.hpp) $(wildcard $(VIDJIL_ALGO_SRC)/tests/unit-tests/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/core/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/tests/unit-tests/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/cgi/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/tools/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/tools/Makefile) $(wildcard $(VIDJIL_ALGO_SRC)/lib/*.cpp) $(wildcard $(VIDJIL_ALGO_SRC)/lib/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/lib/*.hpp) $(wildcard $(VIDJIL_ALGO_SRC)/lib/unbam/*.c) $(wildcard $(VIDJIL_ALGO_SRC)/lib/unbam/*.h) $(wildcard $(VIDJIL_ALGO_SRC)/lib/unbam/Makefile) $(wildcard $(VIDJIL_ALGO_SRC)/lib/unbam/LICENSE) $(wildcard tools/*.py)
RELEASE_MAKE = ./Makefile $(VIDJIL_ALGO_SRC)/Makefile $(VIDJIL_ALGO_SRC)/core/Makefile $(VIDJIL_ALGO_SRC)/tests/Makefile $(VIDJIL_ALGO_SRC)/lib/Makefile germline/Makefile data/Makefile tools/tests/Makefile doc/Makefile
RELEASE_TESTS = doc/format-analysis.org data/get-sequences $(wildcard data/*.vidjil) $(wildcard data/*.analysis) $(wildcard data/*.g) $(wildcard data/*.fa) $(wildcard data/*.fq) $(VIDJIL_ALGO_SRC)/tests/repseq_vdj.py $(VIDJIL_ALGO_SRC)/tests/should-vdj-to-tap.py $(VIDJIL_ALGO_SRC)/tests/ansi.py $(wildcard $(VIDJIL_ALGO_SRC)/tests/should-vdj-tests/*.should-vdj.fa) $(wildcard $(VIDJIL_ALGO_SRC)/tests/should-vdj-tests/*.should-locus.fa) $(VIDJIL_ALGO_SRC)/tests/should-to-tap.sh $(wildcard $(VIDJIL_ALGO_SRC)/tests/should-get-tests/*.should-get) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.fa) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.should-get) $(VIDJIL_ALGO_SRC)/tests/format-json.sh $(wildcard doc/analysis-example.vidjil) $(wildcard tools/tests/*.should_get) tools/tests/should-to-tap.sh tools/diff_json.sh
RELEASE_TESTS = doc/format-analysis.org data/get-sequences $(wildcard data/*.vidjil) $(wildcard data/*.analysis) $(wildcard data/*.g) $(wildcard data/*.fa) $(wildcard data/*.fq) $(wildcard data/*.bam) $(VIDJIL_ALGO_SRC)/tests/repseq_vdj.py $(VIDJIL_ALGO_SRC)/tests/should-vdj-to-tap.py $(VIDJIL_ALGO_SRC)/tests/ansi.py $(wildcard $(VIDJIL_ALGO_SRC)/tests/should-vdj-tests/*.should-vdj.fa) $(wildcard $(VIDJIL_ALGO_SRC)/tests/should-vdj-tests/*.should-locus.fa) $(VIDJIL_ALGO_SRC)/tests/should-to-tap.sh $(wildcard $(VIDJIL_ALGO_SRC)/tests/should-get-tests/*.should-get) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.fa) $(wildcard $(VIDJIL_ALGO_SRC)/tests/bugs/*.should-get) $(VIDJIL_ALGO_SRC)/tests/format-json.sh $(wildcard doc/analysis-example.vidjil) $(wildcard tools/tests/*.should_get) tools/tests/should-to-tap.sh tools/diff_json.sh
RELEASE_GERMLINES = germline/germline_id germline/get-saved-germline germline/get-germline germline/split-from-imgt.py $(wildcard germline/*.g) germline/revcomp-fasta.py germline/fasta.py
RELEASE_HELP = doc/algo.org doc/locus.org doc/dev.org doc/should-vdj.org doc/credits.org doc/CHANGELOG doc/LICENSE README.org INSTALL.org
RELEASE_FILES = $(RELEASE_SOURCE) $(RELEASE_TESTS) $(RELEASE_MAKE) $(RELEASE_GERMLINES) $(RELEASE_HELP) data/segmentation.fasta $(wildcard data/*.fa.gz) $(wildcard data/*.label)
......
......@@ -42,6 +42,9 @@ CREATE_VERSION_GIT_H := $(shell test -x ./create-git-version-h.sh && ./create-gi
.PHONY: all core lib clean forcedep
all: $(VIDJIL) $(ALIGN_CGI) $(SIMILARITY_CGI) $(SIMILARITY_TOOL)
$(MAKE) -C $(TOOLDIR)
base: $(VIDJIL)
###
......
......@@ -4,7 +4,7 @@
#include <string.h>
#include <cstdlib>
#include "../core/dynprog.h"
#include "../core/fasta.h"
#include "../core/bioreader.hpp"
#include "../core/lazy_msa.h"
#include "../lib/json.hpp"
......@@ -83,7 +83,7 @@ int main(int argc, char* argv[])
if (!cgi_mode) cout <<endl;
if (! error) {
Fasta fa(fdata, 1, " ", !cgi_mode);
BioReader fa(fdata, 1, " ", 0, !cgi_mode);
string seq0 = fa.sequence(0);
......
......@@ -5,7 +5,7 @@
#include <cstdlib>
#include <unistd.h>
#include "../core/dynprog.h"
#include "../core/fasta.h"
#include "../core/bioreader.hpp"
#include "../core/lazy_msa.h"
#include "../core/similarityMatrix.h"
#include "../core/compare-all.h"
......@@ -111,7 +111,7 @@ int main(int argc, char* argv[])
if (!cgi_mode) cout <<endl;
if (! error) {
Fasta fa(fdata, 1, " ", !output_json);
BioReader fa(fdata, 1, " ", 0, !output_json);
list<Sequence> reads;
reads = fa.getAll();
......
CXX?=g++
OPTIM=-O2
override CXXFLAGS += -W -Wall -std=c++11 $(OPTIM)
INC=-I ../
SRCCORE=$(wildcard *.cpp)
OBJCORE=$(SRCCORE:.cpp=.o)
LIBCORE=vidjil.a
......@@ -12,6 +13,9 @@ all: $(LIBCORE)
$(LIBCORE): $(OBJCORE)
ar rc $@ $^
%.o: %.cpp
$(CXX) $(CXXFLAGS) $(INC) -c $<
clean:
rm -f $(OBJCORE) $(LIBCORE)
......
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
Mikaël Salson <mikael.salson@vidjil.org>
Marc Duez <marc.duez@vidjil.org>
"Vidjil" is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
"Vidjil" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
#include <fstream>
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include "bam.h"
#include <lib/unbam/sam.h>
#include "../lib/gzstream.h"
// OnlineBAM
OnlineBAM::OnlineBAM(int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence)
:OnlineBioReader(extract_field, extract_separator, nb_sequences_max, only_nth_sequence) {}
OnlineBAM::OnlineBAM(const string &input_filename,
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence)
:OnlineBioReader(input_filename, extract_field, extract_separator, nb_sequences_max, only_nth_sequence) {this->init();}
OnlineBAM::~OnlineBAM() {
free(bam_entry);
bam_hdr_destroy(header);
if (input_allocated)
bgzf_close(input);
}
void OnlineBAM::init() {
if (! filename.empty()) {
input = bgzf_open(filename.c_str(), "r");
if (! input) {
throw invalid_argument("!! Error in opening file "+filename);
}
}
bam_entry = bam_init1();
header = bam_hdr_read(input);
bytes_read = bam_read1(input, bam_entry);
char_nb += bytes_read;
}
bool OnlineBAM::hasNextData() {
return hasNext();
}
bool OnlineBAM::hasNext() {
return bytes_read > 0;
}
void OnlineBAM::next() {
assert((bam_entry->core.flag & (BAM_FPAIRED | BAM_FREAD1 | BAM_FREAD2)) == 0);
assert(hasNext());
char *qualities = get_quality(bam_entry);
char *seq = get_sequence(bam_entry);
current.sequence.erase();
addLineToCurrentSequence(string(seq));
current.quality = string(qualities);
current.label_full = string(bam_get_qname(bam_entry));
current.label = extract_from_label(current.label_full, extract_field, extract_separator);
free(seq);
free(qualities);
bytes_read = bam_read1(input, bam_entry);
char_nb += bytes_read;
skipToNthSequence();
}
void OnlineBAM::unexpectedEOF() {
throw invalid_argument("Unexpected EOF while reading BAM file");
}
#ifndef BAM_H
#define BAM_H
#include <istream>
#include <string>
#include <vector>
#include <list>
extern "C" {
#include "lib/unbam/bamopen.h"
#include "lib/unbam/bgzf.h"
}
using namespace std;
#include "bioreader.hpp"
/**
* Read a BAM file.
* Space complexity: constant. Only one sequence is stored at most in memory.
* The file is read online meaning that we cannot access a random sequence.
*/
class OnlineBAM: public OnlineBioReader {
BGZF *input;
bam1_t *bam_entry;
int bytes_read;
bam_hdr_t *header;
public:
/**
* Default constructor
*/
OnlineBAM(int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
/**
* Open the file. No sequence is read at first.
* @post getSequence() does not return the first sequence yet.
* next() must be called first.
* @throws invalid_argument if file cannot be opened or is not
* well-formed
*/
OnlineBAM(const string &input_filename,
int extract_field=0, string extract_separator="|",
int nb_sequences_max=NO_LIMIT_VALUE, int only_nth_sequence=1);
virtual ~OnlineBAM();
/**
* @return true iff we did not reach yet the end of the file.
*/
bool hasNextData();
/**
* @return true iff we did not reach yet both the end of the file and the maximal number of returned sequences
*/
bool hasNext();
/**
* Go to the next sequence in the file.
* @post hasNext() ==> getSequence() returns the following sequence in the file.
* @throws invalid_argument if the file is not well formated
*/
void next();
protected:
/**
* @inherited
*/
void init();
/**
* @inherited
*/
void unexpectedEOF();
};
#endif
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
Mikaël Salson <mikael.salson@vidjil.org>
Marc Duez <marc.duez@vidjil.org>
"Vidjil" is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
"Vidjil" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
#include <algorithm>
#include <string>
#include <fstream>
#include "bioreader.hpp"
#include "fasta.h"
#include "bam.h"
OnlineBioReader::OnlineBioReader(int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence):
filename(""), extract_field(extract_field),
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence){}
OnlineBioReader::OnlineBioReader(const string &input_filename,
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence):
filename(input_filename), extract_field(extract_field),
extract_separator(extract_separator),
nb_sequences_max(nb_sequences_max), only_nth_sequence(only_nth_sequence)
{
input_allocated = true;
init();
}
OnlineBioReader::~OnlineBioReader() {
if (current.seq)
delete [] current.seq;
}
void OnlineBioReader::init() {
mark_pos = 0;
nb_sequences_parsed = 0;
nb_sequences_returned = 0;
char_nb = 0;
current.seq = NULL;
current.marked_pos = 0;
current_gaps = 0;
}
unsigned long long OnlineBioReader::getPos() {
return char_nb;
}
void OnlineBioReader::setMarkPos(int mark_pos) {
this -> mark_pos = mark_pos;
}
Sequence OnlineBioReader::getSequence() {
return current;
}
void OnlineBioReader::skipToNthSequence() {
// Possibly skip some reads, when only_nth_sequence > 1
while (hasNextData())
if (nb_sequences_parsed % only_nth_sequence)
{
nb_sequences_returned--;
this->next();
continue ;
}
else
return ;
}
void OnlineBioReader::addLineToCurrentSequence(string line)
{
for (char& c : line)
{
if (c == ' ')
continue ;
if (c == '.') {
current_gaps++;
continue ;
}
current.sequence += c;
if (mark_pos) {
if ((int) current.sequence.length() + current_gaps == mark_pos)
current.marked_pos = current.sequence.length();
}
}
}
void OnlineBioReader::unexpectedEOF() {
throw invalid_argument("Unexpected EOF while reading sequence file");
}
//// BioReader
void BioReader::init(int extract_field, string extract_separator, size_t mark_pos)
{
this -> extract_field = extract_field ;
this -> extract_separator = extract_separator ;
this -> mark_pos = mark_pos;
total_size = 0;
name = "";
basename = "";
}
BioReader::BioReader(bool virtualfasta, string name)
{
UNUSED(virtualfasta);
init(0, "");
this -> name = name;
basename = extract_basename(name);
}
BioReader::BioReader(int extract_field, string extract_separator, int mark_pos)
{
init(extract_field, extract_separator, mark_pos);
}
BioReader::BioReader(const string &input,
int extract_field, string extract_separator,
int mark_pos, bool verbose)
{
init(extract_field, extract_separator, mark_pos);
if (!input.size()) // Do not open empty filenames (D germline if not segmentD)
return ;
add(input, verbose);
}
void BioReader::add(const string &filename, bool verbose) {
OnlineBioReader *reader = OnlineBioReaderFactory::create(filename, extract_field,
extract_separator);
if (name.size())
name += " ";
if (basename.size())
basename += " ";
name += filename;
basename += extract_basename(filename);
if (verbose)
cout << " <== " << filename ;
add(*reader);
delete reader;
if (verbose)
cout << "\t" << setw(6) << total_size << " bp in " << setw(3) << size() << " sequences" << endl ;
}
void BioReader::add(OnlineBioReader &reader) {
string line;
Sequence read;
reader.setMarkPos(mark_pos);
while (reader.hasNext()) {
reader.next();
add(reader.getSequence());
}
}
void BioReader::add(Sequence seq) {
reads.push_back(seq);
total_size += seq.sequence.size();
}
int BioReader::size() const{ return (int)reads.size(); }
size_t BioReader::totalSize() const { return total_size ; }
list<Sequence> BioReader::getAll() const {
list<Sequence> reads;
for (int i=0; i < size(); i++) {
reads.push_back(read(i));
}
return reads;
}
const string& BioReader::label(int index) const{ return reads[index].label; }
const string& BioReader::label_full(int index) const{ return reads[index].label_full; }
const Sequence& BioReader::read(int index) const {return reads[index];}
const string& BioReader::sequence(int index) const{ return reads[index].sequence; }
// Operators
ostream& operator<<(ostream& out, BioReader& fasta){
for(int i = 0 ; i < fasta.size() ; i++){
out << fasta.read(i);
}
return out;
}
ostream &operator<<(ostream &out, const Sequence &seq) {
bool is_fastq=false;
if (seq.quality.length() > 0) {
is_fastq = true;
out << "@";
} else
out << ">";
out << seq.label;
if (seq.marked_pos) {
out << " !@" << seq.marked_pos ;
}
out << endl;
out << seq.sequence << endl;
if (is_fastq) {
out << "+" << endl << seq.quality << endl;
}
return out;
}
OnlineBioReader *OnlineBioReaderFactory::create(const string &filename,
int extract_field, string extract_separator,
int nb_sequences_max, int only_nth_sequence) {
string extension = filename.substr(filename.find_last_of(".") + 1);
transform(extension.begin(), extension.end(), extension.begin(), ::tolower);
if (extension == "bam")
return new OnlineBAM(filename, extract_field, extract_separator, nb_sequences_max, only_nth_sequence);
else
return new OnlineFasta(filename, extract_field, extract_separator, nb_sequences_max, only_nth_sequence);
}
// http://stackoverflow.com/a/5840160/4475279
unsigned long long filesize(const char* filename)
{
std::ifstream in(filename, std::ifstream::ate | std::ifstream::binary);
return in.tellg();
}
unsigned long long nb_sequences_in_file(string f, bool approx)
{
if (approx)
return approx_nb_sequences_in_file(f);
OnlineBioReader *sequences = OnlineBioReaderFactory::create(f, 1, " ");
unsigned long long nb_sequences = 0 ;
while (sequences->hasNext())
{
sequences->next();
nb_sequences++ ;
}
cout << " ==> " << nb_sequences << " sequences" << endl;
delete sequences ;
return nb_sequences ;
}
unsigned long long approx_nb_sequences_in_file(string f)
{
OnlineBioReader *sequences = OnlineBioReaderFactory::create(f, 1, " ");
int nb_sequences = 0 ;
while (nb_sequences < SAMPLE_APPROX_NB_SEQUENCES && sequences->hasNext())
{
sequences->next();
nb_sequences++ ;
}
cout << " ==> " ;
if (sequences->hasNext())
{
cout << "approx. " ;
float ratio = (float) filesize(f.c_str()) / (float) sequences->getPos();
nb_sequences = (unsigned long long) (ratio * nb_sequences);
}
cout << nb_sequences << " sequences" << endl;
delete sequences ;
return nb_sequences ;
}
/*
This file is part of Vidjil <http://www.vidjil.org>
Copyright (C) 2011-2017 by Bonsai bioinformatics
at CRIStAL (UMR CNRS 9189, Université Lille) and Inria Lille
Contributors:
Mathieu Giraud <mathieu.giraud@vidjil.org>
Mikaël Salson <mikael.salson@vidjil.org>
Marc Duez <marc.duez@vidjil.org>
"Vidjil" is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
"Vidjil" is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with "Vidjil". If not, see <http://www.gnu.org/licenses/>
*/
#ifndef BIO_READER_HPP
#define BIO_READER_HPP
#include <string>
#include <vector>
#include <list>
#define SAMPLE_APPROX_NB_SEQUENCES 200
using namespace std;
typedef string seqtype ;
typedef struct read_t
{
string label_full;
string label;
str