Commit 7314fdfa authored by Marc Duez's avatar Marc Duez

Merge branch 'master' of git+ssh://scm.gforge.inria.fr//gitroot/vidjil/vidjil into rework_data

Conflicts:
	server/fuse.py
parents 5b7b5264 ad0ac1b4
......@@ -5,7 +5,7 @@ before_install:
- sudo pip install cpp-coveralls
before_script: make data ; make germline
script: make && make test
script: make COVERAGE=1 && make COVERAGE=1 test
after_success:
- make COVERAGE=1 unit should ; ln -s algo/core . ; coveralls --exclude algo/tests --exclude algo/tools --gcov-options '\-lp'
- make should_coveralls
......@@ -38,15 +38,28 @@ should: all
make COVERAGE="$(COVERAGE_OPTION)" -C $(VIDJIL_ALGO_SRC)/tests should
@echo "*** All .should_get tests passed"
### Code coverage
coverage: unit_coverage should_coverage
unit_coverage: clean
make COVERAGE=1 unit
which gcovr > /dev/null && (cd algo; gcovr -r . -e tests/ --xml > unit_coverage.xml) || echo "gcovr is needed to generate a full report"
should_coverage: clean
make COVERAGE=1 should
### Reports with gcovr
unit_gcovr: unit_coverage
which gcovr > /dev/null && (cd algo; gcovr -r . -e tests/ --xml > unit_coverage.xml) || echo "gcovr is needed to generate a full report"
should_gcovr: should_coverage
which gcovr > /dev/null && (cd algo; gcovr -r . -e tests/ --xml > should_coverage.xml) || echo "gcovr is needed to generate a full report"
### Upload to coveralls.io
unit_coveralls:
coveralls --exclude release --exclude algo/tests --exclude algo/tools --gcov-options '\-lp'
should_coveralls:
coveralls --exclude release --exclude algo/tests --exclude algo/tools --gcov-options '\-lp' -r algo
data germline: %:
make -C $@
......
......@@ -3,6 +3,7 @@
[[https://travis-ci.org/magiraud/vidjil][http://img.shields.io/travis/magiraud/vidjil.svg]]
[[http://opensource.org/licenses/GPL-3.0][http://img.shields.io/badge/license-GPLv3+-yellow.svg]]
[[https://coveralls.io/r/magiraud/vidjil][http://img.shields.io/coveralls/magiraud/vidjil.svg]]
# Vidjil -- V(D)J recombinations analysis -- [[http://www.vidjil.org]]
# Copyright (C) 2011, 2012, 2013, 2014 by Bonsai bioinformatics at LIFL (UMR CNRS 8022, Université Lille) and Inria Lille
......
......@@ -2,7 +2,7 @@ CC=g++
OPTIM=-O2 -DNDEBUG $(COVERAGE)
CXXFLAGS=-W -Wall $(OPTIM) $(DEBUG) $(CONFIG)
LDFLAGS=-lm
EXEC=vidjil kmer kmer_count cut count detailed_affect
EXEC=vidjil
MAINCORE=$(wildcard *.cpp)
LIBCORE=core/vidjil.a
......
......@@ -16,7 +16,7 @@ list<T> keep_n_first(list<T> l, size_t count) {
size_t i = 0;
typename list<T>::iterator it = l.begin();
for (; it != l.end() && i < count ; it++)
count++;
i++;
return list<T>(l.begin(), it);
}
......
#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include <map>
#include <libgen.h>
#include "core/tools.h"
#include "core/cluster-junctions.h"
#include "core/fasta.h"
#include "core/dynprog.h"
#include "core/teestream.h"
#include "core/mkdir.h"
#define WIDTH 7
#define DEFAULT_READS "../../../seq/chr_pgm_50k.cut.fa"
#define DEFAULT_PATTERN "CTGTGCCACCTGGGCCTTATTATAAGAAAC"
#define DEFAULT_THRESHOLD -8
#define DEFAULT_THRESHOLD_JUNCTIONS -4
#define DEFAULT_MAX_DISPLAY_JUNCTIONS 10
#define DEFAULT_CUT "./cuts.fa"
#define OUT_DIR "./counts/"
#define F_COUNT "./counts.txt"
using namespace std ;
extern char *optarg;
extern int optind, optopt, opterr;
void usage()
{
cerr << "[options] pattern reads1 [reads2...]" << endl << endl;
cerr << "Options: " << endl
<< "\t-c" << endl
<< "\t\tcumulative counts" << endl
<< "\t-r <int>" << endl
<< "\t\treference for computing the percentages (otherwise, number of sequences)" << endl
<< "\t-T <int>" << endl
<< "\t\tthreshold (for displaying the counts)" << endl
<< "\t-T <int>" << endl
<< "\t\tthreshold (for storing the junctions)" << endl
<< "\t-j <int>" << endl
<< "\t\tmaximum number of junctions to display" << endl
<< "\t-b \tdisplay only the number of matches on the (b)oth strands (and not +/-)" << endl
<< "\t-d <directory>" << endl
<< "\t\toutput directory used for storing the results (default: " << OUT_DIR << ")" << endl
<< "\t-o <output file>" << endl
<< "\t\toutput filename that will be used for storing the results (default: " << F_COUNT << ")" << endl
<< "\t-v\tverbose mode" << endl ;
exit(1);
}
int main (int argc, char **argv)
{
string pattern = DEFAULT_PATTERN ;
string f_reads = DEFAULT_READS ;
int threshold = DEFAULT_THRESHOLD ;
int threshold_junctions = DEFAULT_THRESHOLD_JUNCTIONS ;
int max_display_junctions = DEFAULT_MAX_DISPLAY_JUNCTIONS ;
int reference = 0 ;
bool reference_option = false ;
string out_dir = OUT_DIR;
string f_count = F_COUNT;
int verbose = 0 ;
bool only_total = false ;
bool cumulative_counts = false ;
bool default_file = false;
int optind_files;
char c ;
while ((c = getopt(argc, argv, "T:t:bhvd:o:j:cr:")) != EOF)
switch (c)
{
case 'h':
cerr << "Usage: " << argv[0]<< " " ;
usage();
case 'v':
verbose += 1 ;
break;
case 'd':
out_dir = optarg;
break;
case 'o':
f_count = optarg;
break;
case 'T':
threshold_junctions = -atoi(optarg);
break;
case 't':
threshold = -atoi(optarg);
break;
case 'j':
max_display_junctions = -atoi(optarg);
break;
case 'r':
reference_option = true ;
reference = atoi(optarg);
break;
case 'c':
cumulative_counts = true;
break;
case 'b':
only_total = true;
break;
}
if (verbose)
cerr << "# verbose " << verbose << endl ;
if (optind+1 <= argc) {
pattern = argv[optind] ;
optind_files = optind+1;
} else {
cerr << "# using default pattern: " << pattern << endl ;
optind_files = optind;
}
if (optind_files+1 > argc) {
cerr << "# using default sequence file: " << f_reads << endl ;
default_file = true;
}
// Creating output directory
if (mkpath(out_dir.c_str(), 0755) == -1) {
perror("Directory creation");
exit(3);
}
f_count = out_dir+"/"+f_count;
cerr << " ==> " << f_count << endl ;
ofstream count_file(f_count.c_str());
teestream out(cout, count_file);
cerr << "Command line: ";
for (int i=0; i < argc; i++) {
cerr << argv[i] << " ";
}
cerr << endl;
//////////////////////////////////
string pattern_rev = revcomp(pattern);
//////////////////////////////////
time_t rawtime;
struct tm *timeinfo;
char time_buffer[80];
time (&rawtime );
timeinfo = localtime (&rawtime);
strftime (time_buffer, 80,"%F %T", timeinfo);
cerr << "# " << time_buffer << endl ;
//////////////////////////////////
for (int num_file=0;
default_file || optind_files + num_file < argc;
num_file++)
{
if (default_file) {
default_file = false;
// Keep default value for f_reads
} else {
f_reads = argv[optind+1+num_file] ;
}
//////////////////////////////////
cerr << "Read sequence file" << endl ;
Fasta reads(f_reads, 1, " ", out);
out << endl;
if (reference_option)
{
cerr << "# reference (given with -r option): " << reference << endl ;
}
else
{
reference = reads.size();
cerr << "# reference (number of sequences): " << reference << endl ;
}
string tag = "";
if (optind + 3 <= argc)
{
char *ch = strdup(f_reads.c_str());
// Affichage du nom de la seq s'il y en a au moins 2
tag = "\t" + string(basename(ch));
}
// const Cost countCost = Levenshtein ;
const Cost countCost = Cost(0, -2, -2, 0, -1);
out << "# using cost " << countCost << endl ;
////////////////////////////////////////
// COUNT THE PATTERNS //
////////////////////////////////////////
int ok = 0 ;
map <int, int> counts ;
map <int, int> counts_rev ;
map<pair<int, junction>, int> counts_junctions;
for(int i = 0 ; i < reads.size() ; i++)
{
if (!(ok++ % 10000))
{
cerr << "." ;
cerr.flush();
}
if (verbose)
cerr << endl << endl << reads.label(i) << endl;
// +
DynProg dp = DynProg(pattern, reads.sequence(i), DynProg::SemiGlobal, countCost);
int score = dp.compute();
counts[score]++ ;
if (verbose)
cout << "\t best score: " << score << " @" << dp.best_j << " " << dp.SemiGlobal_extract_best() << endl ;
if (score >= threshold_junctions)
{
string junc = dp.SemiGlobal_extract_best() ;
counts_junctions[make_pair(score, junc)]++ ;
}
// -
DynProg dp_rev = DynProg(pattern_rev, reads.sequence(i), DynProg::SemiGlobal, countCost);
int score_rev = dp_rev.compute();
counts_rev[score_rev]++ ;
}
cerr << endl;
out << "Scores distribution" ;
// Cumulative
if (cumulative_counts)
{
out << " (cumulative)" ;
for (int i=0; i >= threshold; i--)
{
counts[i-1] += counts[i] ;
counts_rev[i-1] += counts_rev[i] ;
}
}
// Legend
out << endl ;
out << " " << setw(3) << "" << " " ;
if (only_total)
out << setw(WIDTH) << "+/-" ;
else
out << setw(WIDTH) << "+" << setw(WIDTH) << "-" ;
out << setw(10) << "%" << endl ;
// Display counts
for (map <int, int>::const_iterator it = counts.begin(); it != counts.end(); ++it)
{
int key = it->first ;
if (key < threshold)
continue ;
out << " " << setw(3) << key << " |" ;
if (only_total)
{
out << setw(WIDTH) << counts[key] + counts_rev[key] ;
}
else
{
out << setw(WIDTH) << counts[key] ;
out << setw(WIDTH) << counts_rev[key] ;
}
out << setw(10) << setprecision(3) << ((float) (counts[key] + counts_rev[key])) / reference * 100 ;
out << tag << endl ;
}
out << endl ;
//////////////////////////////////
// Sort junctions
list<pair <pair<int, junction>, int> > sort_junctions;
// il y a surment un meilleur moyen de transformer un map en list :)
for (map<pair<int, junction>, int>::const_iterator it = counts_junctions.begin();
it != counts_junctions.end(); ++it)
{
sort_junctions.push_back(make_pair(it->first, it->second));
}
sort_junctions.sort(pair_occurrence_sort<pair<int, junction> >);
int nb = 0;
int nb_others = 0;
out << "Junctions distribution (only on +)" << endl ;
// Display junctions
for (list<pair <pair<int, junction>, int> >::const_iterator it = sort_junctions.begin();
it != sort_junctions.end(); ++it)
{
pair<int, junction> key = it->first ;
int score = key.first ;
string junc = key.second ;
if (nb++ <= max_display_junctions)
{
out << " " << setw(3) << score << " |"
<< setw(WIDTH) << counts_junctions[key]
// << setw(10) << setprecision(3) << ((float) counts_junctions[key] / reference * 100) << "%"
<< " # "
<< setw(score - threshold_junctions + 1) << " " // decalage pour joli affichage
<< junc << tag << endl ;
}
else
nb_others += counts_junctions[key] ;
}
if (nb_others)
out << " ... +" << setw(WIDTH) << nb_others << " other junctions within the "
<< threshold_junctions << " threshold" << tag << endl ;
} // for num_file
//////////////////////////////////
}
#include<algorithm>
#include<utility>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <cstring>
#include <time.h>
#include "core/tools.h"
#include "core/fasta.h"
#include "core/dynprog.h"
#include "core/teestream.h"
#include "core/cut-pcr.h"
#define DEFAULT_READS "../../../seq/chr_pgm_100seq.fa"
#define DEFAULT_PRIMERS "../../../seq/amorces-pcr4.fa"
#define DEFAULT_CUT "./cuts.fa"
#define OUT_DIR "./cuts/"
#define f_html "./cuts/out.html"
#define MAX_MISMATCHES 4
using namespace std ;
extern char *optarg;
extern int optind, optopt, opterr;
void usage()
{
cerr << "[options] <reads.fa>" << endl << endl;
cerr << "Options: " << endl
<< "\t-P <file>" << endl
<< "\t\tprimer multi-fasta file" << endl
<< "\t-d <directory>" << endl
<< "\t\toutput directory used for storing the results (default: " << OUT_DIR << ")" << endl
<< "\t-v\tverbose mode" << endl ;
exit(1);
}
int main (int argc, char **argv)
{
string f_reads = DEFAULT_READS ;
string f_primers = DEFAULT_PRIMERS ;
string f_cut = DEFAULT_CUT ;
string out_dir = OUT_DIR;
int verbose = 0 ;
char c ;
while ((c = getopt(argc, argv, "hV:J:k:r:R:vw:e:d:c:m:M:s:")) != EOF)
switch (c)
{
case 'h':
cerr << "Usage: " << argv[0]<< " " ;
usage();
case 'v':
verbose += 1 ;
break;
case 'd':
out_dir = optarg;
break;
}
if (verbose)
cerr << "# verbose " << verbose << endl ;
if (optind == argc)
{
cerr << "# using default sequence file: " << f_reads << endl ;
}
else if (optind+1 == argc)
{
f_reads = argv[optind] ;
}
else
{
cerr << "wrong number of arguments." << endl ;
exit(1);
}
ofstream html(f_html);
teestream out(cout, html);
cerr << "Command line: ";
for (int i=0; i < argc; i++) {
cerr << argv[i] << " ";
}
cerr << endl;
//////////////////////////////////
time_t rawtime;
struct tm *timeinfo;
char time_buffer[80];
time (&rawtime );
timeinfo = localtime (&rawtime);
strftime (time_buffer, 80,"%F %T", timeinfo);
cerr << "# " << time_buffer << endl ;
//////////////////////////////////
cerr << "Read primers" << endl ;
Fasta base_primers(f_primers, 1, " ", cerr);
// For each primer, add revcomp
vector<Sequence> primers;
for (int p = 0 ; p<base_primers.size() ; p++)
{
Sequence primer;
primer.sequence = base_primers.sequence(p);
primer.label = "+" ;
primer.label_full = base_primers.label(p);
primers.push_back(primer);
Sequence primer_revcomp;
primer_revcomp.sequence = revcomp(primer.sequence);
primer_revcomp.label = "-" ;
primer_revcomp.label_full = primer.label_full;
primers.push_back(primer_revcomp);
}
//////////////////////////////////
cerr << "Read sequence files" << endl ;
Fasta reads(f_reads, 1, " ", cerr);
out_dir += "/";
//////////////////////////////////
// cerr << " ==> " << f_cut << endl ;
ofstream cut_file(f_cut.c_str());
const Cost cutCost = VDJ ;
const int cutRelativeThreshold = -cutCost.match * MAX_MISMATCHES ;
// a reflechir...
cerr << "# using cost " << cutCost << endl ;
cerr << "# relative threshold " << cutRelativeThreshold << endl ;
////////////////////////////////////////
// CUT THE READS //
////////////////////////////////////////
int ok = 0 ;
map <string, int> stats ;
for(int i = 0 ; i < reads.size() ; i++)
{
if (!(ok++ % 10000))
{
cerr << "." ;
cerr.flush();
}
if (verbose)
cerr << endl << endl << reads.label(i) << endl;
cut(reads.read(i), primers, verbose, stats,
cutCost, cutRelativeThreshold); // cut_file);
}
cerr << endl;
//////////////////////////////////
// Display stats
cerr << "Primers occurrences" << endl ;
for (unsigned p = 0 ; p<primers.size() ; p++)
{
cerr << "\t" << setw(7) << stats[primers[p].sequence] ;
cerr << " " << primers[p].label
<< " " << primers[p].label_full ;
if (p % 2)
cerr << endl ;
}
}
#include <fstream>
#include <iostream>
#include <string>
#include <set>
#include <cstdlib>
#include "core/kmerstore.h"
#include "core/dynprog.h"
#include "core/fasta.h"
#include "core/segment.h"
// #include "core/output.h"
using namespace std;
#define DEFAULT_K 14
void usage() {
cerr << "[options] <reads.fa>" << endl << endl;
cerr << "The program just uses kmers to determine which V and J are in place."
<< endl
<< "It must be told, that the results are not really satisfying with this method" << endl << endl;
cerr << "Options: " << endl
<< "\t-V <file>" << endl
<< "\t\tV repertoire multi-fasta file" << endl
<< "\t-J <file>" << endl
<< "\t\tJ repertoire multi-fasta file" << endl
<< "\t-k <size>" << endl
<< "\t\tk-mer size used for the V/J affectation (default: " << DEFAULT_K << ")" << endl
<< "\t-r" << endl
<< "\t\tconsider revcomp of k-mers" << endl;
exit(EXIT_FAILURE);
}
void print_unique(vector<KmerStringAffect> &v) {
if (v.size() > 0) {
set<KmerStringAffect> s(v.begin(), v.end());
for(set<KmerStringAffect>::iterator it = s.begin(); it != s.end(); it++) {
if (it->strand != 0)
cout << *it << " ";
}
}
cout << endl;
}
int main(int argc, char* argv[])
{
const char* frep_V = "../../../seq/Repertoire/TRGV.fa" ;
const char* frep_J = "../../../seq/Repertoire/TRGJ.fa" ;
const char* fdata_default = "tests/data/leukemia.fa" ;
int k = DEFAULT_K;
bool rc = false ;
int c;
while ((c = getopt(argc, argv, "hV:J:k:r")) != EOF)
switch (c)
{
case 'h':
cerr << "Usage: " << argv[0]<< " " ;
usage();
case 'V':
frep_V = optarg;
break;
case 'J':
frep_J = optarg;
break;
case 'k':
k = atoi(optarg);