Commit de4c48e0 authored by Mathieu Giraud's avatar Mathieu Giraud

Vidjil: release 2013.07

	* New selection of representative read (core/read_chooser.cpp)
	* Faster spaced seed computation (core/tools.cpp)
	* New unit tests
	* Bugs closed
parent 9387e03a
...@@ -9,6 +9,7 @@ test: all ...@@ -9,6 +9,7 @@ test: all
should: all should: all
@echo @echo
@echo "*** Launching .should_get tests..." @echo "*** Launching .should_get tests..."
src/tests/should-to-tap.sh src/tests/bugs/bug20130617.should_get
src/tests/should-to-tap.sh src/tests/stanford.should_get src/tests/should-to-tap.sh src/tests/stanford.should_get
src/tests/should-to-tap.sh src/tests/clones_simul.should_get src/tests/should-to-tap.sh src/tests/clones_simul.should_get
src/tests/should-to-tap.sh src/tests/clones_simul_cluster.should_get src/tests/should-to-tap.sh src/tests/clones_simul_cluster.should_get
......
>seq1
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
>seq2
AAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
TAGGGCGCGATCGACTTC
>seq3
ATCGGTACACAATCGAGCATGTAGGCTACACGGTA
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTT
2013-04-18 The Vidjil Team
2013-07-03 The Vidjil Team
* New selection of representative read (core/read_chooser.cpp)
* Faster spaced seed computation (core/tools.cpp)
* New unit tests
* Bugs closed
2013-04-18 The Vidjil Team
* First public release * First public release
......
...@@ -45,7 +45,7 @@ make data ...@@ -45,7 +45,7 @@ make data
# Immunol, 184(12), 6986–92. # Immunol, 184(12), 6986–92.
make germline make germline
# get IMGT germline databases -- you have to agree to IMGT license: # get IMGT germline databases (IMGT/GENE-DB) -- you have to agree to IMGT license:
# academic research only, provided that it is referred to IMGT®, # academic research only, provided that it is referred to IMGT®,
# and cited as "IMGT®, the international ImMunoGeneTics information system® # and cited as "IMGT®, the international ImMunoGeneTics information system®
# http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France). # http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France).
......
#!/bin/sh #!/bin/sh
cd $(dirname $0) cd $(dirname $0)
wget -O - http://www.imgt.org/download/GENE-DB/GENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import sys import sys
# Parse lines such as: # Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|... # >M12949|TRGV1*01|Homo sapiens|ORF|...
open_files = {} open_files = {}
......
CC=g++ CC=g++
OPTIM=-O2 -DNO_SPACED_SEEDS OPTIM=-O2
CFLAGS=-W -Wall $(OPTIM) $(DEBUG) CFLAGS=-W -Wall $(OPTIM) $(DEBUG)
LDFLAGS= LDFLAGS=
EXEC=vidjil kmer kmer_count cut count detailed_affect EXEC=vidjil kmer kmer_count cut count detailed_affect
...@@ -12,6 +12,18 @@ BINDIR=.. ...@@ -12,6 +12,18 @@ BINDIR=..
v: vidjil v: vidjil
spaced: cleanspaced
make
nospaced: cleanspaced
make OPTIM="-O2 -DNO_SPACED_SEEDS"
cleanspaced:
rm vidjil.o core/tools.o
all: $(EXEC) all: $(EXEC)
debug: debug:
......
...@@ -58,6 +58,15 @@ Fasta::Fasta(const string &input, ...@@ -58,6 +58,15 @@ Fasta::Fasta(const string &input,
} }
int Fasta::size() const{ return (int)reads.size(); } int Fasta::size() const{ return (int)reads.size(); }
list<Sequence> Fasta::getAll() const {
list<Sequence> reads;
for (int i=0; i < size(); i++) {
reads.push_back(read(i));
}
return reads;
}
const string& Fasta::label(int index) const{ return reads[index].label; } const string& Fasta::label(int index) const{ return reads[index].label; }
const string& Fasta::label_full(int index) const{ return reads[index].label_full; } const string& Fasta::label_full(int index) const{ return reads[index].label_full; }
const Sequence& Fasta::read(int index) const {return reads[index];} const Sequence& Fasta::read(int index) const {return reads[index];}
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <istream> #include <istream>
#include <string> #include <string>
#include <vector> #include <vector>
#include <list>
#include "tools.h" #include "tools.h"
using namespace std; using namespace std;
...@@ -42,6 +43,11 @@ public: ...@@ -42,6 +43,11 @@ public:
ostream &out=cout); ostream &out=cout);
int size() const; int size() const;
/**
* Get all the sequences from the FASTA file
* @return a list of sequences in the same order as in the input file
*/
list<Sequence> getAll() const;
const string& label(int index) const; const string& label(int index) const;
const string& label_full(int index) const; const string& label_full(int index) const;
const Sequence &read(int index) const; const Sequence &read(int index) const;
......
...@@ -5,29 +5,22 @@ ...@@ -5,29 +5,22 @@
using namespace std; using namespace std;
ReadChooser::ReadChooser(list<Sequence> &r, VirtualReadScore &scorer) { ReadChooser::ReadChooser(list<Sequence> &r, VirtualReadScore &scorer) {
float best_score = -1;
float current_score;
for (list <Sequence>::const_iterator it = r.begin(); it != r.end(); ++it) { for (list <Sequence>::const_iterator it = r.begin(); it != r.end(); ++it) {
current_score = scorer.getScore(it->sequence); scores[it->sequence] = scorer.getScore(it->sequence);
if (current_score > best_score) {
best_score = current_score;
best_sequence = *it;
}
} }
// vector<Sequence> test(r.begin(), r.end());
// sort(test.begin(), test.end(), *this); reads.assign(r.begin(), r.end());
// reads = list<Sequence>(test.begin(), test.end()); sort(reads.begin(), reads.end(), *this);
} }
Sequence ReadChooser::getBest() const{ Sequence ReadChooser::getBest() const{
return best_sequence; return reads[0];
} }
// list<Sequence> ReadChooser::getSorted() const { Sequence ReadChooser::getithBest(size_t i) const {
// return reads; return reads[i-1];
// } }
// bool ReadChooser::operator()(Sequence first, Sequence second) { bool ReadChooser::operator()(Sequence first, Sequence second) {
// return scorer.getScore(first.sequence) > scorer.getScore(second.sequence); return scores[first.sequence] > scores[second.sequence];
// } }
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <list> #include <list>
#include "fasta.h" #include "fasta.h"
#include "read_score.h" #include "read_score.h"
#include <map>
/** /**
* This class aims at choosing the best read among a group of read. * This class aims at choosing the best read among a group of read.
...@@ -16,17 +16,30 @@ ...@@ -16,17 +16,30 @@
class ReadChooser { class ReadChooser {
private: private:
Sequence best_sequence; vector<Sequence> reads;
map<string, float> scores;
public: public:
ReadChooser(list<Sequence> &r, VirtualReadScore &scorer); ReadChooser(list<Sequence> &r, VirtualReadScore &scorer);
/** /**
* @return the best sequence among the list of sequences that have been * @return the best sequence among the list of sequences that have been
* given to the object * given to the object
*/ */
Sequence getBest() const; Sequence getBest() const;
/**
* @pre i >= 1 && i <= total number of sequences on the input
* @param i: starts at 1
* @return the i-th best scored sequence
*/
Sequence getithBest(size_t i) const;
/**
* A comparison based on scorer of the two sequences.
*/
bool operator()(Sequence first, Sequence second);
}; };
#endif #endif
...@@ -53,3 +53,14 @@ void KmerAffectReadScore::setUnambiguousScore(float score) { ...@@ -53,3 +53,14 @@ void KmerAffectReadScore::setUnambiguousScore(float score) {
void KmerAffectReadScore::setUnknownScore(float score) { void KmerAffectReadScore::setUnknownScore(float score) {
unknown_score = score; unknown_score = score;
} }
////////////////////////////////////////////////////////////////////////////////
////////////////////////////// ReadLengthScore ///////////////////////////////
////////////////////////////////////////////////////////////////////////////////
ReadLengthScore::ReadLengthScore(){}
float ReadLengthScore::getScore(const string &sequence) const {
return sequence.size();
}
...@@ -54,4 +54,18 @@ public: ...@@ -54,4 +54,18 @@ public:
void setUnambiguousScore(float score) ; void setUnambiguousScore(float score) ;
void setUnknownScore(float score) ; void setUnknownScore(float score) ;
}; };
/**
* A simple implementation of VirtualReadScore.
* The score is the length of the read
*/
class ReadLengthScore: public VirtualReadScore {
public:
ReadLengthScore();
/**
* @return the sequence length
*/
float getScore(const string &sequence) const;
};
#endif #endif
#include "representative.h"
#include "kmerstore.h"
#include "read_score.h"
#include "read_chooser.h"
#include <iostream>
using namespace std;
RepresentativeComputer::RepresentativeComputer(list<Sequence> &r)
:sequences(r),is_computed(false),representative() {
}
Sequence RepresentativeComputer::getRepresentative() const{
assert(hasRepresentative());
return representative;
}
list<Sequence>& RepresentativeComputer::getSequenceList() const{
return sequences;
}
bool RepresentativeComputer::hasRepresentative() const{
return is_computed;
}
int KmerRepresentativeComputer::getK() const{
return k;
}
void KmerRepresentativeComputer::setK(int k) {
this->k = k;
}
int KmerRepresentativeComputer::getStabilityLimit() const {
return stability_limit;
}
void KmerRepresentativeComputer::setStabilityLimit(int limit) {
stability_limit = limit;
}
KmerRepresentativeComputer::KmerRepresentativeComputer(list<Sequence> &r,
int k)
:RepresentativeComputer(r),k(k),stability_limit(DEFAULT_STABILITY_LIMIT){}
void KmerRepresentativeComputer::compute(bool do_revcomp, size_t min_cover,
float percent_cover) {
is_computed = false;
// First create an index on the set of reads
IKmerStore<Kmer> *index = KmerStoreFactory::createIndex<Kmer>(getK(), do_revcomp);
// Add sequences to the index
for (list<Sequence>::iterator it=sequences.begin(); it != sequences.end(); ++it) {
index->insert(it->sequence, it->label);
}
size_t max = sequences.size();
// Create a read chooser to have the sequences sorted by length
ReadLengthScore *rlc = new ReadLengthScore();
ReadChooser rc(sequences, *rlc);
delete rlc;
// Traverse the sequences to get the desired representative
size_t pos_longest_run = 0;
size_t length_longest_run = 0;
size_t seq_index_longest_run = 1;
Sequence sequence_longest_run;
for (size_t seq = 1; seq <= sequences.size() && seq <= seq_index_longest_run + stability_limit ; seq++) {
Sequence sequence = rc.getithBest(seq);
if (sequence.sequence.size() <= length_longest_run) {
break;
}
vector<Kmer> counts = index->getResults(sequence.sequence);
for (size_t i =0; i < counts.size(); i++) {
size_t length_run = 0;
// Search the longest "run" of consecutive k-mers that are sufficiently
// expressed in the read collection.
while (i < counts.size()
&& counts[i].count >= min_cover
&& counts[i].count >= max*percent_cover) {
length_run++;
i++;
}
if (length_run)
// Take into account the whole k-mer, not just the starting positions
length_run += getK() - 1;
if (length_run > length_longest_run) {
length_longest_run = length_run;
pos_longest_run = i - (length_run - getK() - 1);
sequence_longest_run = sequence;
seq_index_longest_run = seq;
}
}
}
if (length_longest_run) {
is_computed = true;
representative = sequence_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
representative.label += "-[" + string_of_int(pos_longest_run) + ","
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]";
}
delete index;
}
#ifndef REPRESENTATIVE_H
#define REPRESENTATIVE_H
#include <string>
#include <cassert>
#include <list>
#include "fasta.h"
using namespace std;
#define DEFAULT_STABILITY_LIMIT 30
/**
* Compute a representative sequence from a list of sequences.
* The sequences are supposed to share a common juction.
*/
class RepresentativeComputer {
protected:
list<Sequence> &sequences;
bool is_computed;
Sequence representative;
public:
RepresentativeComputer(list<Sequence> &r);
/**
* @pre hasRepresentative()
* @return the representative sequence of the set of sequences.
* The representative meets the criteria given to compute().
* The label of the sequence is composed of the read labels used for that
* purpose, plus the positions that have been extracted.
*/
Sequence getRepresentative() const;
/**
* @return the input sequences we are working on
*/
list<Sequence>& getSequenceList() const;
/**
* @return true iff compute() has been called and the criteria have been met.
*/
bool hasRepresentative() const;
/**
* Compute the representative depending on the specified parameters.
* @param do_revcomp: true iff sequences may be coming from any strand, and
* therefore should be revcomp-ed
* @param min_cover: minimal number of reads supporting each position of the
* representative
* @param percent_cover: minimal percent of the maximal coverage that is
* admissible for covering the representative.
* Any position is covered by at least percent_cover %
* of the maximal coverage.
*/
virtual void compute(bool do_revcomp, size_t min_cover, float percent_cover) = 0;
};
/**
* The representative is computed from the list of sequences. Those sequences
* must all share a common factor whose length is greater or equal to k.
*/
class KmerRepresentativeComputer : public RepresentativeComputer {
protected:
int k;
int stability_limit;
public:
KmerRepresentativeComputer(list<Sequence> &r, int k);
// Getters, setters
int getK() const;
/**
* Sets the length of the k-mer used for computing the representative
*/
void setK(int k);
int getStabilityLimit() const;
/**
* @param limit: maximal number of iterations to be performed before reaching
* stability. If after limit number of iterations, the length
* of the representative didn't improve, we keep it.
*/
void setStabilityLimit(int limit);
// Actions
void compute(bool do_revcomp, size_t min_cover, float percent_cover);
};
#endif
...@@ -519,7 +519,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J, ...@@ -519,7 +519,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
//overlap VJ //overlap VJ
if(right-left <=0){ if(right-left <=0){
int b_r, b_l; int b_r, b_l;
int overlap=left-left2+1; int overlap=left-right+1;
string seq_left = sequence.substr(0, left+1); string seq_left = sequence.substr(0, left+1);
string seq_right = sequence.substr(right); string seq_right = sequence.substr(right);
......
...@@ -16,22 +16,32 @@ int seed_weight(const string &seed) ...@@ -16,22 +16,32 @@ int seed_weight(const string &seed)
return count(seed.begin(), seed.end(), SEED_YES); return count(seed.begin(), seed.end(), SEED_YES);
} }
char spaced_buf[MAX_SEED_SIZE+1];
string spaced(const string &input, const string &seed) { string spaced(const string &input, const string &seed) {
// #ifdef STATIC_SPACED_SEED_FOURTEEN
// return input.substr(0, 7) + input.substr(8, 7);
// #endif
#ifdef NO_SPACED_SEEDS #ifdef NO_SPACED_SEEDS
return input ; return input ;
#endif #endif
string output = ""; int j = 0 ;
// cout << input << endl << seed << endl ; // cout << input << endl << seed << endl ;
assert(input.length() == seed.length()); assert(input.length() == seed.length());
for (size_t i = 0; i < input.length(); i++) for (size_t i = 0; i < input.length(); i++)
if (seed[i] == SEED_YES) if (seed[i] == SEED_YES)
output += input[i] ; spaced_buf[j++] = input[i] ;
return output ; spaced_buf[j] = (char) 0;
// cout << spaced_buf << "|" << string(spaced_buf) << "|" << input << "|" << endl ;
return string(spaced_buf);
} }
......
#ifndef TOOLS_H #ifndef TOOLS_H
#define TOOLS_H #define TOOLS_H
#define MAX_SEED_SIZE 50 // Spaced seed buffer
#define FIRST_POS 0 // Numbering of the base pairs #define FIRST_POS 0 // Numbering of the base pairs
#include <sstream> #include <sstream>
......
>seq1 TRG
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq2
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq3
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq4
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
\ No newline at end of file
!LAUNCH: ../../../vidjil -c clones -G ../../../germline/TRG -r 1 -R 1 bug20130617.fa
$ Bug with J deletion being 122
# According to IMGT :
# V10*01 ends at gtttactactgtgctgcgtgg
# JP1*01 starts at ataccactggttg
# N1 is G
1:Clone #001 – 4 reads – 100%
!LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -r 3 -R 5 -d ../../data/Stanford_S22.fasta !LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -r 3 -R 5 -d ../../data/Stanford_S22.fasta
$ Junction extractions $ Window extractions
1:found 3867 junctions in 8219 segments 1:found 3867 windows in 8219 segments
$ Junction threshold $ Window threshold
1:keep 51 junctions in 2386 reads 1:keep 51 windows in 2386 reads
$ Clones output $ Clones output
1:Clone #001 – 630 reads 1:Clone #001 – 630 reads
!LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 1 -d ../../data/clones_simul.fa !LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 1 -d ../../data/clones_simul.fa
$ Junction extractions $ Junction extractions
1:found 20 junctions in 47 segments 1:found 20 50-windows in 47 segments
$ No clustering $ No clustering
1:==> 20 clones 1:==> 20 clones
......
!LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 5 -n 5 -d ../../data/clones_simul.fa !LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 5 -n 5 -d ../../data/clones_simul.fa
$ Junction extractions $ Window extractions
1:found 20 junctions in 47 segments 1:found 20 50-windows in 47 segments
$ Some clustering $ Some clustering
1:==> 2 clones 1:==> 2 clones
......
!LAUNCH: ../../vidjil -G ../../germline/IGH -d -c segment ../../data/segment_simul.fa | grep '^>' !LAUNCH: ../../vidjil -G ../../germline/IGH -d -c segment ../../data/segment_simul.fa | grep '^>'
$ First sequence, easy segmentation (no error, few deletions at the junctions, small N) $ First sequence, easy segmentation (no error, few deletions at the windows, small N)
# 72 77 89 94 # 72 77 89 94
# | | | | # | | | |
# seq : 61 attactgtgcgaCTTCataaccggaaccaGTTAtggttcgactcctggggccaaggaaccctggtcaccgtctcctcag # seq : 61 attactgtgcgaCTTCataaccggaaccaGTTAtggttcgactcctggggccaaggaaccctggtcaccgtctcctcag
......
...@@ -48,7 +48,7 @@ fi ...@@ -48,7 +48,7 @@ fi
debug() { debug() {
if [ ! -z "$DEBUG" ]; then if [ ! -z "$DEBUG" ]; then
echo $* >&2 echo "$*" >&2
fi fi
} }
...@@ -131,8 +131,8 @@ while read line; do ...@@ -131,8 +131,8 @@ while read line; do
skip=0 skip=0
know_to_fail=0 know_to_fail=0
pattern=$(cut -d: -f2- <<< $line) pattern=$(cut -d: -f2- <<< "$line")
nb_hits=$(cut -d: -f1 <<< $line) nb_hits=$(cut -d: -f1 <<< "$line")
if [ ${nb_hits:0:1} == "s" ]; then if [ ${nb_hits:0:1} == "s" ]; then
skip=1 # We skip the test if it fails skip=1 # We skip the test if it fails
...@@ -141,7 +141,7 @@ while read line; do ...@@ -141,7 +141,7 @@ while read line; do
know_to_fail=1 # We know the test fails, but don't fail globally know_to_fail=1 # We know the test fails, but don't fail globally
nb_hits=${nb_hits:1} nb_hits=${nb_hits:1}
fi fi
debug "Grepping \"$pattern\" in $FILE_TO_GREP" debug "Grepping $pattern in $FILE_TO_GREP"
if [ $(grep -cE "$pattern" $FILE_TO_GREP) -eq $nb_hits -o $skip -eq 1 ]; then if [ $(grep -cE "$pattern" $FILE_TO_GREP) -eq $nb_hits -o $skip -eq 1 ]; then
if [ $know_to_fail -eq 1 ]; then if [ $know_to_fail -eq 1 ]; then
echo "Warning: test $test_nb should have failed, but has not!" >&2 echo "Warning: test $test_nb should have failed, but has not!" >&2
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
!LOG: stanford.log !LOG: stanford.log
$ Parses IGHV.fa germline $ Parses IGHV.fa germline
1: 94301 bp in 324 sequences 1: 99341 bp in 341 sequences
$ Parses IGHD.fa germline $ Parses IGHD.fa germline
1: 1070 bp in 44 sequences 1: 1070 bp in 44 sequences
...@@ -10,6 +10,6 @@ $ Parses IGHD.fa germline ...@@ -10,6 +10,6 @@ $ Parses IGHD.fa germline
$ Parses germline/IGHJ.fa $ Parses germline/IGHJ.fa
1: 701 bp in 13 sequences 1: 701 bp in 13 sequences
$ Find the good number of junctions in Stanford S22 $ Find the good number of windows in Stanford S22
1: found 10087 junctions in 12317 segments 1: found 10087 50-windows in 12317 segments
#include "core/read_chooser.h"
#include "core/read_score.h"
#include "core/fasta.h"
void testChooser() {
list<Sequence> reads = Fasta("../../data/test1.fa").getAll();