Commit de4c48e0 authored by Mathieu Giraud's avatar Mathieu Giraud

Vidjil: release 2013.07

	* New selection of representative read (core/read_chooser.cpp)
	* Faster spaced seed computation (core/tools.cpp)
	* New unit tests
	* Bugs closed
parent 9387e03a
......@@ -9,6 +9,7 @@ test: all
should: all
@echo
@echo "*** Launching .should_get tests..."
src/tests/should-to-tap.sh src/tests/bugs/bug20130617.should_get
src/tests/should-to-tap.sh src/tests/stanford.should_get
src/tests/should-to-tap.sh src/tests/clones_simul.should_get
src/tests/should-to-tap.sh src/tests/clones_simul_cluster.should_get
......
>seq1
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
>seq2
AAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
TAGGGCGCGATCGACTTC
>seq3
ATCGGTACACAATCGAGCATGTAGGCTACACGGTA
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTT
2013-04-18 The Vidjil Team
2013-07-03 The Vidjil Team
* New selection of representative read (core/read_chooser.cpp)
* Faster spaced seed computation (core/tools.cpp)
* New unit tests
* Bugs closed
2013-04-18 The Vidjil Team
* First public release
......
......@@ -45,7 +45,7 @@ make data
# Immunol, 184(12), 6986–92.
make germline
# get IMGT germline databases -- you have to agree to IMGT license:
# get IMGT germline databases (IMGT/GENE-DB) -- you have to agree to IMGT license:
# academic research only, provided that it is referred to IMGT®,
# and cited as "IMGT®, the international ImMunoGeneTics information system®
# http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France).
......
#!/bin/sh
cd $(dirname $0)
wget -O - http://www.imgt.org/download/GENE-DB/GENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py
wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py
......@@ -2,7 +2,7 @@
import sys
# Parse lines such as:
# Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|...
open_files = {}
......
CC=g++
OPTIM=-O2 -DNO_SPACED_SEEDS
OPTIM=-O2
CFLAGS=-W -Wall $(OPTIM) $(DEBUG)
LDFLAGS=
EXEC=vidjil kmer kmer_count cut count detailed_affect
......@@ -12,6 +12,18 @@ BINDIR=..
v: vidjil
spaced: cleanspaced
make
nospaced: cleanspaced
make OPTIM="-O2 -DNO_SPACED_SEEDS"
cleanspaced:
rm vidjil.o core/tools.o
all: $(EXEC)
debug:
......
......@@ -58,6 +58,15 @@ Fasta::Fasta(const string &input,
}
int Fasta::size() const{ return (int)reads.size(); }
list<Sequence> Fasta::getAll() const {
list<Sequence> reads;
for (int i=0; i < size(); i++) {
reads.push_back(read(i));
}
return reads;
}
const string& Fasta::label(int index) const{ return reads[index].label; }
const string& Fasta::label_full(int index) const{ return reads[index].label_full; }
const Sequence& Fasta::read(int index) const {return reads[index];}
......
......@@ -4,6 +4,7 @@
#include <istream>
#include <string>
#include <vector>
#include <list>
#include "tools.h"
using namespace std;
......@@ -42,6 +43,11 @@ public:
ostream &out=cout);
int size() const;
/**
* Get all the sequences from the FASTA file
* @return a list of sequences in the same order as in the input file
*/
list<Sequence> getAll() const;
const string& label(int index) const;
const string& label_full(int index) const;
const Sequence &read(int index) const;
......
......@@ -5,29 +5,22 @@
using namespace std;
ReadChooser::ReadChooser(list<Sequence> &r, VirtualReadScore &scorer) {
float best_score = -1;
float current_score;
for (list <Sequence>::const_iterator it = r.begin(); it != r.end(); ++it) {
current_score = scorer.getScore(it->sequence);
if (current_score > best_score) {
best_score = current_score;
best_sequence = *it;
}
scores[it->sequence] = scorer.getScore(it->sequence);
}
// vector<Sequence> test(r.begin(), r.end());
// sort(test.begin(), test.end(), *this);
// reads = list<Sequence>(test.begin(), test.end());
reads.assign(r.begin(), r.end());
sort(reads.begin(), reads.end(), *this);
}
Sequence ReadChooser::getBest() const{
return best_sequence;
return reads[0];
}
// list<Sequence> ReadChooser::getSorted() const {
// return reads;
// }
Sequence ReadChooser::getithBest(size_t i) const {
return reads[i-1];
}
// bool ReadChooser::operator()(Sequence first, Sequence second) {
// return scorer.getScore(first.sequence) > scorer.getScore(second.sequence);
// }
bool ReadChooser::operator()(Sequence first, Sequence second) {
return scores[first.sequence] > scores[second.sequence];
}
......@@ -3,7 +3,7 @@
#include <list>
#include "fasta.h"
#include "read_score.h"
#include <map>
/**
* This class aims at choosing the best read among a group of read.
......@@ -16,17 +16,30 @@
class ReadChooser {
private:
Sequence best_sequence;
vector<Sequence> reads;
map<string, float> scores;
public:
ReadChooser(list<Sequence> &r, VirtualReadScore &scorer);
/**
* @return the best sequence among the list of sequences that have been
* given to the object
*/
Sequence getBest() const;
/**
* @pre i >= 1 && i <= total number of sequences on the input
* @param i: starts at 1
* @return the i-th best scored sequence
*/
Sequence getithBest(size_t i) const;
/**
* A comparison based on scorer of the two sequences.
*/
bool operator()(Sequence first, Sequence second);
};
#endif
......@@ -53,3 +53,14 @@ void KmerAffectReadScore::setUnambiguousScore(float score) {
void KmerAffectReadScore::setUnknownScore(float score) {
unknown_score = score;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////// ReadLengthScore ///////////////////////////////
////////////////////////////////////////////////////////////////////////////////
ReadLengthScore::ReadLengthScore(){}
float ReadLengthScore::getScore(const string &sequence) const {
return sequence.size();
}
......@@ -54,4 +54,18 @@ public:
void setUnambiguousScore(float score) ;
void setUnknownScore(float score) ;
};
/**
* A simple implementation of VirtualReadScore.
* The score is the length of the read
*/
class ReadLengthScore: public VirtualReadScore {
public:
ReadLengthScore();
/**
* @return the sequence length
*/
float getScore(const string &sequence) const;
};
#endif
#include "representative.h"
#include "kmerstore.h"
#include "read_score.h"
#include "read_chooser.h"
#include <iostream>
using namespace std;
RepresentativeComputer::RepresentativeComputer(list<Sequence> &r)
:sequences(r),is_computed(false),representative() {
}
Sequence RepresentativeComputer::getRepresentative() const{
assert(hasRepresentative());
return representative;
}
list<Sequence>& RepresentativeComputer::getSequenceList() const{
return sequences;
}
bool RepresentativeComputer::hasRepresentative() const{
return is_computed;
}
int KmerRepresentativeComputer::getK() const{
return k;
}
void KmerRepresentativeComputer::setK(int k) {
this->k = k;
}
int KmerRepresentativeComputer::getStabilityLimit() const {
return stability_limit;
}
void KmerRepresentativeComputer::setStabilityLimit(int limit) {
stability_limit = limit;
}
KmerRepresentativeComputer::KmerRepresentativeComputer(list<Sequence> &r,
int k)
:RepresentativeComputer(r),k(k),stability_limit(DEFAULT_STABILITY_LIMIT){}
void KmerRepresentativeComputer::compute(bool do_revcomp, size_t min_cover,
float percent_cover) {
is_computed = false;
// First create an index on the set of reads
IKmerStore<Kmer> *index = KmerStoreFactory::createIndex<Kmer>(getK(), do_revcomp);
// Add sequences to the index
for (list<Sequence>::iterator it=sequences.begin(); it != sequences.end(); ++it) {
index->insert(it->sequence, it->label);
}
size_t max = sequences.size();
// Create a read chooser to have the sequences sorted by length
ReadLengthScore *rlc = new ReadLengthScore();
ReadChooser rc(sequences, *rlc);
delete rlc;
// Traverse the sequences to get the desired representative
size_t pos_longest_run = 0;
size_t length_longest_run = 0;
size_t seq_index_longest_run = 1;
Sequence sequence_longest_run;
for (size_t seq = 1; seq <= sequences.size() && seq <= seq_index_longest_run + stability_limit ; seq++) {
Sequence sequence = rc.getithBest(seq);
if (sequence.sequence.size() <= length_longest_run) {
break;
}
vector<Kmer> counts = index->getResults(sequence.sequence);
for (size_t i =0; i < counts.size(); i++) {
size_t length_run = 0;
// Search the longest "run" of consecutive k-mers that are sufficiently
// expressed in the read collection.
while (i < counts.size()
&& counts[i].count >= min_cover
&& counts[i].count >= max*percent_cover) {
length_run++;
i++;
}
if (length_run)
// Take into account the whole k-mer, not just the starting positions
length_run += getK() - 1;
if (length_run > length_longest_run) {
length_longest_run = length_run;
pos_longest_run = i - (length_run - getK() - 1);
sequence_longest_run = sequence;
seq_index_longest_run = seq;
}
}
}
if (length_longest_run) {
is_computed = true;
representative = sequence_longest_run;
representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
representative.label += "-[" + string_of_int(pos_longest_run) + ","
+ string_of_int(pos_longest_run + length_longest_run - 1) + "]";
}
delete index;
}
#ifndef REPRESENTATIVE_H
#define REPRESENTATIVE_H
#include <string>
#include <cassert>
#include <list>
#include "fasta.h"
using namespace std;
#define DEFAULT_STABILITY_LIMIT 30
/**
* Compute a representative sequence from a list of sequences.
* The sequences are supposed to share a common juction.
*/
class RepresentativeComputer {
protected:
list<Sequence> &sequences;
bool is_computed;
Sequence representative;
public:
RepresentativeComputer(list<Sequence> &r);
/**
* @pre hasRepresentative()
* @return the representative sequence of the set of sequences.
* The representative meets the criteria given to compute().
* The label of the sequence is composed of the read labels used for that
* purpose, plus the positions that have been extracted.
*/
Sequence getRepresentative() const;
/**
* @return the input sequences we are working on
*/
list<Sequence>& getSequenceList() const;
/**
* @return true iff compute() has been called and the criteria have been met.
*/
bool hasRepresentative() const;
/**
* Compute the representative depending on the specified parameters.
* @param do_revcomp: true iff sequences may be coming from any strand, and
* therefore should be revcomp-ed
* @param min_cover: minimal number of reads supporting each position of the
* representative
* @param percent_cover: minimal percent of the maximal coverage that is
* admissible for covering the representative.
* Any position is covered by at least percent_cover %
* of the maximal coverage.
*/
virtual void compute(bool do_revcomp, size_t min_cover, float percent_cover) = 0;
};
/**
* The representative is computed from the list of sequences. Those sequences
* must all share a common factor whose length is greater or equal to k.
*/
class KmerRepresentativeComputer : public RepresentativeComputer {
protected:
int k;
int stability_limit;
public:
KmerRepresentativeComputer(list<Sequence> &r, int k);
// Getters, setters
int getK() const;
/**
* Sets the length of the k-mer used for computing the representative
*/
void setK(int k);
int getStabilityLimit() const;
/**
* @param limit: maximal number of iterations to be performed before reaching
* stability. If after limit number of iterations, the length
* of the representative didn't improve, we keep it.
*/
void setStabilityLimit(int limit);
// Actions
void compute(bool do_revcomp, size_t min_cover, float percent_cover);
};
#endif
......@@ -519,7 +519,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
//overlap VJ
if(right-left <=0){
int b_r, b_l;
int overlap=left-left2+1;
int overlap=left-right+1;
string seq_left = sequence.substr(0, left+1);
string seq_right = sequence.substr(right);
......
......@@ -16,22 +16,32 @@ int seed_weight(const string &seed)
return count(seed.begin(), seed.end(), SEED_YES);
}
char spaced_buf[MAX_SEED_SIZE+1];
string spaced(const string &input, const string &seed) {
// #ifdef STATIC_SPACED_SEED_FOURTEEN
// return input.substr(0, 7) + input.substr(8, 7);
// #endif
#ifdef NO_SPACED_SEEDS
return input ;
#endif
string output = "";
int j = 0 ;
// cout << input << endl << seed << endl ;
assert(input.length() == seed.length());
for (size_t i = 0; i < input.length(); i++)
if (seed[i] == SEED_YES)
output += input[i] ;
spaced_buf[j++] = input[i] ;
return output ;
spaced_buf[j] = (char) 0;
// cout << spaced_buf << "|" << string(spaced_buf) << "|" << input << "|" << endl ;
return string(spaced_buf);
}
......
#ifndef TOOLS_H
#define TOOLS_H
#define MAX_SEED_SIZE 50 // Spaced seed buffer
#define FIRST_POS 0 // Numbering of the base pairs
#include <sstream>
......
>seq1 TRG
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq2
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq3
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq4
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
\ No newline at end of file
!LAUNCH: ../../../vidjil -c clones -G ../../../germline/TRG -r 1 -R 1 bug20130617.fa
$ Bug with J deletion being 122
# According to IMGT :
# V10*01 ends at gtttactactgtgctgcgtgg
# JP1*01 starts at ataccactggttg
# N1 is G
1:Clone #001 – 4 reads – 100%
!LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -r 3 -R 5 -d ../../data/Stanford_S22.fasta
$ Junction extractions
1:found 3867 junctions in 8219 segments
$ Window extractions
1:found 3867 windows in 8219 segments
$ Junction threshold
1:keep 51 junctions in 2386 reads
$ Window threshold
1:keep 51 windows in 2386 reads
$ Clones output
1:Clone #001 – 630 reads
!LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 1 -d ../../data/clones_simul.fa
$ Junction extractions
1:found 20 junctions in 47 segments
1:found 20 50-windows in 47 segments
$ No clustering
1:==> 20 clones
......
!LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 5 -n 5 -d ../../data/clones_simul.fa
$ Junction extractions
1:found 20 junctions in 47 segments
$ Window extractions
1:found 20 50-windows in 47 segments
$ Some clustering
1:==> 2 clones
......
!LAUNCH: ../../vidjil -G ../../germline/IGH -d -c segment ../../data/segment_simul.fa | grep '^>'
$ First sequence, easy segmentation (no error, few deletions at the junctions, small N)
$ First sequence, easy segmentation (no error, few deletions at the windows, small N)
# 72 77 89 94
# | | | |
# seq : 61 attactgtgcgaCTTCataaccggaaccaGTTAtggttcgactcctggggccaaggaaccctggtcaccgtctcctcag
......
......@@ -48,7 +48,7 @@ fi
debug() {
if [ ! -z "$DEBUG" ]; then
echo $* >&2
echo "$*" >&2
fi
}
......@@ -131,8 +131,8 @@ while read line; do
skip=0
know_to_fail=0
pattern=$(cut -d: -f2- <<< $line)
nb_hits=$(cut -d: -f1 <<< $line)
pattern=$(cut -d: -f2- <<< "$line")
nb_hits=$(cut -d: -f1 <<< "$line")
if [ ${nb_hits:0:1} == "s" ]; then
skip=1 # We skip the test if it fails
......@@ -141,7 +141,7 @@ while read line; do
know_to_fail=1 # We know the test fails, but don't fail globally
nb_hits=${nb_hits:1}
fi
debug "Grepping \"$pattern\" in $FILE_TO_GREP"
debug "Grepping $pattern in $FILE_TO_GREP"
if [ $(grep -cE "$pattern" $FILE_TO_GREP) -eq $nb_hits -o $skip -eq 1 ]; then
if [ $know_to_fail -eq 1 ]; then
echo "Warning: test $test_nb should have failed, but has not!" >&2
......
......@@ -2,7 +2,7 @@
!LOG: stanford.log
$ Parses IGHV.fa germline
1: 94301 bp in 324 sequences
1: 99341 bp in 341 sequences
$ Parses IGHD.fa germline
1: 1070 bp in 44 sequences
......@@ -10,6 +10,6 @@ $ Parses IGHD.fa germline
$ Parses germline/IGHJ.fa
1: 701 bp in 13 sequences
$ Find the good number of junctions in Stanford S22
1: found 10087 junctions in 12317 segments
$ Find the good number of windows in Stanford S22
1: found 10087 50-windows in 12317 segments
#include "core/read_chooser.h"
#include "core/read_score.h"
#include "core/fasta.h"
void testChooser() {
list<Sequence> reads = Fasta("../../data/test1.fa").getAll();
ReadLengthScore rls;
ReadChooser rc(reads, rls);
TAP_TEST(rc.getithBest(1).label == "seq4", TEST_READ_CHOOSER_SORTED,
"First sequence is " << rc.getithBest(1).label);
TAP_TEST(rc.getithBest(1).label == rc.getBest().label
&& rc.getithBest(1).sequence == rc.getBest().sequence,
TEST_READ_CHOOSER_BEST,"");
TAP_TEST(rc.getithBest(2).label == "seq2", TEST_READ_CHOOSER_SORTED,
"Second sequence is " << rc.getithBest(1).label);
TAP_TEST(rc.getithBest(3).label == "seq1", TEST_READ_CHOOSER_SORTED,
"Third sequence is " << rc.getithBest(1).label);
TAP_TEST(rc.getithBest(4).label == "", TEST_READ_CHOOSER_SORTED,
"First sequence is " << rc.getithBest(1).label);
}
#include "core/fasta.h"
#include "core/representative.h"
void testRepresentative() {
list<Sequence> reads = Fasta("../../data/representative.fa").getAll();
KmerRepresentativeComputer krc(reads, 14);
krc.setStabilityLimit(0);
krc.compute(false, 1, 0.5);
Sequence representative = krc.getRepresentative();
TAP_TEST(representative.label.find("seq3") == 0, TEST_KMER_REPRESENTATIVE,
"If we take the first representative we should have seq3 (" << representative.label << " instead)");
krc.setStabilityLimit(1);
krc.compute(false, 1, 0.5);
representative = krc.getRepresentative();
TAP_TEST(representative.label.find("seq3") == 0, TEST_KMER_REPRESENTATIVE,
"When allowing one step before stability, we should still have seq3 (" << representative.label << " instead)");
krc.setStabilityLimit(2);
krc.compute(false, 1, 0.5);
representative = krc.getRepresentative();
TAP_TEST(representative.label.find("seq1") == 0, TEST_KMER_REPRESENTATIVE,
"When allowing two steps before stability, we should reach seq1 (" << representative.label << " instead)");
krc.compute(false, 4, 0.5);
TAP_TEST(! krc.hasRepresentative(), TEST_KMER_REPRESENTATIVE,
"When requiring 4 reads to support the representative, we should not find any solution.");
}
#include "core/read_score.h"
void testScore() {
// ReadLengthScore testing
ReadLengthScore rls;
TAP_TEST(rls.getScore("") == 0., TEST_LENGTH_SCORE,
"score should be 0, is " << rls.getScore(""));
TAP_TEST(rls.getScore("ATCGTTTACGTC") == 12., TEST_LENGTH_SCORE,
"score should be 12, is " << rls.getScore("ATCGTTTACGTC"));
TAP_TEST(rls.getScore("A") == 1., TEST_LENGTH_SCORE,
"score should be 1, is " << rls.getScore("A"));
}
......@@ -7,7 +7,9 @@
#include "testBugs.cpp"
#include "testCluster.cpp"
#include "testSegment.cpp"
#include "testScore.cpp"
#include "testChooser.cpp"
#include "testRepresentative.cpp"
int main(void) {
TAP_START(NB_TESTS);
......@@ -19,6 +21,9 @@ int main(void) {
testBugs();
testCluster();
testSegment();
testScore();
testChooser();
testRepresentative();
TAP_END_TEST_EXIT
}
\ No newline at end of file
}
......@@ -26,6 +26,16 @@ enum {
/* Cluster */
TEST_CLUSTER,
/* Score */
TEST_LENGTH_SCORE,
/* Chooser */
TEST_READ_CHOOSER_BEST,
TEST_READ_CHOOSER_SORTED,
/* Representative */
TEST_KMER_REPRESENTATIVE,
/* Bugs */
TEST_BUG_SEGMENTATION,
TEST_SEGMENT_POSITION,
......@@ -56,6 +66,12 @@ inline void declare_tests() {
RECORD_TAP_TEST(TEST_CLUSTER, "Test automatic clusterisation");
RECORD_TAP_TEST(TEST_LENGTH_SCORE, "Test ReadLengthScore getScore()");
RECORD_TAP_TEST(TEST_READ_CHOOSER_BEST, "Test getBest() in ReadChooser");
RECORD_TAP_TEST(TEST_READ_CHOOSER_SORTED, "Test getSorted() in ReadChooser");
RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE, "Test KmerRepresentativeComputer computations");
RECORD_TAP_TEST(TEST_BUG_SEGMENTATION, "Test segmentation bug");
RECORD_TAP_TEST(TEST_SEGMENT_POSITION, "Test V,D,J position");
}
......
......@@ -44,6 +44,7 @@
#include "core/html.h"
#include "core/mkdir.h"