Une MAJ de sécurité est nécessaire sur notre version actuelle. Elle sera effectuée lundi 02/08 entre 12h30 et 13h. L'interruption de service devrait durer quelques minutes (probablement moins de 5 minutes).

Commit de4c48e0 authored by Mathieu Giraud's avatar Mathieu Giraud
Browse files

Vidjil: release 2013.07

	* New selection of representative read (core/read_chooser.cpp)
	* Faster spaced seed computation (core/tools.cpp)
	* New unit tests
	* Bugs closed
parent 9387e03a
......@@ -9,6 +9,7 @@ test: all
should: all
@echo
@echo "*** Launching .should_get tests..."
src/tests/should-to-tap.sh src/tests/bugs/bug20130617.should_get
src/tests/should-to-tap.sh src/tests/stanford.should_get
src/tests/should-to-tap.sh src/tests/clones_simul.should_get
src/tests/should-to-tap.sh src/tests/clones_simul_cluster.should_get
......
>seq1
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
>seq2
AAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA
TAGGGCGCGATCGACTTC
>seq3
ATCGGTACACAATCGAGCATGTAGGCTACACGGTA
TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTT
2013-04-18 The Vidjil Team
2013-07-03 The Vidjil Team
* New selection of representative read (core/read_chooser.cpp)
* Faster spaced seed computation (core/tools.cpp)
* New unit tests
* Bugs closed
2013-04-18 The Vidjil Team
* First public release
......
......@@ -45,7 +45,7 @@ make data
# Immunol, 184(12), 6986–92.
make germline
# get IMGT germline databases -- you have to agree to IMGT license:
# get IMGT germline databases (IMGT/GENE-DB) -- you have to agree to IMGT license:
# academic research only, provided that it is referred to IMGT®,
# and cited as "IMGT®, the international ImMunoGeneTics information system®
# http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France).
......
#!/bin/sh
cd $(dirname $0)
wget -O - http://www.imgt.org/download/GENE-DB/GENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py
wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py
......@@ -2,7 +2,7 @@
import sys
# Parse lines such as:
# Parse lines in IMGT/GENE-DB such as:
# >M12949|TRGV1*01|Homo sapiens|ORF|...
open_files = {}
......
CC=g++
OPTIM=-O2 -DNO_SPACED_SEEDS
OPTIM=-O2
CFLAGS=-W -Wall $(OPTIM) $(DEBUG)
LDFLAGS=
EXEC=vidjil kmer kmer_count cut count detailed_affect
......@@ -12,6 +12,18 @@ BINDIR=..
v: vidjil
spaced: cleanspaced
make
nospaced: cleanspaced
make OPTIM="-O2 -DNO_SPACED_SEEDS"
cleanspaced:
rm vidjil.o core/tools.o
all: $(EXEC)
debug:
......
......@@ -58,6 +58,15 @@ Fasta::Fasta(const string &input,
}
int Fasta::size() const{ return (int)reads.size(); }
/**
 * Copy every sequence held by this object into a list.
 * The order is the one given by read(0), read(1), ..., read(size()-1).
 * @return a freshly built list of Sequence copies
 */
list<Sequence> Fasta::getAll() const {
  // Named differently from the 'reads' member so that it is not shadowed.
  list<Sequence> all_sequences;
  const int total = size();
  int index = 0;
  while (index < total) {
    all_sequences.push_back(read(index));
    ++index;
  }
  return all_sequences;
}
const string& Fasta::label(int index) const{ return reads[index].label; }
const string& Fasta::label_full(int index) const{ return reads[index].label_full; }
const Sequence& Fasta::read(int index) const {return reads[index];}
......
......@@ -4,6 +4,7 @@
#include <istream>
#include <string>
#include <vector>
#include <list>
#include "tools.h"
using namespace std;
......@@ -42,6 +43,11 @@ public:
ostream &out=cout);
int size() const;
/**
* Get all the sequences from the FASTA file
* @return a list of sequences in the same order as in the input file
*/
list<Sequence> getAll() const;
const string& label(int index) const;
const string& label_full(int index) const;
const Sequence &read(int index) const;
......
......@@ -5,29 +5,22 @@
using namespace std;
ReadChooser::ReadChooser(list<Sequence> &r, VirtualReadScore &scorer) {
float best_score = -1;
float current_score;
for (list <Sequence>::const_iterator it = r.begin(); it != r.end(); ++it) {
current_score = scorer.getScore(it->sequence);
if (current_score > best_score) {
best_score = current_score;
best_sequence = *it;
}
scores[it->sequence] = scorer.getScore(it->sequence);
}
// vector<Sequence> test(r.begin(), r.end());
// sort(test.begin(), test.end(), *this);
// reads = list<Sequence>(test.begin(), test.end());
reads.assign(r.begin(), r.end());
sort(reads.begin(), reads.end(), *this);
}
Sequence ReadChooser::getBest() const{
return best_sequence;
return reads[0];
}
// list<Sequence> ReadChooser::getSorted() const {
// return reads;
// }
// Return a copy of the i-th entry of the sorted 'reads' container.
// The index is 1-based (i == 1 maps to reads[0]); no bounds check is done
// here, so the caller must ensure 1 <= i <= reads.size().
Sequence ReadChooser::getithBest(size_t i) const {
return reads[i-1];
}
// bool ReadChooser::operator()(Sequence first, Sequence second) {
// return scorer.getScore(first.sequence) > scorer.getScore(second.sequence);
// }
// Comparison functor: true when the score stored for 'first' is strictly
// greater than the one stored for 'second', i.e. higher scores sort first.
// Scores are looked up by sequence content in the 'scores' map; note that
// map::operator[] inserts a default (0) entry for a sequence not yet present.
bool ReadChooser::operator()(Sequence first, Sequence second) {
return scores[first.sequence] > scores[second.sequence];
}
......@@ -3,7 +3,7 @@
#include <list>
#include "fasta.h"
#include "read_score.h"
#include <map>
/**
* This class aims at choosing the best read among a group of read.
......@@ -16,17 +16,30 @@
class ReadChooser {
private:
Sequence best_sequence;
vector<Sequence> reads;
map<string, float> scores;
public:
ReadChooser(list<Sequence> &r, VirtualReadScore &scorer);
/**
* @return the best sequence among the list of sequences that have been
* given to the object
*/
Sequence getBest() const;
/**
* @pre i >= 1 && i <= total number of sequences on the input
* @param i: starts at 1
* @return the i-th best scored sequence
*/
Sequence getithBest(size_t i) const;
/**
* A comparison based on scorer of the two sequences.
*/
bool operator()(Sequence first, Sequence second);
};
#endif
......@@ -53,3 +53,14 @@ void KmerAffectReadScore::setUnambiguousScore(float score) {
void KmerAffectReadScore::setUnknownScore(float score) {
unknown_score = score;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////// ReadLengthScore ///////////////////////////////
////////////////////////////////////////////////////////////////////////////////
ReadLengthScore::ReadLengthScore(){}
float ReadLengthScore::getScore(const string &sequence) const {
return sequence.size();
}
......@@ -54,4 +54,18 @@ public:
void setUnambiguousScore(float score) ;
void setUnknownScore(float score) ;
};
/**
* A simple implementation of VirtualReadScore.
* The score of a read is its length: the longer the read, the higher the score.
*/
class ReadLengthScore: public VirtualReadScore {
public:
/**
* Build a length-based scorer. It takes no parameter.
*/
ReadLengthScore();
/**
* @param sequence: the read to score
* @return the sequence length
*/
float getScore(const string &sequence) const;
};
#endif
#include "representative.h"
#include "kmerstore.h"
#include "read_score.h"
#include "read_chooser.h"
#include <iostream>
using namespace std;
RepresentativeComputer::RepresentativeComputer(list<Sequence> &r)
:sequences(r),is_computed(false),representative() {
}
Sequence RepresentativeComputer::getRepresentative() const{
assert(hasRepresentative());
return representative;
}
list<Sequence>& RepresentativeComputer::getSequenceList() const{
return sequences;
}
bool RepresentativeComputer::hasRepresentative() const{
return is_computed;
}
/** @return the k-mer length currently used */
int KmerRepresentativeComputer::getK() const {
  return this->k;
}

/** Change the k-mer length used by the next call to compute(). */
void KmerRepresentativeComputer::setK(int k) {
  // The parameter hides the member, hence the explicit this->
  this->k = k;
}

/** @return the current stability limit */
int KmerRepresentativeComputer::getStabilityLimit() const {
  return this->stability_limit;
}

/** Change the stability limit used by the next call to compute(). */
void KmerRepresentativeComputer::setStabilityLimit(int limit) {
  this->stability_limit = limit;
}

/**
 * @param r: the sequences to work on
 * @param k: the k-mer length
 * The stability limit starts at DEFAULT_STABILITY_LIMIT.
 */
KmerRepresentativeComputer::KmerRepresentativeComputer(list<Sequence> &r,
                                                       int k)
  : RepresentativeComputer(r),
    k(k),
    stability_limit(DEFAULT_STABILITY_LIMIT)
{
}
/**
 * Compute the representative of the sequences.
 *
 * All the sequences are indexed by their k-mers. The sequences are then
 * visited from the longest to the shortest; in each of them we look for the
 * longest run of consecutive k-mers whose count in the index is both
 *   - at least min_cover, and
 *   - at least percent_cover * (number of input sequences).
 * The representative is the substring covered by the longest such run. Its
 * label is the label of the chosen read followed by "-[start,end]", the
 * positions (inclusive) of the extracted substring in that read.
 *
 * The traversal stops as soon as the current read is not longer than the best
 * run found so far, or when stability_limit reads have been visited since the
 * last improvement.
 *
 * @param do_revcomp: forwarded to the index creation
 * @param min_cover: minimal count a k-mer must have to be part of a run
 * @param percent_cover: fraction of the number of input sequences a k-mer
 *                       count must reach to be part of a run
 * @post hasRepresentative() iff at least one qualifying k-mer was found
 */
void KmerRepresentativeComputer::compute(bool do_revcomp, size_t min_cover,
                                         float percent_cover) {
  is_computed = false;

  // First create an index on the set of reads
  IKmerStore<Kmer> *index = KmerStoreFactory::createIndex<Kmer>(getK(), do_revcomp);

  // Add sequences to the index
  for (list<Sequence>::iterator it = sequences.begin(); it != sequences.end(); ++it) {
    index->insert(it->sequence, it->label);
  }

  size_t max = sequences.size();

  // Create a read chooser to have the sequences sorted by length.
  // The scorer is only needed while the chooser is being built, a local
  // object is therefore enough (no new/delete).
  ReadLengthScore rlc;
  ReadChooser rc(sequences, rlc);

  // Traverse the sequences to get the desired representative
  size_t pos_longest_run = 0;
  size_t length_longest_run = 0;
  size_t seq_index_longest_run = 1;
  Sequence sequence_longest_run;

  for (size_t seq = 1;
       seq <= sequences.size() && seq <= seq_index_longest_run + stability_limit;
       seq++) {
    Sequence sequence = rc.getithBest(seq);

    // Reads come by decreasing length: a read that is not longer than the
    // best run cannot improve it, and neither can the following ones.
    if (sequence.sequence.size() <= length_longest_run) {
      break;
    }

    // Presumably one entry per k-mer starting position of the read
    // (TODO confirm against IKmerStore::getResults).
    vector<Kmer> counts = index->getResults(sequence.sequence);

    for (size_t i = 0; i < counts.size(); i++) {
      // Remember where the run starts: this is the position of the
      // representative in the read if this run turns out to be the best.
      const size_t run_start = i;
      size_t length_run = 0;

      // Search the longest "run" of consecutive k-mers that are sufficiently
      // expressed in the read collection.
      while (i < counts.size()
             && counts[i].count >= min_cover
             && counts[i].count >= max*percent_cover) {
        length_run++;
        i++;
      }

      if (length_run)
        // Take into account the whole k-mer, not just the starting positions
        length_run += getK() - 1;

      if (length_run > length_longest_run) {
        length_longest_run = length_run;
        // The former expression, i - (length_run - getK() - 1), pointed two
        // positions after the actual start of the run.
        pos_longest_run = run_start;
        sequence_longest_run = sequence;
        seq_index_longest_run = seq;
      }
    }
  }

  if (length_longest_run) {
    is_computed = true;
    representative = sequence_longest_run;
    representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run);
    representative.label += "-[" + string_of_int(pos_longest_run) + ","
      + string_of_int(pos_longest_run + length_longest_run - 1) + "]";
  }

  delete index;
}
#ifndef REPRESENTATIVE_H
#define REPRESENTATIVE_H
#include <string>
#include <cassert>
#include <list>
#include "fasta.h"
using namespace std;
#define DEFAULT_STABILITY_LIMIT 30
/**
 * Compute a representative sequence from a list of sequences.
 * The sequences are supposed to share a common junction.
 *
 * This is an abstract base class: subclasses implement compute().
 * The list of sequences is held by reference, it must therefore outlive
 * this object.
 */
class RepresentativeComputer {
 protected:
  list<Sequence> &sequences;
  bool is_computed;
  Sequence representative;

 public:
  RepresentativeComputer(list<Sequence> &r);

  /**
   * Virtual destructor, so that an object of a derived class can safely be
   * destroyed through a pointer to RepresentativeComputer.
   */
  virtual ~RepresentativeComputer() {}

  /**
   * @pre hasRepresentative()
   * @return the representative sequence of the set of sequences.
   *         The representative meets the criteria given to compute().
   *         The label of the sequence is composed of the read labels used for that
   *         purpose, plus the positions that have been extracted.
   */
  Sequence getRepresentative() const;

  /**
   * @return the input sequences we are working on
   */
  list<Sequence>& getSequenceList() const;

  /**
   * @return true iff compute() has been called and the criteria have been met.
   */
  bool hasRepresentative() const;

  /**
   * Compute the representative depending on the specified parameters.
   * @param do_revcomp: true iff sequences may be coming from any strand, and
   *                    therefore should be revcomp-ed
   * @param min_cover: minimal number of reads supporting each position of the
   *                   representative
   * @param percent_cover: minimal percent of the maximal coverage that is
   *                       admissible for covering the representative.
   *                       Any position is covered by at least percent_cover %
   *                       of the maximal coverage.
   */
  virtual void compute(bool do_revcomp, size_t min_cover, float percent_cover) = 0;
};
/**
* The representative is computed from the list of sequences. Those sequences
* must all share a common factor whose length is greater or equal to k.
*/
class KmerRepresentativeComputer : public RepresentativeComputer {
protected:
// Length of the k-mers used for computing the representative
int k;
// See setStabilityLimit(); initialised to DEFAULT_STABILITY_LIMIT
int stability_limit;
public:
/**
* @param r: the sequences to compute a representative from
* @param k: the length of the k-mers
*/
KmerRepresentativeComputer(list<Sequence> &r, int k);
// Getters, setters
/**
* @return the length of the k-mer used for computing the representative
*/
int getK() const;
/**
* Sets the length of the k-mer used for computing the representative
*/
void setK(int k);
/**
* @return the stability limit (see setStabilityLimit())
*/
int getStabilityLimit() const;
/**
* @param limit: maximal number of iterations to be performed before reaching
* stability. If after limit number of iterations, the length
* of the representative didn't improve, we keep it.
*/
void setStabilityLimit(int limit);
// Actions
/**
* K-mer based implementation of RepresentativeComputer::compute().
*/
void compute(bool do_revcomp, size_t min_cover, float percent_cover);
};
#endif
......@@ -519,7 +519,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J,
//overlap VJ
if(right-left <=0){
int b_r, b_l;
int overlap=left-left2+1;
int overlap=left-right+1;
string seq_left = sequence.substr(0, left+1);
string seq_right = sequence.substr(right);
......
......@@ -16,22 +16,32 @@ int seed_weight(const string &seed)
return count(seed.begin(), seed.end(), SEED_YES);
}
char spaced_buf[MAX_SEED_SIZE+1];
string spaced(const string &input, const string &seed) {
// #ifdef STATIC_SPACED_SEED_FOURTEEN
// return input.substr(0, 7) + input.substr(8, 7);
// #endif
#ifdef NO_SPACED_SEEDS
return input ;
#endif
string output = "";
int j = 0 ;
// cout << input << endl << seed << endl ;
assert(input.length() == seed.length());
for (size_t i = 0; i < input.length(); i++)
if (seed[i] == SEED_YES)
output += input[i] ;
spaced_buf[j++] = input[i] ;
return output ;
spaced_buf[j] = (char) 0;
// cout << spaced_buf << "|" << string(spaced_buf) << "|" << input << "|" << endl ;
return string(spaced_buf);
}
......
#ifndef TOOLS_H
#define TOOLS_H
#define MAX_SEED_SIZE 50 // Spaced seed buffer
#define FIRST_POS 0 // Numbering of the base pairs
#include <sstream>
......
>seq1 TRG
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq2
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq3
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
>seq4
ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC
\ No newline at end of file
!LAUNCH: ../../../vidjil -c clones -G ../../../germline/TRG -r 1 -R 1 bug20130617.fa
$ Bug with J deletion being 122
# According to IMGT :
# V10*01 ends at gtttactactgtgctgcgtgg
# JP1*01 starts at ataccactggttg
# N1 is G
1:Clone #001 – 4 reads – 100%
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment