From de4c48e073badcee6a2a0c8aef8243ade6684185 Mon Sep 17 00:00:00 2001 From: Mathieu Giraud Date: Wed, 3 Jul 2013 08:20:28 +0200 Subject: [PATCH] Vidjil: release 2013.07 * New selection of representative read (core/read_chooser.cpp) * Faster spaced seed computation (core/tools.cpp) * New unit tests * Bugs closed --- Makefile | 1 + data/representative.fa | 9 + doc/CHANGELOG | 9 +- doc/README | 2 +- germline/get-germline | 2 +- germline/split-from-imgt.py | 2 +- src/Makefile | 14 +- src/core/fasta.cpp | 9 + src/core/fasta.h | 6 + src/core/read_chooser.cpp | 29 +- src/core/read_chooser.h | 19 +- src/core/read_score.cpp | 11 + src/core/read_score.h | 14 + src/core/representative.cpp | 108 +++++ src/core/representative.h | 90 ++++ src/core/segment.cpp | 2 +- src/core/tools.cpp | 18 +- src/core/tools.h | 1 + src/tests/bugs/bug20130617.fa | 8 + src/tests/bugs/bug20130617.should_get | 10 + src/tests/clones_S22.should_get | 8 +- src/tests/clones_simul.should_get | 2 +- src/tests/clones_simul_cluster.should_get | 4 +- src/tests/segment_simul.should_get | 2 +- src/tests/should-to-tap.sh | 8 +- src/tests/stanford.should_get | 6 +- src/tests/testChooser.cpp | 26 ++ src/tests/testRepresentative.cpp | 31 ++ src/tests/testScore.cpp | 16 + src/tests/tests.cpp | 9 +- src/tests/tests.h | 16 + src/vidjil.cpp | 507 ++++++++++++---------- 32 files changed, 710 insertions(+), 289 deletions(-) create mode 100644 data/representative.fa create mode 100644 src/core/representative.cpp create mode 100644 src/core/representative.h create mode 100644 src/tests/bugs/bug20130617.fa create mode 100644 src/tests/bugs/bug20130617.should_get create mode 100644 src/tests/testChooser.cpp create mode 100644 src/tests/testRepresentative.cpp create mode 100644 src/tests/testScore.cpp diff --git a/Makefile b/Makefile index d46067b16..0479d5ec5 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ test: all should: all @echo @echo "*** Launching .should_get tests..." + src/tests/should-to-tap.sh src/tests/bugs/bug20130617.should_get src/tests/should-to-tap.sh src/tests/stanford.should_get src/tests/should-to-tap.sh src/tests/clones_simul.should_get src/tests/should-to-tap.sh src/tests/clones_simul_cluster.should_get diff --git a/data/representative.fa b/data/representative.fa new file mode 100644 index 000000000..3ce6daed6 --- /dev/null +++ b/data/representative.fa @@ -0,0 +1,9 @@ +>seq1 +TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA +>seq2 +AAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTTA +TAGGGCGCGATCGACTTC +>seq3 +ATCGGTACACAATCGAGCATGTAGGCTACACGGTA +TAAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTT + diff --git a/doc/CHANGELOG b/doc/CHANGELOG index 611855447..4fc571f71 100644 --- a/doc/CHANGELOG +++ b/doc/CHANGELOG @@ -1,4 +1,11 @@ -2013-04-18 The Vidjil Team + +2013-07-03 The Vidjil Team + * New selection of representative read (core/read_chooser.cpp) + * Faster spaced seed computation (core/tools.cpp) + * New unit tests + * Bugs closed + +2013-04-18 The Vidjil Team * First public release diff --git a/doc/README b/doc/README index 9c9e49843..d79dbdb2e 100644 --- a/doc/README +++ b/doc/README @@ -45,7 +45,7 @@ make data # Immunol, 184(12), 6986–92. make germline - # get IMGT germline databases -- you have to agree to IMGT license: + # get IMGT germline databases (IMGT/GENE-DB) -- you have to agree to IMGT license: # academic research only, provided that it is referred to IMGT®, # and cited as "IMGT®, the international ImMunoGeneTics information system® # http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France). diff --git a/germline/get-germline b/germline/get-germline index f55a73d74..bb5fce43c 100644 --- a/germline/get-germline +++ b/germline/get-germline @@ -1,5 +1,5 @@ #!/bin/sh cd $(dirname $0) -wget -O - http://www.imgt.org/download/GENE-DB/GENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py +wget -O - http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-nt-WithoutGaps-F+ORF+inframeP | python split-from-imgt.py diff --git a/germline/split-from-imgt.py b/germline/split-from-imgt.py index 763b3adec..5c85c0ac8 100644 --- a/germline/split-from-imgt.py +++ b/germline/split-from-imgt.py @@ -2,7 +2,7 @@ import sys -# Parse lines such as: +# Parse lines in IMGT/GENE-DB such as: # >M12949|TRGV1*01|Homo sapiens|ORF|... open_files = {} diff --git a/src/Makefile b/src/Makefile index 83b326566..89836c2bb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ CC=g++ -OPTIM=-O2 -DNO_SPACED_SEEDS +OPTIM=-O2 CFLAGS=-W -Wall $(OPTIM) $(DEBUG) LDFLAGS= EXEC=vidjil kmer kmer_count cut count detailed_affect @@ -12,6 +12,18 @@ BINDIR=.. v: vidjil + +spaced: cleanspaced + make + +nospaced: cleanspaced + make OPTIM="-O2 -DNO_SPACED_SEEDS" + +cleanspaced: + rm vidjil.o core/tools.o + + + all: $(EXEC) debug: diff --git a/src/core/fasta.cpp b/src/core/fasta.cpp index ff9ac5300..ce86495c3 100644 --- a/src/core/fasta.cpp +++ b/src/core/fasta.cpp @@ -58,6 +58,15 @@ Fasta::Fasta(const string &input, } int Fasta::size() const{ return (int)reads.size(); } +list Fasta::getAll() const { + list reads; + + for (int i=0; i < size(); i++) { + reads.push_back(read(i)); + } + + return reads; +} const string& Fasta::label(int index) const{ return reads[index].label; } const string& Fasta::label_full(int index) const{ return reads[index].label_full; } const Sequence& Fasta::read(int index) const {return reads[index];} diff --git a/src/core/fasta.h b/src/core/fasta.h index 6aafaff63..afc4671c0 100644 --- a/src/core/fasta.h +++ b/src/core/fasta.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "tools.h" using namespace std; @@ -42,6 +43,11 @@ public: ostream &out=cout); int size() const; + /** + * Get all the sequences from the FASTA file + * @return a list of sequences in the same order as in the input file + */ + list getAll() const; const string& label(int index) const; const string& label_full(int index) const; const Sequence &read(int index) const; diff --git a/src/core/read_chooser.cpp b/src/core/read_chooser.cpp index 9dd35d5b4..99dcdbb2b 100644 --- a/src/core/read_chooser.cpp +++ b/src/core/read_chooser.cpp @@ -5,29 +5,22 @@ using namespace std; ReadChooser::ReadChooser(list &r, VirtualReadScore &scorer) { - float best_score = -1; - float current_score; - for (list ::const_iterator it = r.begin(); it != r.end(); ++it) { - current_score = scorer.getScore(it->sequence); - if (current_score > best_score) { - best_score = current_score; - best_sequence = *it; - } + scores[it->sequence] = scorer.getScore(it->sequence); } - // vector test(r.begin(), r.end()); - // sort(test.begin(), test.end(), *this); - // reads = list(test.begin(), test.end()); + + reads.assign(r.begin(), r.end()); + sort(reads.begin(), reads.end(), *this); } Sequence ReadChooser::getBest() const{ - return best_sequence; + return reads[0]; } -// list ReadChooser::getSorted() const { -// return reads; -// } +Sequence ReadChooser::getithBest(size_t i) const { + return reads[i-1]; +} -// bool ReadChooser::operator()(Sequence first, Sequence second) { -// return scorer.getScore(first.sequence) > scorer.getScore(second.sequence); -// } +bool ReadChooser::operator()(Sequence first, Sequence second) { + return scores[first.sequence] > scores[second.sequence]; +} diff --git a/src/core/read_chooser.h b/src/core/read_chooser.h index aef43c876..b7f301b15 100644 --- a/src/core/read_chooser.h +++ b/src/core/read_chooser.h @@ -3,7 +3,7 @@ #include #include "fasta.h" #include "read_score.h" - +#include /** * This class aims at choosing the best read among a group of read. @@ -16,17 +16,30 @@ class ReadChooser { private: - Sequence best_sequence; + vector reads; + map scores; public: ReadChooser(list &r, VirtualReadScore &scorer); - /** * @return the best sequence among the list of sequences that have been * given to the object */ Sequence getBest() const; + + /** + * @pre i >= 1 && i <= total number of sequences on the input + * @param i: starts at 1 + * @return the i-th best scored sequence + */ + Sequence getithBest(size_t i) const; + + /** + * A comparison based on scorer of the two sequences. + */ + bool operator()(Sequence first, Sequence second); }; + #endif diff --git a/src/core/read_score.cpp b/src/core/read_score.cpp index 0a17edcb3..444d10dc6 100644 --- a/src/core/read_score.cpp +++ b/src/core/read_score.cpp @@ -53,3 +53,14 @@ void KmerAffectReadScore::setUnambiguousScore(float score) { void KmerAffectReadScore::setUnknownScore(float score) { unknown_score = score; } + + +//////////////////////////////////////////////////////////////////////////////// +////////////////////////////// ReadLengthScore /////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +ReadLengthScore::ReadLengthScore(){} + +float ReadLengthScore::getScore(const string &sequence) const { + return sequence.size(); +} diff --git a/src/core/read_score.h b/src/core/read_score.h index c41d56710..71123285e 100644 --- a/src/core/read_score.h +++ b/src/core/read_score.h @@ -54,4 +54,18 @@ public: void setUnambiguousScore(float score) ; void setUnknownScore(float score) ; }; + +/** + * A simple implementation of VirtualReadScore. + * The score is the length of the read + */ +class ReadLengthScore: public VirtualReadScore { + public: + ReadLengthScore(); + + /** + * @return the sequence length + */ + float getScore(const string &sequence) const; +}; #endif diff --git a/src/core/representative.cpp b/src/core/representative.cpp new file mode 100644 index 000000000..d2dc9e667 --- /dev/null +++ b/src/core/representative.cpp @@ -0,0 +1,108 @@ +#include "representative.h" +#include "kmerstore.h" +#include "read_score.h" +#include "read_chooser.h" + +#include +using namespace std; + +RepresentativeComputer::RepresentativeComputer(list &r) + :sequences(r),is_computed(false),representative() { +} + +Sequence RepresentativeComputer::getRepresentative() const{ + assert(hasRepresentative()); + return representative; +} + +list& RepresentativeComputer::getSequenceList() const{ + return sequences; +} + +bool RepresentativeComputer::hasRepresentative() const{ + return is_computed; +} + + +int KmerRepresentativeComputer::getK() const{ + return k; +} + +void KmerRepresentativeComputer::setK(int k) { + this->k = k; +} + +int KmerRepresentativeComputer::getStabilityLimit() const { + return stability_limit; +} + +void KmerRepresentativeComputer::setStabilityLimit(int limit) { + stability_limit = limit; +} + +KmerRepresentativeComputer::KmerRepresentativeComputer(list &r, + int k) + :RepresentativeComputer(r),k(k),stability_limit(DEFAULT_STABILITY_LIMIT){} + +void KmerRepresentativeComputer::compute(bool do_revcomp, size_t min_cover, + float percent_cover) { + is_computed = false; + + // First create an index on the set of reads + IKmerStore *index = KmerStoreFactory::createIndex(getK(), do_revcomp); + + // Add sequences to the index + for (list::iterator it=sequences.begin(); it != sequences.end(); ++it) { + index->insert(it->sequence, it->label); + } + + size_t max = sequences.size(); + // Create a read chooser to have the sequences sorted by length + ReadLengthScore *rlc = new ReadLengthScore(); + ReadChooser rc(sequences, *rlc); + delete rlc; + + // Traverse the sequences to get the desired representative + size_t pos_longest_run = 0; + size_t length_longest_run = 0; + size_t seq_index_longest_run = 1; + Sequence sequence_longest_run; + + for (size_t seq = 1; seq <= sequences.size() && seq <= seq_index_longest_run + stability_limit ; seq++) { + Sequence sequence = rc.getithBest(seq); + if (sequence.sequence.size() <= length_longest_run) { + break; + } + vector counts = index->getResults(sequence.sequence); + + for (size_t i =0; i < counts.size(); i++) { + size_t length_run = 0; + // Search the longest "run" of consecutive k-mers that are sufficiently + // expressed in the read collection. + while (i < counts.size() + && counts[i].count >= min_cover + && counts[i].count >= max*percent_cover) { + length_run++; + i++; + } + if (length_run) + // Take into account the whole k-mer, not just the starting positions + length_run += getK() - 1; + if (length_run > length_longest_run) { + length_longest_run = length_run; + pos_longest_run = i - (length_run - getK() - 1); + sequence_longest_run = sequence; + seq_index_longest_run = seq; + } + } + } + + if (length_longest_run) { + is_computed = true; + representative = sequence_longest_run; + representative.sequence = representative.sequence.substr(pos_longest_run, length_longest_run); + representative.label += "-[" + string_of_int(pos_longest_run) + "," + + string_of_int(pos_longest_run + length_longest_run - 1) + "]"; + } + delete index; +} diff --git a/src/core/representative.h b/src/core/representative.h new file mode 100644 index 000000000..92950d7bc --- /dev/null +++ b/src/core/representative.h @@ -0,0 +1,90 @@ +#ifndef REPRESENTATIVE_H +#define REPRESENTATIVE_H +#include +#include +#include +#include "fasta.h" + +using namespace std; + +#define DEFAULT_STABILITY_LIMIT 30 + +/** + * Compute a representative sequence from a list of sequences. + * The sequences are supposed to share a common juction. + */ +class RepresentativeComputer { +protected: + list &sequences; + bool is_computed; + Sequence representative; +public: + RepresentativeComputer(list &r); + + /** + * @pre hasRepresentative() + * @return the representative sequence of the set of sequences. + * The representative meets the criteria given to compute(). + * The label of the sequence is composed of the read labels used for that + * purpose, plus the positions that have been extracted. + */ + Sequence getRepresentative() const; + + /** + * @return the input sequences we are working on + */ + list& getSequenceList() const; + + /** + * @return true iff compute() has been called and the criteria have been met. + */ + bool hasRepresentative() const; + + /** + * Compute the representative depending on the specified parameters. + * @param do_revcomp: true iff sequences may be coming from any strand, and + * therefore should be revcomp-ed + * @param min_cover: minimal number of reads supporting each position of the + * representative + * @param percent_cover: minimal percent of the maximal coverage that is + * admissible for covering the representative. + * Any position is covered by at least percent_cover % + * of the maximal coverage. + */ + virtual void compute(bool do_revcomp, size_t min_cover, float percent_cover) = 0; +}; + +/** + * The representative is computed from the list of sequences. Those sequences + * must all share a common factor whose length is greater or equal to k. + */ +class KmerRepresentativeComputer : public RepresentativeComputer { +protected: + int k; + int stability_limit; +public: + KmerRepresentativeComputer(list &r, int k); + + // Getters, setters + int getK() const; + + /** + * Sets the length of the k-mer used for computing the representative + */ + void setK(int k); + + int getStabilityLimit() const; + + /** + * @param limit: maximal number of iterations to be performed before reaching + * stability. If after limit number of iterations, the length + * of the representative didn't improve, we keep it. + */ + void setStabilityLimit(int limit); + + // Actions + void compute(bool do_revcomp, size_t min_cover, float percent_cover); + +}; + +#endif diff --git a/src/core/segment.cpp b/src/core/segment.cpp index 8f32b4488..408724e5c 100644 --- a/src/core/segment.cpp +++ b/src/core/segment.cpp @@ -519,7 +519,7 @@ FineSegmenter::FineSegmenter(Sequence seq, Fasta &rep_V, Fasta &rep_J, //overlap VJ if(right-left <=0){ int b_r, b_l; - int overlap=left-left2+1; + int overlap=left-right+1; string seq_left = sequence.substr(0, left+1); string seq_right = sequence.substr(right); diff --git a/src/core/tools.cpp b/src/core/tools.cpp index 85b3eed1e..021b462cc 100644 --- a/src/core/tools.cpp +++ b/src/core/tools.cpp @@ -16,22 +16,32 @@ int seed_weight(const string &seed) return count(seed.begin(), seed.end(), SEED_YES); } +char spaced_buf[MAX_SEED_SIZE+1]; + string spaced(const string &input, const string &seed) { +// #ifdef STATIC_SPACED_SEED_FOURTEEN +// return input.substr(0, 7) + input.substr(8, 7); +// #endif + #ifdef NO_SPACED_SEEDS return input ; #endif - string output = ""; - + int j = 0 ; + // cout << input << endl << seed << endl ; assert(input.length() == seed.length()); for (size_t i = 0; i < input.length(); i++) if (seed[i] == SEED_YES) - output += input[i] ; + spaced_buf[j++] = input[i] ; - return output ; + spaced_buf[j] = (char) 0; + + // cout << spaced_buf << "|" << string(spaced_buf) << "|" << input << "|" << endl ; + + return string(spaced_buf); } diff --git a/src/core/tools.h b/src/core/tools.h index ab4aa53cb..67ae035b3 100644 --- a/src/core/tools.h +++ b/src/core/tools.h @@ -1,6 +1,7 @@ #ifndef TOOLS_H #define TOOLS_H +#define MAX_SEED_SIZE 50 // Spaced seed buffer #define FIRST_POS 0 // Numbering of the base pairs #include diff --git a/src/tests/bugs/bug20130617.fa b/src/tests/bugs/bug20130617.fa new file mode 100644 index 000000000..5b99a051b --- /dev/null +++ b/src/tests/bugs/bug20130617.fa @@ -0,0 +1,8 @@ +>seq1 TRG +ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC +>seq2 +ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC +>seq3 +ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC +>seq4 +ACCAGGCGAAGTTACTATGAGCTTAGTCCCTTCAGCAAATATCTTGAACCAACCAGTGGTATCCCACGCAGCACAGTAGTAAACGGCCATGTCTTCTTTCTCTACGGACTTGATGGTAAGGATTGAAGTGAGAGTTTGAGAATTCTTTCTTGCCTCCACTTTGTTGCTTGTCTTACCCATGC \ No newline at end of file diff --git a/src/tests/bugs/bug20130617.should_get b/src/tests/bugs/bug20130617.should_get new file mode 100644 index 000000000..1833f55d3 --- /dev/null +++ b/src/tests/bugs/bug20130617.should_get @@ -0,0 +1,10 @@ +!LAUNCH: ../../../vidjil -c clones -G ../../../germline/TRG -r 1 -R 1 bug20130617.fa + + +$ Bug with J deletion being 122 +# According to IMGT : +# V10*01 ends at gtttactactgtgctgcgtgg +# JP1*01 starts at ataccactggttg +# N1 is G +1:Clone #001 – 4 reads – 100% + diff --git a/src/tests/clones_S22.should_get b/src/tests/clones_S22.should_get index 77b708ca6..baba647c7 100644 --- a/src/tests/clones_S22.should_get +++ b/src/tests/clones_S22.should_get @@ -1,10 +1,10 @@ !LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -r 3 -R 5 -d ../../data/Stanford_S22.fasta -$ Junction extractions -1:found 3867 junctions in 8219 segments +$ Window extractions +1:found 3867 windows in 8219 segments -$ Junction threshold -1:keep 51 junctions in 2386 reads +$ Window threshold +1:keep 51 windows in 2386 reads $ Clones output 1:Clone #001 – 630 reads diff --git a/src/tests/clones_simul.should_get b/src/tests/clones_simul.should_get index b9f0e7ddf..25bf6a4f0 100644 --- a/src/tests/clones_simul.should_get +++ b/src/tests/clones_simul.should_get @@ -1,7 +1,7 @@ !LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 1 -d ../../data/clones_simul.fa $ Junction extractions -1:found 20 junctions in 47 segments +1:found 20 50-windows in 47 segments $ No clustering 1:==> 20 clones diff --git a/src/tests/clones_simul_cluster.should_get b/src/tests/clones_simul_cluster.should_get index 93a330e2e..55c510562 100644 --- a/src/tests/clones_simul_cluster.should_get +++ b/src/tests/clones_simul_cluster.should_get @@ -1,7 +1,7 @@ !LAUNCH: ../../vidjil -c clones -G ../../germline/IGH -x -r 1 -R 5 -n 5 -d ../../data/clones_simul.fa -$ Junction extractions -1:found 20 junctions in 47 segments +$ Window extractions +1:found 20 50-windows in 47 segments $ Some clustering 1:==> 2 clones diff --git a/src/tests/segment_simul.should_get b/src/tests/segment_simul.should_get index a0367c617..467fd6248 100644 --- a/src/tests/segment_simul.should_get +++ b/src/tests/segment_simul.should_get @@ -1,7 +1,7 @@ !LAUNCH: ../../vidjil -G ../../germline/IGH -d -c segment ../../data/segment_simul.fa | grep '^>' -$ First sequence, easy segmentation (no error, few deletions at the junctions, small N) +$ First sequence, easy segmentation (no error, few deletions at the windows, small N) # 72 77 89 94 # | | | | # seq : 61 attactgtgcgaCTTCataaccggaaccaGTTAtggttcgactcctggggccaaggaaccctggtcaccgtctcctcag diff --git a/src/tests/should-to-tap.sh b/src/tests/should-to-tap.sh index 420fa7614..2d378b44b 100755 --- a/src/tests/should-to-tap.sh +++ b/src/tests/should-to-tap.sh @@ -48,7 +48,7 @@ fi debug() { if [ ! -z "$DEBUG" ]; then - echo $* >&2 + echo "$*" >&2 fi } @@ -131,8 +131,8 @@ while read line; do skip=0 know_to_fail=0 - pattern=$(cut -d: -f2- <<< $line) - nb_hits=$(cut -d: -f1 <<< $line) + pattern=$(cut -d: -f2- <<< "$line") + nb_hits=$(cut -d: -f1 <<< "$line") if [ ${nb_hits:0:1} == "s" ]; then skip=1 # We skip the test if it fails @@ -141,7 +141,7 @@ while read line; do know_to_fail=1 # We know the test fails, but don't fail globally nb_hits=${nb_hits:1} fi - debug "Grepping \"$pattern\" in $FILE_TO_GREP" + debug "Grepping $pattern in $FILE_TO_GREP" if [ $(grep -cE "$pattern" $FILE_TO_GREP) -eq $nb_hits -o $skip -eq 1 ]; then if [ $know_to_fail -eq 1 ]; then echo "Warning: test $test_nb should have failed, but has not!" >&2 diff --git a/src/tests/stanford.should_get b/src/tests/stanford.should_get index 0a389048d..09a45a09b 100644 --- a/src/tests/stanford.should_get +++ b/src/tests/stanford.should_get @@ -2,7 +2,7 @@ !LOG: stanford.log $ Parses IGHV.fa germline -1: 94301 bp in 324 sequences +1: 99341 bp in 341 sequences $ Parses IGHD.fa germline 1: 1070 bp in 44 sequences @@ -10,6 +10,6 @@ $ Parses IGHD.fa germline $ Parses germline/IGHJ.fa 1: 701 bp in 13 sequences -$ Find the good number of junctions in Stanford S22 -1: found 10087 junctions in 12317 segments +$ Find the good number of windows in Stanford S22 +1: found 10087 50-windows in 12317 segments diff --git a/src/tests/testChooser.cpp b/src/tests/testChooser.cpp new file mode 100644 index 000000000..cd2798731 --- /dev/null +++ b/src/tests/testChooser.cpp @@ -0,0 +1,26 @@ +#include "core/read_chooser.h" +#include "core/read_score.h" +#include "core/fasta.h" + +void testChooser() { + list reads = Fasta("../../data/test1.fa").getAll(); + + ReadLengthScore rls; + ReadChooser rc(reads, rls); + + TAP_TEST(rc.getithBest(1).label == "seq4", TEST_READ_CHOOSER_SORTED, + "First sequence is " << rc.getithBest(1).label); + TAP_TEST(rc.getithBest(1).label == rc.getBest().label + && rc.getithBest(1).sequence == rc.getBest().sequence, + TEST_READ_CHOOSER_BEST,""); + + TAP_TEST(rc.getithBest(2).label == "seq2", TEST_READ_CHOOSER_SORTED, + "Second sequence is " << rc.getithBest(1).label); + + TAP_TEST(rc.getithBest(3).label == "seq1", TEST_READ_CHOOSER_SORTED, + "Third sequence is " << rc.getithBest(1).label); + + TAP_TEST(rc.getithBest(4).label == "", TEST_READ_CHOOSER_SORTED, + "First sequence is " << rc.getithBest(1).label); + +} diff --git a/src/tests/testRepresentative.cpp b/src/tests/testRepresentative.cpp new file mode 100644 index 000000000..6875c1159 --- /dev/null +++ b/src/tests/testRepresentative.cpp @@ -0,0 +1,31 @@ +#include "core/fasta.h" +#include "core/representative.h" + +void testRepresentative() { + list reads = Fasta("../../data/representative.fa").getAll(); + + KmerRepresentativeComputer krc(reads, 14); + + krc.setStabilityLimit(0); + krc.compute(false, 1, 0.5); + Sequence representative = krc.getRepresentative(); + + TAP_TEST(representative.label.find("seq3") == 0, TEST_KMER_REPRESENTATIVE, + "If we take the first representative we should have seq3 (" << representative.label << " instead)"); + + krc.setStabilityLimit(1); + krc.compute(false, 1, 0.5); + representative = krc.getRepresentative(); + TAP_TEST(representative.label.find("seq3") == 0, TEST_KMER_REPRESENTATIVE, + "When allowing one step before stability, we should still have seq3 (" << representative.label << " instead)"); + + krc.setStabilityLimit(2); + krc.compute(false, 1, 0.5); + representative = krc.getRepresentative(); + TAP_TEST(representative.label.find("seq1") == 0, TEST_KMER_REPRESENTATIVE, + "When allowing two steps before stability, we should reach seq1 (" << representative.label << " instead)"); + + krc.compute(false, 4, 0.5); + TAP_TEST(! krc.hasRepresentative(), TEST_KMER_REPRESENTATIVE, + "When requiring 4 reads to support the representative, we should not find any solution."); +} diff --git a/src/tests/testScore.cpp b/src/tests/testScore.cpp new file mode 100644 index 000000000..f239886df --- /dev/null +++ b/src/tests/testScore.cpp @@ -0,0 +1,16 @@ +#include "core/read_score.h" + +void testScore() { + // ReadLengthScore testing + ReadLengthScore rls; + + TAP_TEST(rls.getScore("") == 0., TEST_LENGTH_SCORE, + "score should be 0, is " << rls.getScore("")); + + TAP_TEST(rls.getScore("ATCGTTTACGTC") == 12., TEST_LENGTH_SCORE, + "score should be 12, is " << rls.getScore("ATCGTTTACGTC")); + + TAP_TEST(rls.getScore("A") == 1., TEST_LENGTH_SCORE, + "score should be 1, is " << rls.getScore("A")); + +} diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 5b48d456c..cbff81c65 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -7,7 +7,9 @@ #include "testBugs.cpp" #include "testCluster.cpp" #include "testSegment.cpp" - +#include "testScore.cpp" +#include "testChooser.cpp" +#include "testRepresentative.cpp" int main(void) { TAP_START(NB_TESTS); @@ -19,6 +21,9 @@ int main(void) { testBugs(); testCluster(); testSegment(); + testScore(); + testChooser(); + testRepresentative(); TAP_END_TEST_EXIT -} \ No newline at end of file +} diff --git a/src/tests/tests.h b/src/tests/tests.h index d9f7de50d..baeae7986 100644 --- a/src/tests/tests.h +++ b/src/tests/tests.h @@ -26,6 +26,16 @@ enum { /* Cluster */ TEST_CLUSTER, + /* Score */ + TEST_LENGTH_SCORE, + + /* Chooser */ + TEST_READ_CHOOSER_BEST, + TEST_READ_CHOOSER_SORTED, + + /* Representative */ + TEST_KMER_REPRESENTATIVE, + /* Bugs */ TEST_BUG_SEGMENTATION, TEST_SEGMENT_POSITION, @@ -56,6 +66,12 @@ inline void declare_tests() { RECORD_TAP_TEST(TEST_CLUSTER, "Test automatic clusterisation"); + RECORD_TAP_TEST(TEST_LENGTH_SCORE, "Test ReadLengthScore getScore()"); + + RECORD_TAP_TEST(TEST_READ_CHOOSER_BEST, "Test getBest() in ReadChooser"); + RECORD_TAP_TEST(TEST_READ_CHOOSER_SORTED, "Test getSorted() in ReadChooser"); + + RECORD_TAP_TEST(TEST_KMER_REPRESENTATIVE, "Test KmerRepresentativeComputer computations"); RECORD_TAP_TEST(TEST_BUG_SEGMENTATION, "Test segmentation bug"); RECORD_TAP_TEST(TEST_SEGMENT_POSITION, "Test V,D,J position"); } diff --git a/src/vidjil.cpp b/src/vidjil.cpp index a15fc5dd8..69bb25652 100644 --- a/src/vidjil.cpp +++ b/src/vidjil.cpp @@ -44,6 +44,7 @@ #include "core/html.h" #include "core/mkdir.h" #include "core/labels.h" +#include "core/representative.h" #include "vidjil.h" @@ -58,20 +59,20 @@ // #define DEFAULT_READS "./data/Stanford_S22.fa" #define DEFAULT_READS "../seq/chr_pgm_50k.cut.fa" -#define MIN_READS_JUNCTION 10 +#define MIN_READS_WINDOW 10 #define MIN_READS_CLONE 100 #define RATIO_READS_CLONE 0.1 -#define COMMAND_JUNCTIONS "junctions" +#define COMMAND_WINDOWS "windows" #define COMMAND_ANALYSIS "clones" #define COMMAND_SEGMENT "segment" -enum { CMD_JUNCTIONS, CMD_ANALYSIS, CMD_SEGMENT } ; +enum { CMD_WINDOWS, CMD_ANALYSIS, CMD_SEGMENT } ; #define OUT_DIR "./out/" #define HTML_FILENAME "clones.html" #define CLONE_FILENAME "clone.fa-" -#define JUNCTIONS_FILENAME "junctions.fa" +#define WINDOWS_FILENAME "windows.fa" #define SEQUENCES_FILENAME "sequences.fa" #define SEGMENTED_FILENAME "segmented.vdj.fa" #define EDGES_FILENAME "edges" @@ -92,6 +93,9 @@ enum { CMD_JUNCTIONS, CMD_ANALYSIS, CMD_SEGMENT } ; #define DEFAULT_DELTA_MIN_D 0 #define DEFAULT_DELTA_MAX_D 50 +#define DEFAULT_RATIO_REPRESENTATIVE 0.5 +#define DEFAULT_MIN_COVER_REPRESENTATIVE 5 + #define DEFAULT_EPSILON 0 #define DEFAULT_MINPTS 10 @@ -114,7 +118,7 @@ void usage(char *progname) cerr << "Usage: " << progname << " [options] " << endl << endl; cerr << "Command selection" << endl - << " -c \t" << COMMAND_JUNCTIONS << "\t junction extracting (default)" << endl + << " -c \t" << COMMAND_WINDOWS << "\t window extracting (default)" << endl << " \t\t" << COMMAND_ANALYSIS << " \t clone analysis" << endl << " \t\t" << COMMAND_SEGMENT << " \t V(D)J segmentation" << endl << endl @@ -126,20 +130,20 @@ void usage(char *progname) << " -G prefix for V (D) and J repertoires (shortcut for -V V.fa -D D.fa -J J.fa)" << endl << endl - << "Junction prediction" << endl + << "Window prediction" << endl #ifndef NO_SPACED_SEEDS << " -s spaced seed used for the V/J affectation (default: " << DEFAULT_SEED << ")" << endl #endif << " -k k-mer size used for the V/J affectation (default: " << DEFAULT_K << ")" << endl - << " -w w-mer size used for the length of the extracted junction (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl + << " -w w-mer size used for the length of the extracted window (default: " << DEFAULT_W << ")(default with -d: " << DEFAULT_W_D << ")" << endl << endl - << "Junction annotations" << endl - << " -l labels for some junctions -- these junctions will be kept even if some limits are not reached" << endl + << "Window annotations" << endl + << " -l labels for some windows -- these windows will be kept even if some limits are not reached" << endl << endl - << "Limit to keep a junction" << endl - << " -r minimal number of reads containing a junction (default: " << MIN_READS_JUNCTION << ")" << endl + << "Limit to keep a window" << endl + << " -r minimal number of reads containing a window (default: " << MIN_READS_WINDOW << ")" << endl << endl << "Clusterisation" << endl @@ -212,13 +216,16 @@ int main (int argc, char **argv) int segment_D = 0; int verbose = 0 ; - int command = CMD_JUNCTIONS; + int command = CMD_WINDOWS; - int min_reads_junction = MIN_READS_JUNCTION ; + int min_reads_window = MIN_READS_WINDOW ; int min_reads_clone = MIN_READS_CLONE ; float ratio_reads_clone = RATIO_READS_CLONE; // int average_deletion = 4; // Average number of deletion in V or J + size_t min_cover_representative = DEFAULT_MIN_COVER_REPRESENTATIVE; + float ratio_representative = DEFAULT_RATIO_REPRESENTATIVE; + // Admissible delta between left and right segmentation points int delta_min = DEFAULT_DELTA_MIN ; // Kmer+Fine int delta_max = DEFAULT_DELTA_MAX ; // Fine @@ -229,7 +236,7 @@ int main (int argc, char **argv) string forced_edges = "" ; - string junctions_labels_file = "" ; + string windows_labels_file = "" ; char c ; @@ -252,7 +259,7 @@ int main (int argc, char **argv) forced_edges = optarg; break; case 'l': - junctions_labels_file = optarg; + windows_labels_file = optarg; break; case 'x': detailed_cluster_analysis = false; @@ -262,8 +269,8 @@ int main (int argc, char **argv) command = CMD_ANALYSIS; else if (!strcmp(COMMAND_SEGMENT,optarg)) command = CMD_SEGMENT; - else if (!strcmp(COMMAND_JUNCTIONS,optarg)) - command = CMD_JUNCTIONS; + else if (!strcmp(COMMAND_WINDOWS,optarg)) + command = CMD_WINDOWS; else { cerr << "Unknwown command " << optarg << endl; usage(argv[0]); @@ -323,7 +330,7 @@ int main (int argc, char **argv) break; case 'r': - min_reads_junction = atoi(optarg); + min_reads_window = atoi(optarg); break; case '%': @@ -339,7 +346,7 @@ int main (int argc, char **argv) seed = string(optarg); k = seed_weight(seed); #else - cerr << "The option -s is not available" << endl; + cerr << "To enable the option -s, please compile without NO_SPACED_SEEDS" << endl; #endif break; @@ -387,6 +394,17 @@ int main (int argc, char **argv) exit(1); } + + +#ifndef NO_SPACED_SEEDS + // Check seed buffer + if (seed.size() >= MAX_SEED_SIZE) + { + cout << "Seed size is too large (MAX_SEED_SIZE). Aborting." << endl ; + exit(1); + } +#endif + // Check that out_dir is an existing directory or creates it const char *out_cstr = out_dir.c_str(); @@ -398,7 +416,7 @@ int main (int argc, char **argv) out_dir += "/" ; /// Load labels ; - map junctions_labels = load_map(junctions_labels_file); + map windows_labels = load_map(windows_labels_file); /// HTML output string f_html = out_dir + prefix_filename + HTML_FILENAME ; @@ -430,7 +448,7 @@ int main (int argc, char **argv) teestream out(cout, html); switch(command) { - case CMD_JUNCTIONS: cout << "Extracting junctions" << endl; + case CMD_WINDOWS: cout << "Extracting windows" << endl; break; case CMD_ANALYSIS: cout << "Analysing clones" << endl; break; @@ -467,7 +485,7 @@ int main (int argc, char **argv) out << "# git: " ; out.flush(); if (system("git log -1 --pretty=format:'%h (%ci)' --abbrev-commit") == -1) { - perror("Cannot launch git"); + out << ""; } out << endl ; #endif @@ -495,11 +513,10 @@ int main (int argc, char **argv) out_dir += "/"; - //////////////////////////////////////// // CLONE ANALYSIS // //////////////////////////////////////// - if (command == CMD_ANALYSIS || command == CMD_JUNCTIONS) { + if (command == CMD_ANALYSIS || command == CMD_WINDOWS) { ////////////////////////////////// out << "# seed = " << seed << endl ; @@ -525,13 +542,13 @@ int main (int argc, char **argv) out << " ==> " << f_segmented << endl ; ofstream out_segmented(f_segmented.c_str()) ; - out << "Loop through reads, looking for junctions" ; + out << "Loop through reads, looking for windows" ; - MapKmerStore *junctions = new MapKmerStore(w); - map > seqs_by_junction ; + MapKmerStore *windows = new MapKmerStore(w); + map > seqs_by_window ; - int too_short_for_the_junction = 0 ; + int too_short_for_the_window = 0 ; int nb_segmented = 0 ; int ok = 0 ; size_t nb_total_reads = 0; @@ -565,11 +582,11 @@ int main (int argc, char **argv) if (junc.size()) { - junctions->insert(junc, "bloup"); - seqs_by_junction[junc].push_back(reads->getSequence()); + windows->insert(junc, "bloup"); + seqs_by_window[junc].push_back(reads->getSequence()); } else - too_short_for_the_junction++ ; + too_short_for_the_window++ ; ////////////////////////////////// // Output segmented @@ -581,53 +598,54 @@ int main (int argc, char **argv) } out << endl; - out << " ==> found " << seqs_by_junction.size() << " junctions" - << " in " << nb_segmented << " segments" - << " (" << setprecision(3) << 100 * (float) nb_segmented / nb_total_reads << "%) " - << " (" << too_short_for_the_junction << " too short, " - << setprecision(3) << 100 * (float) (nb_segmented - too_short_for_the_junction) / nb_total_reads << "% remaining) " << endl + out << " ==> segmented " << nb_segmented << " reads" + << " (" << setprecision(3) << 100 * (float) nb_segmented / nb_total_reads << "%)" + << endl ; + + out << " ==> found " << seqs_by_window.size() << " " << w << "-windows" + << " in " << (nb_segmented - too_short_for_the_window) << " segments" + << " (" << setprecision(3) << 100 * (float) (nb_segmented - too_short_for_the_window) / nb_total_reads << "%)" << " inside " << nb_total_reads << " sequences" << endl ; - for (int i=0; i " << stats_segmented[i] << endl ; - /// if (command == CMD_JUNCTIONS) /// on le fait meme si CMD_ANALYSIS + /// if (command == CMD_WINDOWS) /// on le fait meme si CMD_ANALYSIS { ////////////////////////////////// - // Sort junctions + // Sort windows - out << "Sort junctions by number of occurrences" << endl; - list > sort_all_junctions; + out << "Sort windows by number of occurrences" << endl; + list > sort_all_windows; - for (map >::const_iterator it = seqs_by_junction.begin(); - it != seqs_by_junction.end(); ++it) + for (map >::const_iterator it = seqs_by_window.begin(); + it != seqs_by_window.end(); ++it) { - sort_all_junctions.push_back(make_pair(it->first, it->second.size())); + sort_all_windows.push_back(make_pair(it->first, it->second.size())); } - sort_all_junctions.sort(pair_occurrence_sort); + sort_all_windows.sort(pair_occurrence_sort); ////////////////////////////////// - // Output junctions + // Output windows ////////////////////////////////// - string f_all_junctions = out_dir + prefix_filename + JUNCTIONS_FILENAME; - out << " ==> " << f_all_junctions << endl ; + string f_all_windows = out_dir + prefix_filename + WINDOWS_FILENAME; + out << " ==> " << f_all_windows << endl ; - ofstream out_all_junctions(f_all_junctions.c_str()); + ofstream out_all_windows(f_all_windows.c_str()); int num_seq = 0 ; - for (list >::const_iterator it = sort_all_junctions.begin(); - it != sort_all_junctions.end(); ++it) + for (list >::const_iterator it = sort_all_windows.begin(); + it != sort_all_windows.end(); ++it) { num_seq++ ; - out_all_junctions << ">" << it->second << "--junction--" << num_seq << " " << junctions_labels[it->first] << endl ; - out_all_junctions << it->first << endl; + out_all_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ; + out_all_windows << it->first << endl; } } @@ -636,37 +654,38 @@ int main (int argc, char **argv) if (command == CMD_ANALYSIS) { ////////////////////////////////// - out << "Considering only junctions with >= " << min_reads_junction << " reads and labeled junctions" << endl; + out << "Considering only windows with >= " << min_reads_window << " reads and labeled windows" << endl; int removes = 0 ; int nb_reads = 0 ; - for (map >::iterator it = seqs_by_junction.begin(); it != seqs_by_junction.end(); ++it) + for (map >::iterator it = seqs_by_window.begin(); it != seqs_by_window.end(); ++it) { junction junc = it->first; - if (!(seqs_by_junction[junc].size() >= (size_t) min_reads_junction) && !(junctions_labels[junc].size())) + if (!(seqs_by_window[junc].size() >= (size_t) min_reads_window) && !(windows_labels.find(junc) == windows_labels.end())) + { removes++ ; - junctions->store.erase(junc); + windows->store.erase(junc); } else - nb_reads += seqs_by_junction[junc].size() ; + nb_reads += seqs_by_window[junc].size() ; } - out << " ==> keep " << seqs_by_junction.size() - removes << " junctions in " << nb_reads << " reads" ; + out << " ==> keep " << seqs_by_window.size() - removes << " windows in " << nb_reads << " reads" ; out << " (" << setprecision(3) << 100 * (float) nb_reads / nb_total_reads << "%) " << endl ; ////////////////////////////////// // Clustering - list > clones_junctions; - comp_matrix comp=comp_matrix(junctions); + list > clones_windows; + comp_matrix comp=comp_matrix(windows); if (epsilon || forced_edges.size()) { - out << "Cluster similar junctions" << endl ; + out << "Cluster similar windows" << endl ; if (load_comp==1) { @@ -682,19 +701,19 @@ int main (int argc, char **argv) comp.save(( out_dir+prefix_filename + comp_filename).c_str()); } - clones_junctions = comp.cluster(forced_edges, w, out, epsilon, minPts) ; - comp.stat_cluster(clones_junctions, out_dir + prefix_filename + GRAPH_FILENAME, out ); + clones_windows = comp.cluster(forced_edges, w, out, epsilon, minPts) ; + comp.stat_cluster(clones_windows, out_dir + prefix_filename + GRAPH_FILENAME, out ); comp.del(); } else { out << "No clustering" << endl ; - clones_junctions = comp.nocluster() ; + clones_windows = comp.nocluster() ; } - out << " ==> " << clones_junctions.size() << " clones" << endl ; + out << " ==> " << clones_windows.size() << " clones" << endl ; - map z = junctions->store; + map z = windows->store; // int size=z.size(); @@ -703,16 +722,16 @@ int main (int argc, char **argv) list, int> >sort_clones; - for (list >::const_iterator it = clones_junctions.begin(); it != clones_junctions.end(); ++it) + for (list >::const_iterator it = clones_windows.begin(); it != clones_windows.end(); ++it) { list clone = *it ; - int clone_nb_reads = total_nb_reads(clone, seqs_by_junction); + int clone_nb_reads = total_nb_reads(clone, seqs_by_window); bool labeled = false ; - // Is there a labeled junction ? + // Is there a labeled window ? for (list ::const_iterator iit = clone.begin(); iit != clone.end(); ++iit) { - if (junctions_labels[*iit].size()) + if (windows_labels.find(*iit) != windows_labels.end()) { labeled = true ; break ; @@ -734,7 +753,7 @@ int main (int argc, char **argv) out << "Output clones with >= " << min_reads_clone << " reads" << endl ; map clones_codes ; - map clones_map_junctions ; + map clones_map_windows ; list representatives ; list representatives_labels ; @@ -760,12 +779,16 @@ int main (int argc, char **argv) ++num_clone ; cout << "#### " ; string clone_file_name = out_dir+ prefix_filename + CLONE_FILENAME + string_of_int(num_clone) ; - string junctions_file_name = out_dir+ prefix_filename + JUNCTIONS_FILENAME + "-" + string_of_int(num_clone) ; + string windows_file_name = out_dir+ prefix_filename + WINDOWS_FILENAME + "-" + string_of_int(num_clone) ; string sequences_file_name = out_dir+ prefix_filename + SEQUENCES_FILENAME + "-" + string_of_int(num_clone) ; ofstream out_clone(clone_file_name.c_str()); - ofstream out_junctions(junctions_file_name.c_str()); - ofstream out_sequences(sequences_file_name.c_str()); + ofstream out_windows(windows_file_name.c_str()); + ofstream out_sequences; + + if (output_sequences_by_cluster) { + out_sequences.open(sequences_file_name.c_str()); + } html << "" << endl ; @@ -779,30 +802,30 @@ int main (int argc, char **argv) ////////////////////////////////// - list >sort_junctions; + list >sort_windows; for (list ::const_iterator it = clone.begin(); it != clone.end(); ++it) { - int nb_reads = seqs_by_junction[*it].size(); - sort_junctions.push_back(make_pair(*it, nb_reads)); + int nb_reads = seqs_by_window[*it].size(); + sort_windows.push_back(make_pair(*it, nb_reads)); } - sort_junctions.sort(pair_occurrence_sort); + sort_windows.sort(pair_occurrence_sort); - // Output junctions + // Output windows int num_seq = 0 ; - for (list >::const_iterator it = sort_junctions.begin(); - it != sort_junctions.end(); ++it) { + for (list >::const_iterator it = sort_windows.begin(); + it != sort_windows.end(); ++it) { num_seq++ ; - out_junctions << ">" << it->second << "--junction--" << num_seq << " " << junctions_labels[it->first] << endl ; - out_junctions << it->first << endl; + out_windows << ">" << it->second << "--window--" << num_seq << " " << windows_labels[it->first] << endl ; + out_windows << it->first << endl; if ((!detailed_cluster_analysis) && (num_seq == 1)) { out << "\t" << setw(WIDTH_NB_READS) << it->second << "\t"; out << it->first ; - out << "\t" << junctions_labels[it->first] ; + out << "\t" << windows_labels[it->first] ; } } @@ -818,137 +841,135 @@ int main (int argc, char **argv) string best_V ; string best_D ; string best_J ; - int more_junctions = 0 ; + int more_windows = 0 ; - for (list >::const_iterator it = sort_junctions.begin(); - it != sort_junctions.end(); ++it) { + for (list >::const_iterator it = sort_windows.begin(); + it != sort_windows.end(); ++it) { // Choose one representative - cout << "[choose " ; - cout.flush(); - ReadChooser chooser(seqs_by_junction[it->first], *scorer); - Sequence representative = chooser.getBest() ; - representative.label = string_of_int(it->second) + "-" + representative.label ; - - cout << "ok] "; - cout.flush() ; - + KmerRepresentativeComputer repComp(seqs_by_window[it->first], k); + repComp.compute(true, min_cover_representative, ratio_representative); + + if (repComp.hasRepresentative()) { + Sequence representative = repComp.getRepresentative(); + representative.label = string_of_int(it->second) + "-" + + representative.label; - FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost); + FineSegmenter seg(representative, rep_V, rep_J, delta_min, delta_max, segment_cost); - if (segment_D) - seg.FineSegmentD(rep_V, rep_D, rep_J); + if (segment_D) + seg.FineSegmentD(rep_V, rep_D, rep_J); - if (seg.isSegmented()) + if (seg.isSegmented()) - { - // As soon as one representative is segmented + { + // As soon as one representative is segmented - representatives.push_back(seg.getSequence()); - representatives_labels.push_back("#" + string_of_int(num_clone)); - cout << seg.info << endl ; + representatives.push_back(seg.getSequence()); + representatives_labels.push_back("#" + string_of_int(num_clone)); + cout << seg.info << endl ; - // We need to find the junction in the representative - size_t junction_pos = seg.getSequence().sequence.find(it->first); + // We need to find the window in the representative + size_t window_pos = seg.getSequence().sequence.find(it->first); - // Default - int ww = 2*w/3 ; // /2 ; + // Default + int ww = 2*w/3 ; // /2 ; - if (junction_pos != string::npos) { - // for V. - ww = seg.getLeft() - junction_pos + seg.del_V; - } + if (window_pos != string::npos) { + // for V. + ww = seg.getLeft() - window_pos + seg.del_V; + } - string end_V =""; + string end_V =""; - // avoid case when V is not in the junction - if (seg.getLeft() > (int) junction_pos) - end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, - ww - seg.del_V); + // avoid case when V is not in the window + if (seg.getLeft() > (int) window_pos) + end_V = rep_V.sequence(seg.best_V).substr(rep_V.sequence(seg.best_V).size() - ww, + ww - seg.del_V); - string mid_D = ""; + string mid_D = ""; - if (segment_D) - mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, - rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right ); + if (segment_D) + mid_D = rep_D.sequence(seg.best_D).substr(seg.del_D_left, + rep_D.sequence(seg.best_D).size() - seg.del_D_left - seg.del_D_right ); - if (junction_pos != string::npos) { - // for J. - ww = (junction_pos + w - 1) - seg.getRight() + seg.del_J; - } + if (window_pos != string::npos) { + // for J. + ww = (window_pos + w - 1) - seg.getRight() + seg.del_J; + } - string start_J = ""; + string start_J = ""; - // avoid case when J is not in the junction - if (seg.getRight() > (int) (junction_pos + w - 1)) - start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww); + // avoid case when J is not in the window + if (seg.getRight() > (int) (window_pos + w - 1)) + start_J=rep_J.sequence(seg.best_J).substr(seg.del_J, ww); - best_V = rep_V.label(seg.best_V) ; - if (segment_D) best_D = rep_D.label(seg.best_D) ; - best_J = rep_J.label(seg.best_J) ; + best_V = rep_V.label(seg.best_V) ; + if (segment_D) best_D = rep_D.label(seg.best_D) ; + best_J = rep_J.label(seg.best_J) ; - // TODO: pad aux dimensions exactes - string pad_N = "NNNNNNNNNNNNNNNN" ; + // TODO: pad aux dimensions exactes + string pad_N = "NNNNNNNNNNNNNNNN" ; - // Add V, (D) and J to junctions to be aligned + // Add V, (D) and J to windows to be aligned - out_junctions << ">" << best_V << "-junction" << endl ; - out_junctions << end_V << pad_N << endl ; - more_junctions++; - - if (segment_D) { - out_junctions << ">" << best_D << "-junction" << endl ; - out_junctions << mid_D << endl ; - more_junctions++ ; - } + out_windows << ">" << best_V << "-window" << endl ; + out_windows << end_V << pad_N << endl ; + more_windows++; + + if (segment_D) { + out_windows << ">" << best_D << "-window" << endl ; + out_windows << mid_D << endl ; + more_windows++ ; + } - out_junctions << ">" << best_J << "-junction" << endl ; - out_junctions << pad_N << start_J << endl ; - more_junctions++; + out_windows << ">" << best_J << "-window" << endl ; + out_windows << pad_N << start_J << endl ; + more_windows++; - string code = seg.code ; - int cc = clones_codes[code]; + string code = seg.code ; + int cc = clones_codes[code]; - html << " – " << code << endl ; + html << " – " << code << endl ; - if (cc) - { - html << "" ; - out << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")"; + if (cc) + { + html << "" ; + out << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")"; - nb_edges++ ; - out_edges << clones_map_junctions[code] + " " + it->first + " " ; - out_edges << code << " " ; - out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << " " ; - out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << " " ; - out_edges << endl ; + nb_edges++ ; + out_edges << clones_map_windows[code] + " " + it->first + " " ; + out_edges << code << " " ; + out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << " " ; + out_edges << "Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << num_clone << setfill(' ') << " " ; + out_edges << endl ; - html << "" ; - } - else - { - clones_codes[code] = num_clone ; - clones_map_junctions[code] = it->first ; - } + html << "" ; + } + else + { + clones_codes[code] = num_clone ; + clones_map_windows[code] = it->first ; + } - html << "" << endl ; - html << "
" << endl ;
+              html << "" << endl ;
+              html << "
" << endl ;
       
-	    // html (test)
-	    seg.html(html, segment_D) ;
+              // html (test)
+              seg.html(html, segment_D) ;
 
-	    // display junction
-	    cout << setw(junction_pos) << " " << it->first << " " << junctions_labels[it->first] << endl ;
+              // display window
+              cout << setw(window_pos) << " " << it->first << " " << windows_labels[it->first] << endl ;
 
-	    break ;
-	  }
-	  
+              break ;
+            }
+        }
       }
 
       out << endl ;
-      out_junctions.close();
+      out_windows.close();
 
       html << "
" << endl ; html << "