Commit 1ac5f145 authored by Marc Duez's avatar Marc Duez
Browse files
parents 2b4c982e 0d67533f
......@@ -63,10 +63,10 @@ should_coverage: clean
unit_gcovr: unit_coverage
mkdir -p reports
which gcovr > /dev/null && (cd algo; gcovr -r . -e tests/ --xml > reports/unit_coverage.xml) || echo "gcovr is needed to generate a full report"
which gcovr > /dev/null && (cd algo; gcovr -r . -e tests/ --xml > ../reports/unit_coverage.xml) || echo "gcovr is needed to generate a full report"
should_gcovr: should_coverage
mkdir -p reports
which gcovr > /dev/null && (cd algo; gcovr -r . -e tests/ --xml > reports/should_coverage.xml) || echo "gcovr is needed to generate a full report"
which gcovr > /dev/null && (cd algo; gcovr -r . -e tests/ --xml > ../reports/should_coverage.xml) || echo "gcovr is needed to generate a full report"
### Upload to coveralls.io
......
......@@ -12,7 +12,11 @@ void Germline::init(string _code, char _shortcut,
affect_5 = "V" ;
affect_4 = "" ;
affect_3 = "J" ;
affect_5 = string(1, toupper(shortcut)) + "-" + code + "V";
affect_4 = ""; // string(1, 14 + shortcut) + "-" + code + "D";
affect_3 = string(1, tolower(shortcut)) + "-" + code + "J";
delta_min = _delta_min ;
delta_max = _delta_max ;
......@@ -240,10 +244,10 @@ void MultiGermline::insert_in_one_index(IKmerStore<KmerAffect> *_index)
for (list<Germline*>::const_iterator it = germlines.begin(); it != germlines.end(); ++it)
{
Germline *germline = *it ;
germline->affect_5 = string(1, germline->shortcut) + "-" + germline->code + "V";
if (germline->rep_4.size())
germline->affect_4 = string(1, 14 + germline->shortcut) + "-" + germline->code + "D";
germline->affect_3 = string(1, tolower(germline->shortcut)) + "-" + germline->code + "J";
germline->use_index(_index) ;
}
}
......
......@@ -77,6 +77,10 @@ bool Segmenter::isDSegmented() const {
return dSegmented;
}
bool KmerSegmenter::isDetected() const {
return detected;
}
// Chevauchement
string Segmenter::removeChevauchement()
......@@ -166,23 +170,19 @@ ostream &operator<<(ostream &out, const Segmenter &s)
// KmerSegmenter (Cheap)
KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
KmerSegmenter::KmerSegmenter() { kaa = 0 ; }
KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline)
{
label = seq.label ;
sequence = seq.sequence ;
info = "" ;
info_extra = "seed";
segmented = false;
segmented_germline = 0;
segmented_germline = germline ;
reversed = false;
Dend=0;
// Iterate over the germlines
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
segmented_germline = germline;
int s = (size_t)germline->index->getS() ;
int length = sequence.length() ;
......@@ -190,7 +190,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
{
because = UNSEG_TOO_SHORT;
kaa = NULL;
continue ;
return ;
}
kaa = new KmerAffectAnalyser(*(germline->index), sequence);
......@@ -227,8 +227,6 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
if (segmented)
{
// Yes, it is segmented
germline->stats.insert(length);
reversed = (strand == -1);
because = reversed ? SEG_MINUS : SEG_PLUS ;
......@@ -240,16 +238,60 @@ KmerSegmenter::KmerSegmenter(Sequence seq, MultiGermline *multigermline)
return ;
}
// If the germline was detected, do not test other germlines
if (detected)
return ;
} // end for (Germlines)
}
KmerSegmenter::~KmerSegmenter() {
if (kaa)
delete kaa;
// if (kaa)
// delete kaa;
}
KmerMultiSegmenter::KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented)
{
int best_score = 0 ;
// Iterate over the germlines
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it)
{
Germline *germline = *it ;
KmerSegmenter kseg(seq, germline);
if (out_unsegmented)
{
// Debug, display k-mer affectation and segmentation result for this germline
*out_unsegmented << "#"
<< left << setw(4) << kseg.segmented_germline->code << " "
<< left << setw(20) << segmented_mesg[kseg.getSegmentationStatus()] << " ";
if (kseg.isSegmented())
*out_unsegmented << right << setw(3) << kseg.score << " ";
else
*out_unsegmented << " " ;
if (kseg.getSegmentationStatus() != UNSEG_TOO_SHORT)
*out_unsegmented << kseg.getKmerAffectAnalyser()->toString();
*out_unsegmented << endl ;
}
if (!best_score)
the_kseg = kseg;
if (kseg.isSegmented())
{
// Yes, it is segmented
if (kseg.score > best_score)
{
the_kseg = kseg ;
best_score = kseg.score ;
}
}
} // end for (Germlines)
}
KmerMultiSegmenter::~KmerMultiSegmenter() {
}
void KmerSegmenter::computeSegmentation(int strand, Germline* germline) {
......@@ -258,6 +300,8 @@ void KmerSegmenter::computeSegmentation(int strand, Germline* germline) {
segmented = true ;
because = 0 ; // Cause of unsegmentation
score = 0 ;
affect_infos max;
// Zero information
if (strand == 0)
......@@ -270,7 +314,6 @@ void KmerSegmenter::computeSegmentation(int strand, Germline* germline) {
}
else
{
affect_infos max;
if (strand == 1)
max = kaa->getMaximum(KmerAffect(germline->affect_5, 1),
KmerAffect(germline->affect_3, 1));
......@@ -320,6 +363,8 @@ void KmerSegmenter::computeSegmentation(int strand, Germline* germline) {
}
if (because)
segmented = false;
else
score = max.nb_before_left + max.nb_before_right + max.nb_after_left + max.nb_after_right;
}
KmerAffectAnalyser *KmerSegmenter::getKmerAffectAnalyser() const {
......
......@@ -141,12 +141,16 @@ class KmerSegmenter : public Segmenter
string affects;
public:
bool isDetected() const;
int score;
KmerSegmenter();
/**
* Build a segmenter based on KmerSegmentation
* @param seq: An object read from a FASTA/FASTQ file
* @param multigermline: the multigermline
* @param germline: the germline
*/
KmerSegmenter(Sequence seq, MultiGermline *multigermline);
KmerSegmenter(Sequence seq, Germline *germline);
~KmerSegmenter();
......@@ -167,6 +171,21 @@ class KmerSegmenter : public Segmenter
void computeSegmentation(int strand, Germline* germline);
};
class KmerMultiSegmenter
{
public:
/**
* @param seq: An object read from a FASTA/FASTQ file
* @param multigermline: the multigermline
*/
KmerMultiSegmenter(Sequence seq, MultiGermline *multigermline, ostream *out_unsegmented);
~KmerMultiSegmenter();
KmerSegmenter the_kseg;
};
class FineSegmenter : public Segmenter
{
public:
......
......@@ -20,12 +20,21 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
while (reads->hasNext()) {
reads->next();
nb_reads++;
if (out_unsegmented) {
*out_unsegmented << reads->getSequence();
}
KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_unsegmented);
KmerSegmenter seg(reads->getSequence(), multigermline);
KmerSegmenter seg = kmseg.the_kseg ;
int read_length = seg.getSequence().sequence.length();
stats[seg.getSegmentationStatus()].insert(read_length);
if (seg.isSegmented()) {
seg.segmented_germline->stats.insert(read_length);
junction junc = seg.getJunction(w);
if (junc.size()) {
......@@ -38,18 +47,16 @@ WindowsStorage *WindowExtractor::extract(OnlineFasta *reads, MultiGermline *mult
if (out_segmented) {
*out_segmented << seg ; // KmerSegmenter output (V/N/J)
if (out_unsegmented)
if (out_unsegmented) {
*out_segmented << seg.getKmerAffectAnalyser()->toString() << endl;
*out_unsegmented << "#>" << reads->getSequence().label << " segmented on " << seg.segmented_germline->code << endl << endl;
}
}
nb_reads_germline[seg.system]++;
} else if (out_unsegmented) {
*out_unsegmented << reads->getSequence();
*out_unsegmented << "#" << segmented_mesg[seg.getSegmentationStatus()] << " " << seg.segmented_germline->code << endl;
if (seg.getSegmentationStatus() != UNSEG_TOO_SHORT) {
*out_unsegmented << seg.getKmerAffectAnalyser()->toString() << endl;
}
*out_unsegmented << "#>" << reads->getSequence().label << " not segmented" << endl << endl;
}
}
return windowsStorage;
......
!LAUNCH: ../../vidjil -A -uU -g ../../germline ../../data/chimera-trg.fa
$ Do not segment on IGL by chance
1:IGL .* -> .* 0
f1:IGL .* -> .* 0
$ Report the sequence as AMBIGUOUS
1:UNSEG ambiguous .* 2
f1:UNSEG ambiguous .* 2
!LAUNCH: ../../vidjil -g ../../germline ../../data/multi-complete.fa
$ Segment all the seven reads
1:segmented 7 reads
$ Segment one read on TRA
1:TRA .* -> .* 1
$ Segment one read on TRB
1:TRB .* -> .* 1
$ Segment one read on TRG
1:TRG .* -> .* 1
$ Segment one read on TRD
1:TRD .* -> .* 1
$ Segment one read on IGH
1:IGH .* -> .* 1
$ Segment one read on IGK
1:IGK .* -> .* 1
$ Segment one read on IGL
1:IGL .* -> .* 1
!LAUNCH: ../../vidjil -g ../../germline -i ../../data/multi-complete.fa
$ Segment all the seven reads
1:segmented 7 reads
$ Segment one read on TRA
1:TRA .* -> .* 1
$ Segment one read on TRB
1:TRB .* -> .* 1
$ Segment one read on TRG
1:TRG .* -> .* 1
$ Segment one read on TRD
1:TRD .* -> .* 1
$ Segment one read on IGH
1:IGH .* -> .* 1
$ Segment one read on IGK
1:IGK .* -> .* 1
$ Segment one read on IGL
1:IGL .* -> .* 1
!LAUNCH: ../../vidjil -g ../../germline -i ../../data/multi-short.fa
$ Segment all the seven reads
1:segmented 7 reads
$ Segment one read on TRA
1:TRA .* -> .* 1
$ Segment one read on TRB
1:TRB .* -> .* 1
$ Segment one read on TRG
1:TRG .* -> .* 1
$ Segment one read on TRD
1:TRD .* -> .* 1
$ Segment one read on IGH
1:IGH .* -> .* 1
$ Segment one read on IGK
1:IGK .* -> .* 1
$ Segment one read on IGL
1:IGL .* -> .* 1
!LAUNCH: ../../vidjil -g ../../germline ../../data/multi-short.fa
$ Segment all the seven reads
1:segmented 7 reads
$ Segment one read on TRA
1:TRA .* -> .* 1
$ Segment one read on TRB
1:TRB .* -> .* 1
$ Segment one read on TRG
1:TRG .* -> .* 1
$ Segment one read on TRD
1:TRD .* -> .* 1
$ Segment one read on IGH
1:IGH .* -> .* 1
$ Segment one read on IGK
1:IGK .* -> .* 1
$ Segment one read on IGL
1:IGL .* -> .* 1
!LAUNCH: ../../vidjil -r 5 -o out2 -u -U -v -G ../../germline/IGH ../../data/Stanford_S22.fasta ; tail out2/Stanford_S22.segmented.vdj.fa out2/Stanford_S22.unsegmented.fa
!LAUNCH: ../../vidjil -r 5 -o out2 -u -U -v -G ../../germline/IGH ../../data/Stanford_S22.fasta ; tail out2/Stanford_S22.segmented.vdj.fa ; grep not out2/Stanford_S22.affects
# Testing uncommon and debug options
$ verbose (-v)
2:auditioned sequences
$ segmented.fa (-U)
1:>lcl.FLN1FA001BCDD7.1. .* seed
1:>lcl.FLN1FA001BCDD7.1. .* seed IGH
$ unsegmented.fa (-u)
1:>lcl.FLN1FA002PX5D6.1.
1:>lcl.FLN1FA002PX5D6.1. not segmented
......@@ -17,10 +17,6 @@ void testSegmentationBug1(int delta_min, int delta_max) {
Germline *germline ;
germline = new Germline("custom", 'x', seqV, seqV, seqJ, delta_min, delta_max);
germline->new_index("##############");
MultiGermline *multi ;
multi = new MultiGermline();
multi->insert(germline);
OnlineFasta input(buggy_sequences);
......@@ -41,8 +37,7 @@ void testSegmentationBug1(int delta_min, int delta_max) {
}
}
Segmenter *segment = new KmerSegmenter(input.getSequence(), multi);
KmerSegmenter *segment = new KmerSegmenter(input.getSequence(), germline);
if (strand == 2
|| (strand == 1
......@@ -56,7 +51,7 @@ void testSegmentationBug1(int delta_min, int delta_max) {
delete segment;
delete kaa;
}
delete multi;
delete germline;
}
void testBugs() {
......
......@@ -78,12 +78,9 @@ void testSegmentOverlap()
germline2 = new Germline("TRG2", 'G', seqV, seqV, seqJ, -50, 50);
germline2->new_index("##########");
MultiGermline *multi1 ;
multi1 = new MultiGermline();
multi1->insert(germline1);
for (int i = 0; i < data.size(); i++) {
KmerSegmenter ks(data.read(i), multi1);
KmerSegmenter ks(data.read(i), germline1);
TAP_TEST(ks.seg_V + ks.seg_N + ks.seg_J == data.sequence(i)
|| ks.seg_V + ks.seg_N + ks.seg_J == revcomp(data.sequence(i)),
TEST_KMER_SEGMENT_OVERLAP,
......@@ -96,7 +93,7 @@ void testSegmentOverlap()
" V= " << fs.seg_V << ", N = " << fs.seg_N << ", J = " << fs.seg_J);
}
delete multi1;
delete germline1;
delete germline2;
}
......@@ -110,15 +107,11 @@ void testSegmentationCause() {
germline = new Germline("TRG", 'G', seqV, seqV, seqJ, 0, 10);
germline->new_index("##########");
MultiGermline *multi ;
multi = new MultiGermline();
multi->insert(germline);
int nb_checked = 0;
for (int i = 0; i < data.size(); i++) {
KmerSegmenter ks(data.read(i), multi);
KmerSegmenter ks(data.read(i), germline);
if (data.read(i).label == "seq-seg+") {
TAP_TEST(ks.isSegmented(), TEST_KMER_IS_SEGMENTED, "seq is " << data.label(i));
TAP_TEST(ks.getSegmentationStatus() == SEG_PLUS, TEST_KMER_SEGMENTATION_CAUSE, "");
......@@ -200,7 +193,7 @@ void testSegmentationCause() {
TAP_TEST(nb_checked == 13, TEST_KMER_DATA, "");
delete multi;
delete germline;
}
void testExtractor() {
......
CC=g++
OPTIM=-O2 -DNDEBUG
CXXFLAGS=-W -Wall -I.. $(OPTIM) $(DEBUG) $(CONFIG)
LDFLAGS=-lm
LDFLAGS=-lm -lz
SRC=$(wildcard *.cpp)
EXEC=$(SRC:.cpp=)
LIBCORE=../core/vidjil.a
LIBCORE=../core/vidjil.a ../lib/lib.a
.PHONY: all core clean forcedep
......
......@@ -89,7 +89,7 @@ enum { CMD_WINDOWS, CMD_CLONES, CMD_SEGMENT, CMD_GERMLINES } ;
#define CLONE_FILENAME "clone.fa-"
#define WINDOWS_FILENAME ".windows.fa"
#define SEGMENTED_FILENAME ".segmented.vdj.fa"
#define UNSEGMENTED_FILENAME ".unsegmented.fa"
#define AFFECTS_FILENAME ".affects"
#define EDGES_FILENAME ".edges"
#define COMP_FILENAME "comp.vidjil"
#define JSON_SUFFIX ".vidjil"
......@@ -203,8 +203,7 @@ void usage(char *progname)
<< "Debug" << endl
<< " -U output segmented (in " << SEGMENTED_FILENAME << " file) sequences" << endl
<< " -u output unsegmented (in " << UNSEGMENTED_FILENAME << " file) sequences" << endl
<< " and display detailed k-mer affectation both on segmented and on unsegmented sequences" << endl
<< " -u output detailed k-mer affectation both on segmented and on unsegmented sequences (in " << AFFECTS_FILENAME << " file)" << endl
<< "Output" << endl
<< " -o <dir> output directory (default: " << DEFAULT_OUT_DIR << ")" << endl
<< " -b <string> output basename (by default basename of the input file)" << endl
......@@ -854,7 +853,7 @@ int main (int argc, char **argv)
}
if (output_unsegmented) {
string f_unsegmented = out_dir + f_basename + UNSEGMENTED_FILENAME ;
string f_unsegmented = out_dir + f_basename + AFFECTS_FILENAME ;
cout << " ==> " << f_unsegmented << endl ;
out_unsegmented = new ofstream(f_unsegmented.c_str());
we.setUnsegmentedOutput(out_unsegmented);
......
>TRA--V1-1*01--J1*01
ggacaaagccttgagcagccctctgaagtgacagctgtggaaggagccattgtccagata
aactgcacgtaccagacatctgggttttatgggctgtcctggtaccagcaacatgatggc
ggagcacccacatttctttcttacaatgctctggatggtttggaggagacaggtcgtttt
tcttcattccttagtcgctctgatagttatggttacctccttctacaggagctccagatg
aaagactctgcctcttacttctgcgctgtgagaga
gtatgaaagtattacctcccagttgcaatttggcaaaggaaccagagtttccacttctcc
cc
>TRB--V1*01--D1*01--J1-1*01
gatactggaattacccagacaccaaaatacctggtcacagcaatggggagtaaaaggaca
atgaaacgtgagcatctgggacatgattctatgtattggtacagacagaaagctaagaaa
tccctggagttcatgttttactacaactgtaaggaattcattgaaaacaagactgtgcca
aatcacttcacacctgaatgccctgacagctctcgcttataccttcatgtggtcgcactg
cagcaagaagactcagctgcgtatctctgcaccagcagccaaga
gggacagggggc
tgaacactgaagctttctttggacaaggcaccagactcacagttgtag
>TRG--V1*01--J1*01
tcttccaacttggaagggagaacgaagtcagtcaccaggctgactgggtcatctgctgaa
atcacctgtgatcttcctggagcaagtaccttatacatccactggtacctgcaccaggag
gggaaggccccacagtgtcttctgtactatgaaccctactactccagggttgtgctggaa
tcaggaatcactccaggaaagtatgacactggaagcacaaggagcaattggaatttgaga
ctgcaaaatctaattaaaaatgattctgggttctattactgtgccacctgggacagg
gaattattataagaaactctttggcagtggaacaacactggttgtcacag
>TRD--V1*01--D1*01--J1*01
gcccagaaggttactcaagcccagtcatcagtatccatgccagtgaggaaagcagtcacc
ctgaactgcctgtatgaaacaagttggtggtcatattatattttttggtacaagcaactt
cccagcaaagagatgattttccttattcgccagggttctgatgaacagaatgcaaaaagt
ggtcgctattctgtcaacttcaagaaagcagcgaaatccgtcgccttaaccatttcagcc
ttacagctagaagattcagcaaagtacttttgtgctcttggggaact
gaaatagt
acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa
>IGH--V1-18*01--D1-1*01--J1*01
caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctcagtgaaggtc
tcctgcaaggcttctggttacacctttaccagctatggtatcagctgggtgcgacaggcc
cctggacaagggcttgagtggatgggatggatcagcgcttacaatggtaacacaaactat
gcacagaagctccagggcagagtcaccatgaccacagacacatccacgagcacagcctac
atggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga
ggtacaactggaacgac
gctgaatacttccagcactggggccagggcaccctggtcaccgtctcctcag
>IGK--V1-12*01--J1*01
gacatccagatgacccagtctccatcttccgtgtctgcatctgtaggagacagagtcacc
atcacttgtcgggcgagtcagggtattagcagctggttagcctggtatcagcagaaacca
gggaaagcccctaagctcctgatctatgctgcatccagtttgcaaagtggggtcccatca
aggttcagcggcagtggatctgggacagatttcactctcaccatcagcagcctgcagcct
gaagattttgcaacttactattgtcaacaggctaacagtttccctcc
gtggacgttcggccaagggaccaaggtggaaatcaaac
>IGL--V1-36*01--J1*01
cagtctgtgctgactcagccaccctcggtgtctgaagcccccaggcagagggtcaccatc
tcctgttctggaagcagctccaacatcggaaataatgctgtaaactggtaccagcagctc
ccaggaaaggctcccaaactcctcatctattatgatgatctgctgccctcaggggtctct
gaccgattctctggctccaagtctggcacctcagcctccctggccatcagtgggctccag
tctgaggatgaggctgattattactgtgcagcatgggatgacagcctgaatggtcc
ttatgtcttcggaactgggaccaaggtcaccgtcctag
>TRA--V1-1*01--J1*01
tcttcattccttagtcgctctgatagttatggttacctccttctacaggagctccagatg
aaagactctgcctcttacttctgcgctgtgagaga
gtatgaaagtattacctcccagttgcaatttggcaaaggaaccagagtttccacttctcc
>TRB--V1*01--D1*01--J1-1*01
cagcaagaagactcagctgcgtatctctgcaccagcagccaaga
gggacagggggc
tgaacactgaagctttctttggacaaggcaccagactcacagttgtag
>TRG--V1*01--J1*01
ctgcaaaatctaattaaaaatgattctgggttctattactgtgccacctgggacagg
gaattattataagaaactctttggcagtggaacaacactggttgtcacag
>TRD--V1*01--D1*01--J1*01
ttacagctagaagattcagcaaagtacttttgtgctcttggggaact
gaaatagt
acaccgataaactcatctttggaaaaggaacccgtgtgactgtggaaccaa
>IGH--V1-18*01--D1-1*01--J1*01
atggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga
ggtacaactggaacgac
gctgaatacttccagcactggggccagggcaccctggtcaccgtctcctcag