Commit 656ddd25 authored by Mikaël Salson's avatar Mikaël Salson

Merge branch 'feature-a/3358-output' into 'dev'

Feature a/3358 output

See merge request !320
parents 07961c01 932b88af
Pipeline #44834 passed with stages
in 7 minutes and 14 seconds
#include "output.h"
void Output::set(string key, json val)
{
j[key] = val ;
}
void Output::set(string key, string subkey, json val)
{
j[key][subkey] = val ;
}
void Output::set(string key, string subkey, string subsubkey, json val)
{
j[key][subkey][subsubkey] = val ;
}
void CloneOutput::setSeg(string subkey, json val)
{
set(KEY_SEG, subkey, val);
}
void Output::add_warning(string code, string msg, string level)
{
json_add_warning(j, code, msg, level);
}
json CloneOutput::toJson()
{
return j;
}
SampleOutput::SampleOutput(json init)
{
j = init;
}
SampleOutput::~SampleOutput()
{
for (auto it: clones)
delete it.second;
}
void SampleOutput::out(ostream &s)
{
}
void SampleOutput::addClone(junction junction, CloneOutput *clone)
{
clones[junction] = clone;
}
CloneOutput* SampleOutput::getClone(junction junction)
{
if (clones.find(junction) != clones.end()){
return clones[junction];
}
else
{
CloneOutput *clone = new(CloneOutput);
addClone(junction, clone);
clone -> set("sequence", 0); // TODO need to compute representative sequence for this case
return clone;
}
}
void SampleOutputVidjil::out(ostream &s)
{
json j_clones;
for (auto it: clones)
j_clones.push_back(it.second->toJson());
j["clones"] = j_clones;
s << j.dump(2);
}
#ifndef OUTPUT_H
#define OUTPUT_H
#include <string>
#include <fstream>
#include <iostream>
#include "tools.h"
#include "../lib/json.hpp"
#define KEY_SEG "seg"
using namespace std;
using json = nlohmann::json;
class Output
{
protected:
json j;
public:
void set(string key, json val);
void set(string key, string subkey, json val);
void set(string key, string subkey, string subsubkey, json val);
void add_warning(string code, string msg, string level);
};
class CloneOutput : public Output
{
public:
void setSeg(string subkey, json val);
json toJson();
};
class SampleOutput : public Output
{
protected:
map <junction, CloneOutput*> clones;
public:
SampleOutput(json init);
virtual ~SampleOutput();
void addClone(junction junction, CloneOutput *clone);
// get a clone, or create a new one if needed
CloneOutput* getClone(junction junction);
json toJson();
void out(ostream &s);
};
/*
class CloneOutputFormatter
{
}
class CloneOutputFormatterCSV(CloneOutputFormatter)
{
}
class CloneOutputFormatterJson(CloneOutputFormatter)
{
}
*/
// Native Json .vidjil format
// See vidjil-format.md
class SampleOutputVidjil : public SampleOutput
{
public:
void out(ostream &s);
};
#endif
......@@ -24,6 +24,7 @@
#include <cassert>
#include "segment.h"
#include "tools.h"
#include "output.h"
#include "../lib/json.hpp"
#include "affectanalyser.h"
#include <sstream>
......@@ -70,10 +71,9 @@ string AlignBox::getSequence(string sequence) {
return sequence.substr(start, end-start+1);
}
void AlignBox::addToJson(json &seg, int alternative_genes) {
void AlignBox::addToOutput(CloneOutput *clone, int alternative_genes) {
json j;
j["name"] = ref_label;
if (key != "3") // no end information for J
......@@ -88,14 +88,16 @@ void AlignBox::addToJson(json &seg, int alternative_genes) {
j["delLeft"] = del_left;
}
seg[key] = j ;
clone->setSeg(key, j) ;
/*Export the N best genes if threshold parameter is specified*/
if(rep && !this->score.empty() && rep->size() <= (int)this->score.size() && alternative_genes > 0 && alternative_genes <= (int)this->score.size()){
seg[key + "alt"] = json::array();
json jalt = json::array();
for(int i = 0; i < alternative_genes;++i){
int r = this->score[i].second;
seg[key + "alt"].push_back(json::object({{"name",rep->label(r)}}));
jalt.push_back(json::object({{"name",rep->label(r)}}));
}
clone->setSeg(key + "alt", jalt);
}
}
......@@ -1330,7 +1332,7 @@ void FineSegmenter::findCDR3(){
// Reminder: JUNCTIONstart is 1-based
}
void FineSegmenter::checkWarnings(json &json_clone)
void FineSegmenter::checkWarnings(CloneOutput *clone)
{
if (isSegmented())
{
......@@ -1339,79 +1341,82 @@ void FineSegmenter::checkWarnings(json &json_clone)
&& (box_J->ref_label.find("IGHJ1") != string::npos)
&& ((getMidLength() >= 90) || (getMidLength() <= 94)))
{
json_add_warning(json_clone, "W61", "Non-recombined D7-27/J1 sequence", LEVEL_ERROR);
clone->add_warning("W61", "Non-recombined D7-27/J1 sequence", LEVEL_ERROR);
}
}
}
json FineSegmenter::toJson(){
void FineSegmenter::toOutput(CloneOutput *clone){
json seg;
for (AlignBox *box: boxes)
{
box->addToJson(seg, this->alternative_genes);
box->addToOutput(clone, this->alternative_genes);
}
if (isSegmented()) {
clone->set("name", code);
if (isDSegmented()) {
seg["N1"] = seg_N1.size();
seg["N2"] = seg_N2.size();
clone->setSeg("N1", seg_N1.size());
clone->setSeg("N2", seg_N2.size());
}
else {
seg["N"] = seg_N.size();
clone->setSeg("N", seg_N.size());
}
if (CDR3start >= 0) {
seg["cdr3"] = {
clone->setSeg("cdr3", {
{"start", CDR3start},
{"stop", CDR3end},
{"aa", CDR3aa}
};
});
}
if (JUNCTIONstart >= 0) {
seg["junction"] = {
clone->setSeg("junction", {
{"start", JUNCTIONstart},
{"stop", JUNCTIONend},
{"aa", JUNCTIONaa},
{"productive", JUNCTIONproductive}
};
});
}
}
return seg;
else // not segmented
{
clone->set("name", label);
}
}
json toJsonSegVal(string s) {
return {{"val", s}};
}
json KmerSegmenter::toJson() {
void KmerSegmenter::toOutput(CloneOutput *clone) {
json seg;
int sequenceSize = sequence.size();
if (evalue > NO_LIMIT_VALUE)
seg["evalue"] = toJsonSegVal(scientific_string_of_double(evalue));
clone->setSeg("evalue", toJsonSegVal(scientific_string_of_double(evalue)));
if (evalue_left > NO_LIMIT_VALUE)
seg["evalue_left"] = toJsonSegVal(scientific_string_of_double(evalue_left));
clone->setSeg("evalue_left", toJsonSegVal(scientific_string_of_double(evalue_left)));
if (evalue_right > NO_LIMIT_VALUE)
seg["evalue_right"] = toJsonSegVal(scientific_string_of_double(evalue_right));
clone->setSeg("evalue_right", toJsonSegVal(scientific_string_of_double(evalue_right)));
if (getKmerAffectAnalyser() != NULL) {
seg["affectValues"] = {
clone->setSeg("affectValues", {
{"start", 1},
{"stop", sequenceSize},
{"seq", getKmerAffectAnalyser()->toStringValues()}
};
});
seg["affectSigns"] = {
clone->setSeg("affectSigns", {
{"start", 1},
{"stop", sequenceSize},
{"seq", getKmerAffectAnalyser()->toStringSigns()}
};
});
}
return seg;
}
......
......@@ -7,6 +7,7 @@
#include "bioreader.hpp"
#include "dynprog.h"
#include "tools.h"
#include "output.h"
#include "germline.h"
#include "kmerstore.h"
#include "kmeraffect.h"
......@@ -102,7 +103,7 @@ class AlignBox
AlignBox(string key = "", string color="");
string getSequence(string sequence);
void addToJson(json &seg, int alternative_genes=NO_LIMIT_VALUE);
void addToOutput(CloneOutput *clone, int alternative_genes);
/**
* Returns 'V', 'D', 'J', or possibly '5', '4', '3', '?', depending on the ref_label and on the key
......@@ -326,7 +327,7 @@ class KmerSegmenter : public Segmenter
KmerAffectAnalyser *getKmerAffectAnalyser() const;
string getInfoLineWithAffects() const;
json toJson();
void toOutput(CloneOutput *clone);
private:
void computeSegmentation(int strand, KmerAffect left, KmerAffect right,
......@@ -402,8 +403,8 @@ class FineSegmenter : public Segmenter
*/
void findCDR3();
void checkWarnings(json &json_clone);
json toJson();
void checkWarnings(CloneOutput *clone);
void toOutput(CloneOutput *clone);
};
......
......@@ -71,10 +71,10 @@ string spaced(const string &input, const string &seed) {
}
string string_of_int(int number)
string string_of_int(int number, int w)
{
stringstream ss;
ss << number ;
ss << setfill('0') << setw(w) << number ;
return ss.str();
}
......
#ifndef TOOLS_H
#define TOOLS_H
using namespace std ;
typedef string junction ;
// error
#define ERROR_STRING "[error] "
......@@ -107,7 +109,7 @@ template <class T>
bool pair_occurrence_sort(pair<T, int> a, pair<T, int> b);
string string_of_int(int number);
string string_of_int(int number, int pad_to_width=0);
string fixed_string_of_float(float number, int precision);
string scientific_string_of_double(double number);
string string_of_map(map <string, string> m, const string &before);
......
......@@ -266,35 +266,25 @@ void WindowsStorage::clearSequences(){
seqs_by_window.clear();
}
json WindowsStorage::sortedWindowsToJson(map <junction, json> json_data_segment, int max_json_output, bool delete_all) {
json windowsArray;
void WindowsStorage::sortedWindowsToOutput(SampleOutput *output, int max_output, bool delete_all) {
int top = 1;
for (list<pair <junction, size_t> >::iterator it = sort_all_windows.begin();
it != sort_all_windows.end(); )
{
json windowsList;
CloneOutput *clone = output->getClone(it->first);
if (json_data_segment.find(it->first) != json_data_segment.end()){
windowsList = json_data_segment[it->first];
}else{
windowsList["sequence"] = 0; //TODO need to compute representative sequence for this case
}
json reads = {it->second};
windowsList["id"] = it->first;
if (status_by_window[it->first][SEG_CHANGED_WINDOW])
json_add_warning(windowsList, "W50", "Short or shifted window");
clone->add_warning("W50", "Short or shifted window", LEVEL_WARN);
windowsList["reads"] = reads;
windowsList["top"] = top++;
windowsList["germline"] = germline_by_window[it->first]->code;
windowsList["seg_stat"] = this->statusToJson(it->first);
clone->set("id", it->first);
clone->set("reads", {it->second});
clone->set("top", top++);
clone->set("germline", germline_by_window[it->first]->code);
clone->set("seg_stat", this->statusToJson(it->first));
windowsArray.push_back(windowsList);
if (delete_all) {
germline_by_window.erase(it->first);
status_by_window.erase(it->first);
......@@ -302,11 +292,9 @@ json WindowsStorage::sortedWindowsToJson(map <junction, json> json_data_segment,
} else {
it++;
}
if (top == max_json_output + 1)
if (top == max_output + 1)
break ;
}
return windowsArray;
}
ostream &WindowsStorage::windowToStream(ostream &os, junction window, int num_seq,
......
......@@ -19,6 +19,7 @@
#include "read_score.h"
#include "representative.h"
#include "stats.h"
#include "output.h"
#include "../lib/json_fwd.hpp"
#define NB_BINS 30
......@@ -27,8 +28,6 @@
using namespace std;
using json = nlohmann::json;
typedef string junction ;
class WindowsStorage {
private:
map<junction, BinReadStorage > seqs_by_window;
......@@ -222,9 +221,9 @@ class WindowsStorage {
/**
* @param delete_all: Delete the objects while they are inserted into the JSON. This prevents the memory
* from continously increasing (see #2120, #3387)
* @return a JSON object containing all the information
* @post insert the information into the SampleOutput clones
*/
json sortedWindowsToJson(map<junction, json> json_data_segment, int max_json_output, bool delete_all=false);
void sortedWindowsToOutput(SampleOutput *output, int max_output, bool delete_all=false);
/**
* Clear the seqs_by_window map.
......
......@@ -9,10 +9,10 @@ $ Do not segment the TR recombinations
4:TR.* UNSEG
$ Report the unsegmented sequences in the json output
1: "germline": "not analyzed", "id": "TRA
1: "germline": "not analyzed", "id": "TRB
1: "germline": "not analyzed", "id": "TRG
1: "germline": "not analyzed", "id": "TRD
1: "germline": "not analyzed", "id": "000001", "name": "TRA
1: "germline": "not analyzed", "id": "000002", "name": "TRB
1: "germline": "not analyzed", "id": "000003", "name": "TRG
1: "germline": "not analyzed", "id": "000004", "name": "TRD
$ Count the unsegmented sequences in the json output
1: "not analyzed": .4.
......
......@@ -7,13 +7,12 @@
#include "core/dynprog.h"
#include "core/bioreader.hpp"
#include "core/segment.h"
#include "core/output.h"
#include "core/windowExtractor.h"
#include "lib/json.hpp"
using namespace std;
void testOverlap()
{
AlignBox *box_A = new AlignBox() ;
......@@ -275,7 +274,10 @@ void testBug2224(IndexTypes index) {
KmerSegmenter ks(data.read(0), germline);
TAP_TEST(ks.getKmerAffectAnalyser() == NULL, TEST_BUG2224, "");
json json_output = ks.toJson();
CloneOutput clone ;
ks.toOutput(&clone);
json json_output = clone.toJson();
TAP_TEST_EQUAL(json_output.count("affectValues"), 0, TEST_BUG2224, "");
TAP_TEST_EQUAL(json_output.count("affectSigns"), 0, TEST_BUG2224, "");
TAP_TEST_EQUAL(json_output.count("affectevalue"), 0, TEST_BUG2224, "");
......
......@@ -491,6 +491,12 @@ void testExtractGeneName(){
"Fail to extract gene name from:" << example_3 << " result:" << extractGeneName(example_3));
}
void testConversions(){
TAP_TEST_EQUAL(string_of_int(12), "12", TEST_CONVERSIONS, "");
TAP_TEST_EQUAL(string_of_int(12, 4), "0012", TEST_CONVERSIONS, "");
TAP_TEST_EQUAL(fixed_string_of_float(12.345, 1), "12.3", TEST_CONVERSIONS, "");
}
void testTools() {
testOnlineBioReader1();
testOnlineBioReaderMaxNth();
......@@ -515,4 +521,5 @@ void testTools() {
testTrimSequence();
testIsStopCodon();
testExtractGeneName();
testConversions();
}
......@@ -23,6 +23,7 @@ enum {
TEST_BAM_SEQUENCE,
TEST_IS_STOP_CODON,
TEST_EXTRACT_GENE_NAME,
TEST_CONVERSIONS,
/* Filter tests */
TEST_AUTOMATON_BUILDER_TO_FILTER_BIOREADER,
......
......@@ -53,6 +53,7 @@
#include "core/labels.h"
#include "core/list_utils.h"
#include "core/windowExtractor.h"
#include "core/output.h"
#include "lib/CLI11.hpp"
#include "lib/json.hpp"
......@@ -757,7 +758,7 @@ int main (int argc, char **argv)
ostringstream stream_cmdline;
for (int i=0; i < argc; i++) stream_cmdline << argv[i] << " ";
json j = {
SampleOutput *output = new SampleOutput({
{"vidjil_json_version", VIDJIL_JSON_VERSION},
{"samples", {
{"number", 1},
......@@ -766,7 +767,7 @@ int main (int argc, char **argv)
{"producer", {soft_version}},
{"commandline", {stream_cmdline.str()}}
}}
};
});
/////////////////////////////////////////
......@@ -1080,7 +1081,7 @@ int main (int argc, char **argv)
// warn if there are too few segmented sequences
if (ratio_segmented < WARN_PERCENT_SEGMENTED)
{
json_add_warning(j, "W20", "Very few V(D)J recombinations found: " + fixed_string_of_float(ratio_segmented, 2) + "%");
output->add_warning("W20", "Very few V(D)J recombinations found: " + fixed_string_of_float(ratio_segmented, 2) + "%", LEVEL_WARN);
stream_segmentation_info << " ! There are not so many CDR3 windows found in this set of reads." << endl ;
stream_segmentation_info << " ! Please check the unsegmentation causes below and refer to the documentation." << endl ;
}
......@@ -1088,8 +1089,6 @@ int main (int argc, char **argv)
we.out_stats(stream_segmentation_info);
cout << stream_segmentation_info.str();
map <junction, json> json_data_segment ;
//////////////////////////////////
//$$ Sort windows
......@@ -1330,27 +1329,28 @@ int main (int argc, char **argv)
if (verbose)
cout << "KmerSegmenter: " << kseg->getInfoLine() << endl;
json json_clone;
json_clone["sequence"] = kseg->getSequence().sequence;
json_clone["_coverage"] = { repComp.getCoverage() };
json_clone["_average_read_length"] = { windowsStorage->getAverageLength(it->first) };
json_clone["_coverage_info"] = {repComp.getCoverageInfo()};
CloneOutput *clone = new CloneOutput();
output->addClone(it->first, clone);
clone->set("sequence", kseg->getSequence().sequence);
clone->set("_coverage", { repComp.getCoverage() });
clone->set("_average_read_length", { windowsStorage->getAverageLength(it->first) });
clone->set("_coverage_info", {repComp.getCoverageInfo()});
//From KmerMultiSegmenter
json_clone["seg"] = kseg->toJson();
kseg->toOutput(clone);
if (repComp.getQuality().length())
json_clone["seg"]["quality"] = {
clone->set("seg", "quality", {
{"start", 1},
{"stop", kseg->getSequence().sequence.length()},
{"seq", repComp.getQuality()}
};
});
if (repComp.getCoverage() < WARN_COVERAGE)
json_add_warning(json_clone, "W51", "Low coverage: " + fixed_string_of_float(repComp.getCoverage(), 3));
clone->add_warning("W51", "Low coverage: " + fixed_string_of_float(repComp.getCoverage(), 3), LEVEL_WARN);
if (label.length())
json_clone["label"] = label ;
clone->set("label", label) ;
//$$ If max_clones is reached, we stop here but still outputs the representative
......@@ -1360,7 +1360,6 @@ int main (int argc, char **argv)
{
cout << representative << endl ;
out_clones << representative << endl ;
json_data_segment[it->first] = json_clone;
continue;
}
......@@ -1383,14 +1382,7 @@ int main (int argc, char **argv)
out_clone << seg << endl ;
out_clones << seg << endl ;
// From FineSegmenter
if (seg.code.length() > 0)
json_clone["name"] = seg.code;
json json_fseg = seg.toJson();
for (json::iterator it = json_fseg.begin(); it != json_fseg.end(); ++it) {
json_clone["seg"][it.key()] = it.value();
}
seg.toOutput(clone);
if (seg.isSegmented())
{
......@@ -1401,7 +1393,7 @@ int main (int argc, char **argv)
if (cc)
{
cout << " (similar to Clone #" << setfill('0') << setw(WIDTH_NB_CLONES) << cc << setfill(' ') << ")";
json_add_warning(json_clone, "W53", "Similar to another clone " + code,
clone->add_warning("W53", "Similar to another clone " + code,
num_clone <= WARN_NUM_CLONES_SIMILAR ? LEVEL_WARN : LEVEL_INFO);
nb_edges++ ;
......@@ -1426,9 +1418,7 @@ int main (int argc, char **argv)
out_clone << endl;
} // end if (seg.isSegmented())
seg.checkWarnings(json_clone);
json_data_segment[it->first] = json_clone;
seg.checkWarnings(clone);
if (output_sequences_by_cluster) // -a option, output all sequences
{
......@@ -1486,7 +1476,7 @@ int main (int argc, char **argv)
} // end if (command == CMD_CLONES)
//$$ .json output: json_data_segment
//$$ .json output
cout << endl ;
//json custom germline
......@@ -1515,7 +1505,7 @@ int main (int argc, char **argv)
//out_json << json->toString();
windowsStorage->clearSequences();
json jsonSortedWindows = windowsStorage->sortedWindowsToJson(json_data_segment, max_clones_id);
windowsStorage->sortedWindowsToOutput(output, max_clones_id);
json reads_germline;
for (list<Germline*>::const_iterator it = multigermline->germlines.begin(); it != multigermline->germlines.end(); ++it){
......@@ -1524,28 +1514,26 @@ int main (int argc, char **argv)
}
// Complete main json output
j["diversity"] = jsonDiversity ;
j["samples"]["log"] = { stream_segmentation_info.str() } ;
j["reads"] = {
// Complete main output
output->set("diversity", jsonDiversity);
output->set("samples", "log", { stream_segmentation_info.str() }) ;
output->set("reads", {
{"total", {nb_total_reads}},
{"segmented", {nb_segmented}},
{"germline", reads_germline}
} ;
j["clones"] = jsonSortedWindows ;
j["germlines"] = json_germlines ;
j["germlines"]["ref"] = multigermline->ref ;
j["germlines"]["species"] = multigermline->species ;
j["germlines"]["species_taxon_id"] = multigermline->species_taxon_id ;
});
output->set("germlines", json_germlines);
output->set("germlines", "ref", multigermline->ref);
output->set("germlines", "species", multigermline->species) ;
output->set("germlines", "species_taxon_id", multigermline->species_taxon_id) ;
if (epsilon || forced_edges.size()){
j["clusters"] = comp.toJson(clones_windows);
output->set("clusters", comp.toJson(clones_windows));
}
//Added edges in the json output file
if (jsonLevenshteinComputed)
j["similarity"] = jsonLevenshtein;