...
 
Commits (346)
......@@ -21,6 +21,7 @@ stages:
- valgrind_unit
- valgrind_functional
- multiple_tests
- benchmark
- publish_release
- deploy_prod
......@@ -216,14 +217,14 @@ test_browser_unit:
tags:
- web
test_browser-functional:
.browser-functional:
stage: test_functional
retry: 2
script:
- make -C browser
- source /etc/profile.d/rvm.sh
- rvm use 2.6.1
- HEADLESS=1 make -C browser/test functional BROWSERS=--browsers-from-file
- HEADLESS=1 make -C browser/test functional
artifacts:
paths:
- browser/
......@@ -232,10 +233,8 @@ test_browser-functional:
- /^hotfix-.*c.*\/.*$/
- prod-client
- schedules
tags:
- web
test_browser-functional-external:
.browser-functional-external:
stage: test_functional_external
retry: 2
script:
......@@ -251,9 +250,38 @@ test_browser-functional-external:
- /^hotfix-.*c.*\/.*$/
- prod-client
- schedules
test_browser-functional:
extends: .browser-functional
variables:
BROWSERS: "--browsers-from-file"
tags:
- web
test_legacy-browser-functional:
extends: .browser-functional
variables:
WATIR_CHROME: "1"
IGNORE_FUNCTIONAL_CBP: "1"
tags:
- legacy
test_browser-functional-external:
extends: .browser-functional-external
variables:
BROWSERS: "--browsers-from-file"
tags:
- web
test_legacy-browser-functional-external:
extends: .browser-functional-external
variables:
WATIR_CHROME: "1"
IGNORE_FUNCTIONAL_CBP: "1"
tags:
- legacy
code_quality:
stage: test_quality
script: make -C browser quality
......@@ -299,15 +327,14 @@ test_server_functional:
- sed -i '/\/etc\/nginx\/ssl\:\/etc\/nginx\/ssl/d' ./docker/docker-compose.yml
- sed -i 's/\:latest/\:test/g' ./docker/docker-compose.yml
- cd docker/vidjil-server/conf/ && mv defs.py defs_https.py && mv defs_http.py defs.py && cd ../../..
- cd docker/vidjil-client/conf/ && mv conf.js conf_https.js && mv conf_http.js conf.js && cd ../../..
- make germline && cp browser/js/germline.js docker/vidjil-client/conf
- cd docker && docker-compose up -d && cd ..
- sed -i "s/^python\ \.\.\/\.\.\/\.\./docker\ exec\ docker_uwsgi_1\ python\ \/usr\/share\/vidjil\/server\/web2py/" server/web2py/applications/vidjil/tests/init_func_test_db.sh
- sed -i "s/^python\ \.\.\/\.\.\/\.\./docker\ exec\ docker_uwsgi_1\ python\ \/usr\/share\/vidjil\/server\/web2py/" server/web2py/applications/vidjil/tests/init_tests.sh
- docker exec docker_uwsgi_1 sed -i "s/^\(FILE_SOURCE .*\)/FILE_SOURCE = '\/usr\/share\/vidjil\/demo'/" /usr/share/vidjil/server/web2py/applications/vidjil/modules/defs.py
- docker exec docker_nginx_1 make -C /usr/share/vidjil browser
- source /etc/profile.d/rvm.sh
- rvm use 2.6.1
- HEADLESS=1 make functional_server || (cd docker && docker-compose stop; docker stop $(docker ps -aq); docker rm $(docker ps -aq); docker rmi "vidjil/server:test" "vidjil/client:test"; false)
- HEADLESS=1 make functional_server BROWSERS="--browsers-from-file" || (cd docker && docker-compose stop; docker stop $(docker ps -aq); docker rm $(docker ps -aq); docker rmi "vidjil/server:test" "vidjil/client:test"; false)
- cd docker && docker-compose stop
- docker stop $(docker ps -aq)
- docker rm $(docker ps -aq)
......@@ -322,6 +349,20 @@ test_server_functional:
- x86_64
- docker
# Benchmark
benchmark_algo:
stage: benchmark
script:
- cd algo/tests ; python3 benchmark-releases.py -bi
when: manual
only:
- /^feature-.*a.*\/.*$/
tags:
- several-compilers
# Deployment
deploy_review:
......
#include "affectanalyser.h"
#include <algorithm>
#include <unordered_map>
bool operator==(const affect_infos &ai1, const affect_infos &ai2) {
return ai1.first_pos_max == ai2.first_pos_max
......@@ -312,6 +314,33 @@ int KmerAffectAnalyser::last(const KmerAffect &affect) const{
}
pair <KmerAffect, KmerAffect> KmerAffectAnalyser::max12(const set<KmerAffect> forbidden) const {
pair<KmerAffect, int> max_counts[2] = {make_pair(KmerAffect::getUnknown(), -1),
make_pair(KmerAffect::getUnknown(), -1)};
std::unordered_map<KmerAffect, int> counts;
for (KmerAffect affect: affectations) {
if (forbidden.count(affect) == 0) {
if (counts.count(affect) > 0)
counts[affect]++;
else
counts[affect] = 1;
}
}
for (auto it: counts) {
if (it.second > max_counts[1].second) {
if (it.second > max_counts[0].second) {
max_counts[1] = max_counts[0];
max_counts[0] = it;
} else {
max_counts[1] = it;
}
}
}
return make_pair(max_counts[0].first, max_counts[1].first);
}
string KmerAffectAnalyser::toString() const{
string kmer;
for (size_t i = 0; i < affectations.size(); i++) {
......@@ -349,12 +378,8 @@ CountKmerAffectAnalyser::CountKmerAffectAnalyser(IKmerStore<KmerAffect> &kms, co
CountKmerAffectAnalyser::~CountKmerAffectAnalyser() {
set<KmerAffect> affects = this->getDistinctAffectations();
/* Initialize each key with a 0-integer array */
for (set<KmerAffect>::iterator it = affects.begin();
it != affects.end(); it++) {
delete [] counts[*it];
for (auto it : counts) {
delete [] it.second;
}
}
......@@ -390,38 +415,6 @@ KmerAffect CountKmerAffectAnalyser::max(const set<KmerAffect> forbidden) const {
return max_affect;
}
pair <KmerAffect, KmerAffect> CountKmerAffectAnalyser::max12(const set<KmerAffect> forbidden) const {
map<KmerAffect, int* >::const_iterator it = counts.begin();
KmerAffect max1_affect = KmerAffect::getUnknown();
KmerAffect max2_affect = KmerAffect::getUnknown();
int max1_count = -1;
int max2_count = -1;
for (; it != counts.end(); it++) {
if (forbidden.count(it->first) == 0) {
int current_count = count(it->first);
if (current_count > max1_count)
{
max2_affect = max1_affect ;
max2_count = max1_count ;
max1_affect = it->first ;
max1_count = current_count ;
}
else if (current_count > max2_count)
{
max2_affect = it->first;
max2_count = current_count;
}
}
}
return make_pair(max1_affect, max2_affect);
}
int CountKmerAffectAnalyser::countBefore(const KmerAffect&affect, int pos) const {
if (pos == 0 || counts.count(affect) == 0)
return 0;
......
......@@ -121,6 +121,14 @@ class AffectAnalyser {
*/
virtual int last(const KmerAffect &affect) const = 0;
/*
* @return the two affectations that are seen the most frequently in the sequence
* taken apart the forbidden ones.
* @complexity n + m log m where n is the input sequence length and m the number
* of affectations
*/
virtual pair <KmerAffect, KmerAffect> max12(const set<KmerAffect> forbidden) const = 0;
/**
* @return a string representation of the object
*/
......@@ -222,6 +230,8 @@ class KmerAffectAnalyser: public AffectAnalyser {
int last(const KmerAffect &affect) const ;
pair <KmerAffect, KmerAffect> max12(const set<KmerAffect> forbidden) const;
string toString() const;
string toStringValues() const;
......@@ -302,12 +312,6 @@ class CountKmerAffectAnalyser: public KmerAffectAnalyser {
*/
KmerAffect max(const set<KmerAffect> forbidden = set<KmerAffect>()) const;
/*
* @return the two affectations that are seen the most frequently in the sequence
* taken apart the forbidden ones.
*/
pair <KmerAffect, KmerAffect> max12(const set<KmerAffect> forbidden) const;
/**
* Set the overlap allowed between two k-mers with two different affectations,
* when looking for the maximum.
......
......@@ -24,6 +24,7 @@ class AbstractACAutomaton: public IKmerStore<Info> {
protected:
void *initialState;
float all_index_load;
map<Info, size_t> kmers_inserted;
public:
AbstractACAutomaton();
......
......@@ -15,16 +15,16 @@ void AbstractACAutomaton<Info>::finish_building() {
IKmerStore<Info>::finish_building();
build_failure_functions();
}
all_index_load = 0;
for(auto iter: kmers_inserted) {
all_index_load += getIndexLoad(iter.first);
}
}
template<class Info>
float AbstractACAutomaton<Info>::getIndexLoad(Info kmer) const {
float load = 0;
if (kmers_inserted.count(kmer) == 0) {
for(auto iter: kmers_inserted) {
load += getIndexLoad(iter.first);
}
return (kmer.isUnknown()) ? 1 - load : load;
return (kmer.isUnknown()) ? 1 - all_index_load : all_index_load;
} else {
return kmers_inserted.at(kmer) / pow(4.0, kmer.getLength());
}
......
......@@ -126,6 +126,7 @@ BioReader::BioReader(bool virtualfasta, string name)
init(0, "");
this -> name = name;
basename = extract_basename(name);
filenames.push_back(this->name);
}
BioReader::BioReader(int extract_field, string extract_separator, int mark_pos)
......@@ -157,6 +158,7 @@ void BioReader::add(const string &filename, bool verbose) {
name += filename;
basename += extract_basename(filename);
filenames.push_back(name);
if (verbose)
cout << " <== " << filename ;
......
......@@ -189,6 +189,7 @@ public:
string name;
string basename;
list<string> filenames;
int size() const;
size_t totalSize() const;
......
......@@ -174,6 +174,17 @@ bool operator>=(const KmerAffect &a1, const KmerAffect &a2);
bool operator!=(const KmerAffect &a1, const KmerAffect &a2);
ostream &operator<<(ostream &os, const KmerAffect &kmer);
namespace std {
template <>
struct hash<KmerAffect> {
size_t operator()(const KmerAffect &affect) const {
return (affect.getLabel()[0] << 8) | affect.getLength();
}
};
}
#define AFFECT_NOT_UNKNOWN_SYMBOL "*"
#define AFFECT_AMBIGUOUS_SYMBOL "\0"
#define AFFECT_UNKNOWN_SYMBOL "\1"
......
......@@ -15,6 +15,7 @@ using namespace std;
typedef
enum { KMER_INDEX, AC_AUTOMATON } IndexTypes;
#define MAX_PRECOMPUTED_PROBA 500 /* Precompute 500 probabilities for each index load */
class Kmer {
public:
unsigned int count;
......@@ -74,6 +75,8 @@ protected:
size_t nb_kmers_inserted;
size_t max_size_indexing;
bool finished_building;
map<float, vector<double> > precomputed_proba_with_system,
precomputed_proba_without_system;
public:
......@@ -149,7 +152,7 @@ public:
/**
* @return probability that the number of kmers is 'at_least' or more in a sequence of length 'length'
*/
double getProbabilityAtLeastOrAbove(T kmer, int at_least, int length) const;
double getProbabilityAtLeastOrAbove(T kmer, int at_least, int length);
/**
* @return the value of k
......@@ -199,6 +202,10 @@ public:
* @pre word.length() == this->k
*/
virtual T& operator[](seqtype& word) = 0;
protected:
void precompute_proba(float index_load);
};
template<class T>
......@@ -366,17 +373,45 @@ int IKmerStore<T>::atMostMaxSizeIndexing(int n) const {
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) const {
void IKmerStore<T>::precompute_proba(float index_load) {
precomputed_proba_with_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
precomputed_proba_without_system[index_load] = vector<double>(MAX_PRECOMPUTED_PROBA);
vector<double> &pproba_with = precomputed_proba_with_system[index_load];
vector<double> &pproba_without = precomputed_proba_without_system[index_load];
pproba_with[0] = 1;
pproba_without[0] = 1;
for (int i = 1; i < MAX_PRECOMPUTED_PROBA; i++) {
pproba_with[i] = pproba_with[i - 1] * index_load;
pproba_without[i] = pproba_without[i - 1] * (1 - index_load);
}
}
template<class T>
double IKmerStore<T>::getProbabilityAtLeastOrAbove(const T kmer, int at_least, int length) {
if (at_least == 0) return 1.0; // even if 'length' is very small
// n: number of kmers in the sequence
int n = length - getS() + 1;
float index_load = getIndexLoad(kmer) ;
if (! precomputed_proba_without_system.count(index_load)) {
precompute_proba(index_load);
}
double proba = 0;
double probability_having_system = pow(index_load, at_least);
double probability_not_having_system = pow(1 - index_load, n - at_least);
double probability_not_having_system;
double probability_having_system;
if (precomputed_proba_with_system.at(index_load).size() > (size_t)at_least)
probability_having_system = precomputed_proba_with_system.at(index_load)[at_least];
else
probability_having_system = pow(index_load, at_least);
if (precomputed_proba_without_system.at(index_load).size() > (size_t)n - at_least)
probability_not_having_system = precomputed_proba_without_system.at(index_load)[n-at_least];
else
probability_not_having_system = pow(1 - index_load, n - at_least);
for (int i=at_least; i<=n; i++) {
proba += nChoosek(n, i) * probability_having_system * probability_not_having_system;
probability_having_system *= index_load;
......
......@@ -4,6 +4,35 @@
#include <cstdlib>
#include "tools.h"
#include "lib/json.hpp"
using nlohmann::json;
json load_into_map_from_json(map <string, string> &the_map, string json_file)
{
if (!json_file.size())
return {};
cout << " <== " << json_file << endl ;
std::ifstream json_file_stream(json_file);
json j;
json_file_stream >> j;
json jj = j["config"]["labels"] ;
int n = 0;
for(json::iterator label = jj.begin(); label != jj.end(); ++label) {
string name = (*label)["name"].get<std::string>();
string sequence = (*label)["sequence"].get<std::string>();
the_map[sequence] = name;
n++ ;
}
cout << " ==> " << n << " labels" << endl;
return jj;
}
void load_into_map(map <string, string> &the_map, string map_file, string default_value)
{
......
......@@ -7,4 +7,4 @@
#include "bioreader.hpp"
void load_into_map(map <string, string> &the_map, string map_file, string default_value);
json load_into_map_from_json(map <string, string> &the_map, string json_file);
......@@ -14,7 +14,7 @@ using namespace std;
#define THRESHOLD_BAD_COVERAGE .5 /* Threshold below which the representatie
coverage is considered bad */
static ReadQualityScore DEFAULT_READ_SCORE;
static RandomScore DEFAULT_READ_SCORE;
/**
* Compute a representative sequence from a list of sequences.
......
......@@ -91,11 +91,14 @@ void AlignBox::addToOutput(CloneOutput *clone, int alternative_genes) {
clone->setSeg(key, j) ;
/*Export the N best genes if threshold parameter is specified*/
if(rep && !this->score.empty() && rep->size() <= (int)this->score.size() && alternative_genes > 0 && alternative_genes <= (int)this->score.size()){
if(rep && !this->score.empty() && rep->size() <= (int)this->score.size() && alternative_genes > 0){
json jalt = json::array();
for(int i = 0; i < alternative_genes;++i){
int r = this->score[i].second;
jalt.push_back(json::object({{"name",rep->label(r)}}));
int last_score = this->score[0].first;
for(int i = 0; i < (int)this->score.size() &&
(i < alternative_genes || last_score == this->score[i].first);++i){
int r = this->score[i].second;
jalt.push_back(json::object({{"name",rep->label(r)}}));
last_score = this->score[i].first;
}
clone->setSeg(key + "alt", jalt);
}
......@@ -439,7 +442,6 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
info_extra = "seed";
segmented = false;
segmented_germline = germline ;
system = germline->code; // useful ?
reversed = false;
because = NOT_PROCESSED ; // Cause of unsegmentation
score = 0 ;
......@@ -521,7 +523,7 @@ KmerSegmenter::KmerSegmenter(Sequence seq, Germline *germline, double threshold,
|| (germline->seg_method == SEG_METHOD_MAX1U))
{ // Pseudo-germline, MAX12 and MAX1U
pair <KmerAffect, KmerAffect> max12 ;
CountKmerAffectAnalyser ckaa(*(germline->index), sequence);
KmerAffectAnalyser ckaa = *kaa;
set<KmerAffect> forbidden;
......@@ -1402,10 +1404,6 @@ void FineSegmenter::toOutput(CloneOutput *clone){
});
}
}
else // not segmented
{
clone->set("name", label);
}
}
json toJsonSegVal(string s) {
......
......@@ -135,7 +135,7 @@ class AlignBox
/* Marked position, for Cys104 and Phe118/Trp118 */
int marked_pos;
/* Identifiers and scores of other possible reference sequence */
/* Scores and identifiers of other possible reference sequence */
vector<pair<int, int> > score;
};
......
......@@ -82,7 +82,7 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ;
if (seg->isJunctionChanged())
stats[SEG_CHANGED_WINDOW].insert(read_length);
stats_reads[seg->system].addScore(read_length);
stats_reads[seg->segmented_germline->code].addScore(read_length);
if (out_segmented) {
*out_segmented << *seg ; // KmerSegmenter output (V/N/J)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
// From CLI11 examples
#include "CLI11.hpp"
#include "json.hpp"
// This example is only built on GCC 7 on Travis due to mismatch in stdlib
// for clang (CLI11 is forgiving about mismatches, json.hpp is not)
using nlohmann::json;
class ConfigJSON : public CLI::Config {
public:
std::string to_config(const CLI::App *app, bool default_also, bool, std::string) const override {
json j;
for(const CLI::Option *opt : app->get_options({})) {
// Only process option with a long-name and configurable
if(!opt->get_lnames().empty() && opt->get_configurable()) {
std::string name = opt->get_lnames()[0];
// Non-flags
if(opt->get_type_size() != 0) {
// If the option was found on command line
if(opt->count() == 1)
j[name] = opt->results().at(0);
else if(opt->count() > 1)
j[name] = opt->results();
// If the option has a default and is requested by optional argument
else if(default_also && !opt->get_defaultval().empty())
j[name] = opt->get_defaultval();
// Flag, one passed
} else if(opt->count() == 1) {
j[name] = true;
// Flag, multiple passed
} else if(opt->count() > 1) {
j[name] = opt->count();
// Flag, not present
} else if(opt->count() == 0 && default_also) {
j[name] = false;
}
}
}
for(const CLI::App *subcom : app->get_subcommands({}))
j[subcom->get_name()] = json(to_config(subcom, default_also, false, ""));
return j.dump(4);
}
std::vector<CLI::ConfigItem> from_config(std::istream &input) const override {
json j;
input >> j;
return _from_config(j["config"]);
}
std::vector<CLI::ConfigItem>
_from_config(json j, std::string name = "", std::vector<std::string> prefix = {}) const {
std::vector<CLI::ConfigItem> results;
if(j.is_object()) {
for(json::iterator item = j.begin(); item != j.end(); ++item) {
auto copy_prefix = prefix;
if(!name.empty())
copy_prefix.push_back(name);
auto sub_results = _from_config(*item, item.key(), copy_prefix);
results.insert(results.end(), sub_results.begin(), sub_results.end());
}
} else if(!name.empty()) {
results.emplace_back();
CLI::ConfigItem &res = results.back();
res.name = name;
res.parents = prefix;
if(j.is_boolean()) {
res.inputs = {j.get<bool>() ? "true" : "false"};
} else if(j.is_number()) {
std::stringstream ss;
ss << j.get<double>();
res.inputs = {ss.str()};
} else if(j.is_string()) {
res.inputs = {j.get<std::string>()};
} else if(j.is_array()) {
for(std::string ival : j)
res.inputs.push_back(ival);
} else {
throw CLI::ConversionError("Failed to convert " + name);
}
} else {
throw CLI::ConversionError("You must make all top level values objects in json!");
}
return results;
}
};
int testCLI11_json(int argc, char **argv) {
CLI::App app;
app.config_formatter(std::make_shared<ConfigJSON>());
int item;
app.add_flag("--simple");
app.add_option("--item", item);
app.set_config("--config");
CLI11_PARSE(app, argc, argv);
std::cout << app.config_to_str(true, true) << std::endl;
return 0;
}
class ConfigJSON : public CLI::Config {
std::string to_config(const CLI::App *app, bool default_also, bool, std::string) const override ;
std::vector<CLI::ConfigItem> from_config(std::istream &input) const override ;
} ;
......@@ -11,8 +11,8 @@ EXEC=$(SRC:.cpp=)
OBJ=$(SRC:.cpp=.o)
OTHER_SRC=$(wildcard unit-tests/*.cpp)
LIB=../core/vidjil.a ../lib/lib.a
SHOULD=$(wildcard should-get-tests/*.should-get) $(wildcard bugs/*.should-get)
SHOULD_LOG=$(SHOULD:.should-get=.tap)
SHOULD=$(wildcard should-get-tests/*.should) $(wildcard bugs/*.should)
SHOULD_LOG=$(SHOULD:.should=.tap)
SHOULD_VDJ=$(wildcard should-vdj-tests/*.should-vdj.fa)
SHOULD_VDJ_VDJ=$(SHOULD_VDJ:.should-vdj.fa=.1.vdj)
SHOULD_LOCUS=$(wildcard should-vdj-tests/*.should-locus.fa)
......
ARCHIVE = 'http://www.vidjil.org/releases/'
DEST = 'bench/'
SRC = DEST + 'src/'
BIN = DEST + 'bin/'
RUN = DEST + 'run/'
#####
LIMIT1e5 = '-x 100000 '
LIMIT1e4 = '-x 10000 '
LIMIT1e3 = '-x 1000 '
LIMIT1e2 = '-x 100 '
MULTI = '-g ../../germline/homo-sapiens.g '
IGH = '-g ../../germline/homo-sapiens.g:IGH '
L4 = '../../demo/LIL-L4.fastq.gz '
S22 = '../../demo/Stanford_S22.fasta '
CONSENSUS_NO = '-y 0 -z 0 '
CONSENSUS_ALL = '-y all -z 0 '
DESIGNATIONS = '-c designations '
BENCHS = {
'init': '-x 1 ' + MULTI + L4 + CONSENSUS_NO,
'germ': LIMIT1e5 + MULTI + L4 + '-c germlines ',
'multi-0': LIMIT1e5 + MULTI + L4 + CONSENSUS_NO,
'multi-1': LIMIT1e5 + MULTI + L4 + CONSENSUS_ALL,
'multi-a': LIMIT1e3 + MULTI + L4 + DESIGNATIONS + '-z 1000',
'igh-0': LIMIT1e5 + IGH + S22 + CONSENSUS_NO,
'igh-1': LIMIT1e5 + IGH + S22 + CONSENSUS_ALL,
'igh-a': LIMIT1e3 + IGH + S22 + DESIGNATIONS,
}
COMPATIBILITY = [
('2019.03', '-c designations', '-c segment'),
]
def convert(cmd, release):
'''
Convert a command line to be used by old vidjil-algo releases
>>> convert('-x 10 -c designations', '2019.05')
'-x 10 -c designations'
>>> convert('-x 10 -c designations', '2018.02')
'-x 10 -c segment'
'''
for rel, new, old in COMPATIBILITY:
if release < rel:
cmd = cmd.replace(new, old)
return cmd
#####
import re
import urllib.request
import os
import subprocess
import glob
import time
import sys
import argparse
import resource
stats = {}
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--install', action='store_true', help='install various releases')
parser.add_argument('-b', '--benchmark', action='store_true', help='benchmark installed releases')
def go(cmd, log=None):
if log:
flog = open(log, 'a')
flog.write('\n\n%s\n' % cmd)
else:
flog = sys.stdout
print(cmd, end=' ')
start = resource.getrusage(resource.RUSAGE_CHILDREN)
completed = subprocess.run(cmd, shell=True, stderr=subprocess.STDOUT, stdout=flog)
end = resource.getrusage(resource.RUSAGE_CHILDREN)
if log:
flog.close()
if completed.returncode:
print('FAILED', end=' ')
stime = end.ru_stime-start.ru_stime
utime = end.ru_utime-start.ru_utime
print('%5.2fu %5.2fs' % (utime, stime))
completed.check_returncode()
return stime + utime
def code(tgz):
'''
Extract release tag from filename
>>> code('vidjil-algo-2001.01.tar.gz')
'2001.01'
'''
base = tgz.replace('.tgz', '').replace('.tar.gz', '').replace('vidjil-algo-', '').replace('vidjil-', '')
return base
def get_releases():
with urllib.request.urlopen(ARCHIVE) as response:
for elt in str(response.read()).split('"'):
ok = True
for ignore in ['<', '>', 'x86', 'latest']:
if ignore in elt:
ok = False
break
if ok and 'vidjil-' in elt:
yield code(elt), elt
def install(release, tgz):
os.system('mkdir -p %s' % BIN)
print('== %s' % release)
dir = SRC + release
go('mkdir -p %s' % dir)
log = dir + '/' + 'install.log'
go('wget %s/%s -O %s/src.tgz' % (ARCHIVE, tgz, dir), log)
go('cd %s ; tar xfz src.tgz' % dir, log)
go('cd %s/*%s* ; make vidjil-algo || make CXX=g++-6' % (dir, release), log)
res = go('cp %s/*%s*/vidjil* %s/%s ' % (dir, release, BIN, release), log)
print()
def install_all():
for release, tgz in get_releases():
try:
install(release, tgz)
except subprocess.CalledProcessError:
print("FAILED")
def installed():
return sorted([f.replace(BIN, '') for f in glob.glob('%s/*' % BIN)])
def run_all(tag, args):
print('==== %s ==== %s' % (tag, args))
os.system('mkdir -p %s' % RUN)
for release in installed():
print('%9s' % release, end=' ')
log = RUN + '/%s-%s.log' % (tag, release)
cmd = '%s/%s ' % (BIN, release) + convert(args, release)
try:
bench = go(cmd, log)
stats[tag,release] = bench
except subprocess.CalledProcessError:
stats[tag,release] = None
print()
def show_benchs(f):
for tag, bench in BENCHS.items():
f.write('%8s: %s\n' % (tag, bench))
f.write('%9s ' % '')
for tag in BENCHS:
f.write('%8s' % tag)
f.write('\n\n')
for release in installed():
f.write('%-9s' % release)
for tag in BENCHS:
if (tag,release) in stats:
if stats[tag, release] is not None:
b = '%8.2f' % stats[tag,release]
else:
b = '%8s' % 'x'
else:
b = '%8s' % '-'
f.write(b)
f.write('\n')
def bench_all():
try:
for tag, bench in BENCHS.items():
run_all(tag, bench)
except KeyboardInterrupt:
pass
if __name__ == '__main__':
args = parser.parse_args(sys.argv[1:])
if not args.install and not args.benchmark:
parser.print_help()
if args.install:
install_all()
if args.benchmark:
bench_all()
show_benchs(sys.stdout)
\ No newline at end of file
{
"config": {
"first-reads": 10,
"min-reads": "1",
"max-consensus": "1",
"cdr3": true
}
}
{
"config": {
"labels": [
{ "name": "lab1", "sequence": "CGAGAGTGGGCAGCAGCTGG", "foo": 42 },
{ "name": "lab2", "sequence": "GAAGGGCTACTATGGTTCGGG", "bar": 17}
]
}
}
# seq1, 2, 3, 4 are identical
# One insertion has been added both in seq4 and seq5 (different ones) in uppercase.
>seq1
ggctggagtgggtttcatacattagtagtaatagtggtgccatatactacgcagactctgtgaagggccgattcaccatc
tccagaaacaatgccaaggactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgc
gagagcgatcccccggtattactatgatactagtggcccaaacgactactggggccagggaaccctggtcaccgtctcct
cag
>seq2
ggctggagtgggtttcatacattagtagtaatagtggtgccatatactacgcagactctgtgaagggccgattcaccatc
tccagaaacaatgccaaggactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgc
gagagcgatcccccggtattactatgatactagtggcccaaacgactactggggccagggaaccctggtcaccgtctcct
cag
>seq3
ggctggagtgggtttcatacattagtagtaatagtggtgccatatactacgcagactctgtgaagggccgattcaccatc
tccagaaacaatgccaaggactcactgtatctgcaaatgaacagcctgagagccgaggacacggctgtgtattactgtgc
gagagcgatcccccggtattactatgatactagtggcccaaacgactactggggccagggaaccctggtcaccgtctcct
cag
>seq4
ggctggagtgggtttcatacattagtagtaatagtggtgccatatactacgcagactctgtgaagggccgattcaccatc
tccagaaacaatgccaaggactcactgtatctgcaaatgAaacagcctgagagccgaggacacggctgtgtattactgtgc
gagagcgatcccccggtattactatgatactagtggcccaaacgactactggggccagggaaccctggtcaccgtctcct
cag
>seq5
ggctggagtgggtttcatacattagtagtaatagtggtgccatatactacgcagactctgtgaagggccgattcaccatc
tccagaaacaatgccaaggactcactgtatctgcaaatgaacagcctgagagAccgaggacacggctgtgtattactgtgc
gagagcgatcccccggtattactatgatactagtggcccaaacgactactggggccagggaaccctggtcaccgtctcct
cag
......@@ -3,7 +3,7 @@
"samples" : {
"number" : 1,
"original_names" : [ "/some/file" ] ,
"original_names" : [ "/some/file_1" ] ,
"run_timestamp" : [ "2015-02-19 16:37:06" ] ,
"producer" : [ "vidjil dev 0cf35de (2015-02-17)" ] ,
"log" : [ "Some log" ],
......
......@@ -3,7 +3,7 @@
"samples" : {
"number" : 1,
"original_names" : [ "/some/file" ] ,
"original_names" : [ "/some/file_2" ] ,
"run_timestamp" : [ "2015-02-19 16:37:06" ] ,
"producer" : [ "vidjil dev 0cf35de (2015-02-17)" ] ,
"log" : [ "Some log" ],
......@@ -27,6 +27,7 @@
"sequence" : "seq-1",
"reads" : [ 300 ] ,
"normalized_reads" : [ 500 ] ,
"top" : 1,
"germline" : "IGH"
},
......
This diff is collapsed.
>unexpected_name
TGTTTGCAGGTGTCCAGTGTCAGGTGAAACTGGTGGAGTCTGGGGGAGGCGTGGTCCGCCCTGGGAGGTCCCTGAGACTCTCCTGTGCCGACCTCCAGGACAGAGGACGCTGGGCCCAGAGAGACGAGGCAGAAAGAACCCCTCGTTCTTGAAGAGACGGTGACCATCGTCCCTTGGCCGCAGATATCGAAAGCATCATTTCCCCTGTCACAGTAGTCGATTCTACTAAAAGAGTCGCGGCCCCTAGTTTGACTACTGGGGCCAGAGAACCCTGGTCACCGTCTCCTCAGGTAAG
\ No newline at end of file
......@@ -2,12 +2,13 @@
Parses output of various RepSeq programs.
Takes either:
- a .fa file, a _Summary.txt file as produced by IMGT/V-QUEST
- or a results file produced by MiXCR
- or a results file produced by MiXCR or IgReC
and creates a .vdj file to be checked by should-vdj-to-tap.py
python repseq_vdj.py data-curated/curated_IG.fa data-curated/curated_ig_Summary.txt > data-curated/imgt-IG.vdj
python repsep_vdj.py data-curated/curated_TR.fa data-curated/curated_tr_Summary.txt > data-curated/imgt-TR.vdj
python repseq_vdj.py data-curated/mixcr.results > data-curated/mixcr.vdj
python repseq_vdj.py bla.igrec.results
python repseq_vdj.py data-curated/curated_IG.fa data-curated/igblast/IG/*.aln > data-curated/igblast-IG.vdj > data-curated/igblast-IG.vdj
python repseq_vdj.py data-curated/curated_TR.fa data-curated/igblast/TR/*.aln > data-curated/igblast-TR.vdj > data-curated/igblast-TR.vdj
'''
......@@ -91,6 +92,9 @@ class Result(VDJ_Formatter):
self.populate()
def __contains__ (self, key):
return key in self.d
def __getitem__(self, key):
return self.d[key]
......@@ -98,6 +102,49 @@ class Result(VDJ_Formatter):
return str(self.d)
### IgReC
IGREC_LABELS = [
'Read id', 'locus',
'V id', 'V start', 'V end', 'V score',
'J id', 'J start', 'J end', 'J score',
]
class IgReC_Result(Result):
r'''
>>> lig = '\t'.join(['blabli4577', 'TRB', 'TRBV13*02', '1', '164', '0.58156', 'TRBJ1-5*01', '319', '367', '0.94'])
>>> r = IgReC_Result(lig)
>>> r['Read id']
'blabli4577'
>>> r.vdj[V]
['TRBV13*02']
>>> r.vdj[J]
['TRBJ1-5*01']
'''
def parse(self, l):
self.labels = IGREC_LABELS
if ('\t' in l.strip()):
return l
else:
return None
def populate(self):
self.vdj[V] = [self['V id']]
self.vdj[J] = [self['J id']]
def header_igrec_results(ff_igrec):
f = open(ff_igrec).__iter__()
while True:
l = f.next()
result = IgReC_Result(l)
yield result['Read id'].replace('_', ' '), result.to_vdj()
### MiXCR
......@@ -111,16 +158,20 @@ class MiXCR_Result(Result):
return None
def populate(self):
self.vdj[V] = [self['Best V hit']]
if self['Best D hit']:
self.vdj[D] = [self['Best D hit']]
self.vdj[J] = [self['Best J hit']]
self.vdj[V] = [self['bestVHit']]
if self['bestDHit']:
self.vdj[D] = [self['bestDHit']]
self.vdj[J] = [self['bestJHit']]
self.vdj[N1] = self['N. Seq. VDJunction']
self.vdj[N2] = self['N. Seq. DJJunction']
self.vdj[N] = self['N. Seq. VJJunction']
if 'nSeqVDJunction' in self:
self.vdj[N1] = self['nSeqVDJunction']
if 'nSeqDJJunction' in self:
self.vdj[N2] = self['nSeqDJJunction']
if 'nSeqVJJunction' in self:
self.vdj[N] = self['nSeqVJJunction']
self.vdj[JUNCTION] = self['AA. Seq. CDR3']
if 'aaSeqCDR3' in self:
self.vdj[JUNCTION] = self['aaSeqCDR3']
def header_mixcr_results(ff_mixcr):
......@@ -128,12 +179,12 @@ def header_mixcr_results(ff_mixcr):
f = open(ff_mixcr).__iter__()
mixcr_first_line = f.next()
globals()['mixcr_labels'] = mixcr_first_line.split('\t')
globals()['mixcr_labels'] = mixcr_first_line.rstrip().split('\t')
while True:
l = f.next()
l = f.next().rstrip()
result = MiXCR_Result(l)
yield result['Description R1'], result.to_vdj()
yield result['descrsR1'], result.to_vdj()
......@@ -354,6 +405,8 @@ if __name__ == '__main__':
if 'mixcr' in sys.argv[1]:
vdj.parse_from_gen(header_mixcr_results(sys.argv[1]))
elif 'igrec' in sys.argv[1]:
vdj.parse_from_gen(header_igrec_results(sys.argv[1]))
elif 'igblast' in sys.argv[2]:
vdj.parse_from_gen(header_igblast_results(sys.argv[1], sys.argv[2:]))
else:
......
......@@ -5,5 +5,5 @@ $ Compilation was run with -Wall
>30: -Wall
$ No compilation warning
f0:warning
a0:warning
!LAUNCH: $VIDJIL_DIR/$EXEC -c designations -g $VIDJIL_DIR/germline/homo-sapiens.g $VIDJIL_DATA/3344-bad-filtering.fa
$ Check that proper filtering is used
1: IGHV4-31.02
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c designations -g $VIDJIL_DIR/germline/homo-sapiens.g $VIDJIL_DATA/3344-bad-filtering.fa
$ Check that proper filtering is used
1: IGHV4-31.02
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c clones -z 2 -3 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta > /dev/null ; cat out/Stanford_S22.tsv
!LAUNCH: $VIDJIL_DIR/$EXEC -c clones -z 2 -3 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta > /dev/null ; cat out/Stanford_S22.tsv
$ There are four lines, all with tabs
4:
......@@ -43,8 +43,8 @@ $ Junction of the first clone appears once, but CDR3 twice (it is also included
1:CTREEQYSSWYFDFW
w2:TREEQYSSWYFDF
$ The first clone has three warnings
1:W51 W69 W69
$ The first clone has one warning
1:TATTACTGTACCCGGGAGGAACAATATAGCAGCTGGTACTTTGACTTCTG .* W69
$ No spurious character
0:"
......
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -c clones -z 2 -2 -3 -r 1 -g $VIDJIL_DIR/germline/homo-sapiens.g ../should-vdj-tests/Demo-X5.should-vdj.fa > /dev/null ; cat out/Demo-X5.should-vdj.tsv
!LAUNCH: $VIDJIL_DIR/$EXEC -c clones -z 2 -2 -3 -r 1 -g $VIDJIL_DIR/germline/homo-sapiens.g ../should-vdj-tests/Demo-X5.should-vdj.fa > /dev/null ; cat out/Demo-X5.should-vdj.tsv
$ There are 15 = 1 + 14 lines, all with tabs
15:
......
!REQUIRES: python $VIDJIL_DIR/tools/check_python_version.py
!LAUNCH: $VIDJIL_DIR/$EXEC $VIDJIL_DEFAULT_OPTIONS -r 1 -z 0 -w 60 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta ; python $VIDJIL_DIR/tools/fuse.py out/Stanford_S22.vidjil out/Stanford_S22.vidjil -o out/fused.data ; cat out/fused.data | python $VIDJIL_DIR/tools/format_json.py -1
!LAUNCH: $VIDJIL_DIR/$EXEC -r 1 -z 0 -w 60 -g $VIDJIL_DIR/germline/homo-sapiens.g:IGH $VIDJIL_DATA/Stanford_S22.fasta ; python $VIDJIL_DIR/tools/fuse.py out/Stanford_S22.vidjil out/Stanford_S22.vidjil -o out/fused.data ; cat out/fused.data | python $VIDJIL_DIR/tools/format_json.py -1
$ Points list
1:"original_names": \[".*data//Stanford_S22.fasta", ".*data//Stanford_S22.fasta"\]
......