Commit 482d12db authored by flothoni's avatar flothoni
Browse files

Merge branch 'feature-c/4034-genescan-cluster' of...

Merge branch 'feature-c/4034-genescan-cluster' of gitlab.inria.fr:vidjil/vidjil into feature-c/4034-genescan-cluster
parents 7674dc78 aa73f4e8
Pipeline #149754 failed with stages
in 5 seconds
......@@ -56,6 +56,7 @@ test_germlines:
include:
- local: '/doc/.gitlab-ci.yml'
- local: 'algo/.gitlab-ci-compilers.yml' # Stage multiple_tests
- local: 'algo/.gitlab-ci.yml' # Vidjil-algo pipelines
# Algorithm
......@@ -372,12 +373,13 @@ ff45-server-functional:
benchmark_algo:
image: gcc:6.3
before_script:
- apt-get update
- apt-get install -y time valgrind python3 wget tar
extends: .install-algo-dependencies
stage: benchmark
script:
- cd algo/tests ; python3 benchmark-releases.py -bic
- cd algo/tests ; python3 benchmark-releases.py -r 3 -bIc
artifacts:
paths:
- algo/tests/benchmark.log
when: manual
only:
- /^feature-.*a.*\/.*$/
......
## Preparation
* [ ] New features are described in `doc/vidjil-algo.md`
* [ ] Breaking changes (or needed) are understood
On the `feature-a/release` branch, the last commit is the release commit updating the three following files:
* [ ] CHANGELOG
* [ ] `doc/vidjil-algo.md` with the proper release tag
* [ ] `algo/release` with the proper release tag
## Pipelines
https://gitlab.inria.fr/vidjil/vidjil/pipelines/XXXXX
Usual tests should pass, but also:
* [ ] prepare_release
* [ ] valgrind_unit
* [ ] valgrind_functional
* [ ] multiple_tests
Benchmarks
* [ ] almost no change...
* [ ] ... or significant changes are understood
## Tag and push
* [ ] tag (`git tag release-20XX-XX; git push origin release-20XX-XX`)
* [ ] mirror to GH
## Deploy
* [ ] doc
* [ ] `app` (vidjil-algo-next)
* [ ] `app` if needed, update server configs
* [ ] `app` notification to users
* [ ] `app` (vidjil-algo)
* [ ] `app` test on production (X5 and L4)
* [ ] `hds` queue for qualification ?
* [ ] twice a year, communicate to users
/label ~cpp
# Hidden template job (leading dot): other jobs pull in this before_script
# through `extends: .install-algo-dependencies`.
.install-algo-dependencies:
before_script:
- apt-get update
- apt-get install -y time valgrind python3 wget tar
.testing-compilers:
extends: .install-algo-dependencies
stage: multiple_tests
tags:
- cidocker
before_script:
- apt-get update
- apt-get install -y time valgrind python3 wget tar
script:
- $CXX --version
- make demo data germline
......
# Manual job of the `benchmark` stage: profiles vidjil-algo (CPU and heap)
# on demo/LIL-L4.fastq.gz and publishes the profiles as artifacts.
profiling_algo:
stage: benchmark
image: gcc:9
before_script:
- apt-get update
# google-perftools provides the profiler libraries and google-pprof; graphviz is installed for the PDF reports
- apt-get install -y wget python3 tar libgoogle-perftools4 libgoogle-perftools-dev google-perftools graphviz
script:
- make demo data germline
# Build with debug symbols so that pprof can report per-line figures (--lines)
- make DEBUG="-g"
# CPU profile: preload libprofiler.so, the profile is written to the file named by CPUPROFILE
- LIB_PROFILE=$(find /usr/lib -name libprofiler.so)
- CPUPROFILE=vidjil.cpu LD_PRELOAD="$LIB_PROFILE" ./vidjil-algo -g germline -r 1 demo/LIL-L4.fastq.gz
# Heap profile: preload libtcmalloc.so, profiles are written with the HEAPPROFILE prefix (matched below as vidjil.mem.*)
- LIB_MALLOC=$(find /usr/lib -name libtcmalloc.so)
- HEAPPROFILE=vidjil.mem LD_PRELOAD="$LIB_MALLOC" ./vidjil-algo -g germline -r 1 demo/LIL-L4.fastq.gz
# Text reports in the job log, CPU first, then memory after the separator
- google-pprof --lines --text vidjil-algo vidjil.cpu
- echo "###########################################"
- google-pprof --lines --text vidjil-algo vidjil.mem.*
# PDF reports, kept as artifacts
- google-pprof --lines --pdf vidjil-algo vidjil.cpu > vidjil-cpu.pdf
- google-pprof --lines --pdf vidjil-algo vidjil.mem.* > vidjil-mem.pdf
artifacts:
paths:
- vidjil-*.pdf
- vidjil.cpu
- vidjil.mem.*
# Only runs when triggered by hand
when: manual
tags:
- cidocker
......@@ -457,6 +457,20 @@ void json_add_warning(json &clone, string code, string msg, string level)
clone["warn"] += { {"code", code}, {"level", level}, {"msg", msg} } ;
}
/* Signal handling */

/*
  Flag raised by sigintHandler() when a SIGINT is received.
  Long-running loops reset it to false, install the handler, and then
  poll the flag to stop cleanly.
  NOTE(review): the C++ standard only guarantees handler writes to
  'volatile std::sig_atomic_t' or lock-free atomics; switching the type
  would also require updating the 'extern bool' declaration.
*/
bool global_interrupted;

/*
  SIGINT handler: re-installs itself for the next SIGINT, then records
  the interruption in global_interrupted.
  The signal number is not used, hence the unnamed parameter.
*/
void sigintHandler(int /* sig_num */)
{
  signal(SIGINT, sigintHandler);
  global_interrupted = true;
}
/*
Return the part of label before the star
For example:
......
......@@ -41,6 +41,7 @@ typedef string junction ;
#include <iomanip>
#include <string>
#include <cassert>
#include <signal.h>
#include <vector>
#include "bioreader.hpp"
#include "../lib/gzstream.h"
......@@ -96,6 +97,13 @@ inline int spaced_int(int *input, const string &seed) {
}
/* Signal handling */
extern bool global_interrupted;
void sigintHandler(int sig_num);
/*
Extract the gene name from a label. This takes the whole part
before the star and returns it. If there is no star in the
......
......@@ -18,7 +18,8 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
map<string, string> &windows_labels, bool only_labeled_windows,
bool keep_unsegmented_as_clone,
double nb_expected, int nb_reads_for_evalue,
VirtualReadScore *scorer) {
VirtualReadScore *scorer,
SampleOutput *output) {
init_stats();
WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
......@@ -27,8 +28,19 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
unsigned long long int bp_total = 0;
global_interrupted = false ;
signal(SIGINT, sigintHandler);
while (reads->hasNext()) {
if (global_interrupted)
{
string msg = "Interrupted after processing " + string_of_int(nb_reads) + " reads" ;
if (output) output->add_warning("W09", msg, LEVEL_WARN);
cout << WARNING_STRING << msg << endl ;
break;
}
try {
reads->next();
}
......@@ -119,6 +131,7 @@ WindowsStorage *WindowExtractor::extract(OnlineBioReader *reads,
cout.flush() ;
}
}
signal(SIGINT, SIG_DFL);
cout << endl ;
......
......@@ -12,6 +12,7 @@
#include "read_storage.h"
#include "bioreader.hpp"
#include "read_score.h"
#include "output.h"
#define NB_BINS_CLONES 10
#define MAX_VALUE_BINS_CLONES 1000
......@@ -53,6 +54,7 @@ class WindowExtractor {
* @param nb_expected: maximal e-value of the segmentation
* @param nb_reads_for_evalue: number of reads, used for e-value computation. Can be approximate or faked.
* @param scorer: how reads are scored (only the best ones are kept for large clones)
* @param output: global output, used here for warnings
* @return a pointer to a WindowsStorage that will contain all the windows.
* It is a pointer so that the WindowsStorage is not duplicated.
* @post Statistics on segmentation will be provided through the getSegmentationStats() methods
......@@ -63,7 +65,8 @@ class WindowExtractor {
map<string, string> &windows_labels, bool only_labeled_windows=false,
bool keep_unsegmented_as_clone=false,
double nb_expected = THRESHOLD_NB_EXPECTED, int nb_reads_for_evalue = 1,
VirtualReadScore *scorer = &DEFAULT_READ_SCORE);
VirtualReadScore *scorer = &DEFAULT_READ_SCORE,
SampleOutput *output = NULL);
/**
* @return the average length of sequences whose segmentation has been classified as seg
......
......@@ -5,10 +5,14 @@ SRC = DEST + 'src/'
BIN = DEST + 'bin/'
RUN = DEST + 'run/'
OUT = 'benchmark.log'
CURRENT = 'HEAD'
#####
WARN_RATIO = 0.10
LIMIT1e5 = '-x 100000 '
LIMIT1e4 = '-x 10000 '
LIMIT1e3 = '-x 1000 '
......@@ -23,23 +27,56 @@ CONSENSUS_NO = '-y 0 -z 0 '
CONSENSUS_ALL = '-y all -z 0 '
DESIGNATIONS = '-c designations '
BENCHS = {
'init': '-x 1 ' + MULTI + L4 + CONSENSUS_NO,
'germ': LIMIT1e5 + MULTI + L4 + '-c germlines ',
from collections import OrderedDict
'multi-0': LIMIT1e5 + MULTI + L4 + CONSENSUS_NO,
'multi-1': LIMIT1e5 + MULTI + L4 + CONSENSUS_ALL,
'multi-a': LIMIT1e3 + MULTI + L4 + DESIGNATIONS + '-z 1000',
BENCHS = OrderedDict([
('init', '-x 1 ' + MULTI + L4 + CONSENSUS_NO),
('germ', LIMIT1e5 + MULTI + L4 + '-c germlines '),
'igh-0': LIMIT1e5 + IGH + S22 + CONSENSUS_NO,
'igh-1': LIMIT1e5 + IGH + S22 + CONSENSUS_ALL,
'igh-a': LIMIT1e3 + IGH + S22 + DESIGNATIONS,
}
('multi-0', LIMIT1e5 + MULTI + L4 + CONSENSUS_NO),
('multi-1', LIMIT1e5 + MULTI + L4 + CONSENSUS_ALL),
('multi-a', LIMIT1e3 + MULTI + L4 + DESIGNATIONS + '-z 1000'),
('igh-0', LIMIT1e5 + IGH + S22 + CONSENSUS_NO),
('igh-1', LIMIT1e5 + IGH + S22 + CONSENSUS_ALL),
('igh-a', LIMIT1e3 + IGH + S22 + DESIGNATIONS),
])
COMPATIBILITY = [
('2019.03', '-c designations', '-c segment'),
]
# Notable changes that may affect speed/memory
INFOS = {
'2019.03': 'Aho by default',
'2018.07': '--analysis-filter (always 3)',
'2018.10': '--analysis-filter 1',
'2020.04': '#4287',
}
# Simple colored output
# Template of an ANSI escape sequence; the %s slot receives one of the ANSI codes below.
CSIm = '\033[%sm'
# Numeric codes to substitute into CSIm (reset, bright, and eight colors).
class ANSI:
RESET = 0
BRIGHT = 1
BLACK = 30
RED = 31
GREEN = 32
YELLOW = 33
BLUE = 34
MAGENTA = 35
CYAN = 36
WHITE = 37
# Return `text` surrounded by the escape sequence for code `col` and a trailing reset sequence.
# When `colorize` is false, `text` is returned unchanged.
def color(col, text, colorize = True):
if not colorize:
return text
return CSIm % col + text + CSIm % ANSI.RESET
#
def convert(cmd, release):
'''
Convert a command line to be used by old vidjil-algo releases
......@@ -68,39 +105,50 @@ import time
import sys
import argparse
import resource
import datetime
from tempfile import NamedTemporaryFile
stats = {}
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--current', action='store_true', help='install current HEAD')
parser.add_argument('-i', '--install', action='store_true', help='install various releases from %s' % ARCHIVE)
# '-i' may be repeated: each occurrence appends one release name to args.release.
# (The help example uses '-i'; '-s' is the benchmark selection option.)
parser.add_argument('-i', '--install', dest='release', default=[], action='append',
                    help='install selected releases from %s, such as in "-i 2018.02 -i 2020.05"' % ARCHIVE)
parser.add_argument('-I', '--install-all', action='store_true',
help='install all releases from %s' % ARCHIVE)
parser.add_argument('-b', '--benchmark', action='store_true', help='benchmark installed releases')
parser.add_argument('-s', '--select', dest='benchs', default=[], action='append',
help = 'Specify the benchmarks to select (among {}, default is all)'.format(', '.join(BENCHS.keys())))
parser.add_argument('-r', '--retries', type=int, default=1, help='Number of times each benchmark is launched')
def go(cmd, log=None):
def go(cmd, log=None, time=False):
if log:
flog = open(log, 'a')
flog.write('\n\n%s\n' % cmd)
else:
flog = sys.stdout
print(cmd, end=' ')
start = resource.getrusage(resource.RUSAGE_CHILDREN)
if time:
time_file = NamedTemporaryFile(mode='w+', delete=False)
cmd = "/usr/bin/time -o {} -f '%U\t%S\t%M' {}".format(time_file.name, cmd)
returncode = subprocess.call(cmd, shell=True, stderr=subprocess.STDOUT, stdout=flog)
end = resource.getrusage(resource.RUSAGE_CHILDREN)
if log:
flog.close()
if returncode:
print('FAILED', end=' ')
stime = end.ru_stime-start.ru_stime
utime = end.ru_utime-start.ru_utime
print('%5.2fu %5.2fs' % (utime, stime))
if returncode:
raise subprocess.CalledProcessError(returncode, cmd)
elif not time:
return
else:
(utime, stime, mem) = [ float(i) for i in time_file.read().split() ]
mem = mem // 1000
os.unlink(time_file.name)
print(color(ANSI.YELLOW, '%5.2fu %5.2fs %6.1fM' % (utime, stime, mem)))
return stime + utime
return (stime + utime, mem)
def code(tgz):
'''
......@@ -147,10 +195,11 @@ def install(release, tgz):
# Install the release named by CURRENT, calling install() with None as the tgz argument.
def install_current():
install(CURRENT, None)
def install_from_archive():
def install_from_archive(install_versions):
for release, tgz in get_releases():
try:
install(release, tgz)
if (not install_versions) or release in install_versions:
install(release, tgz)
except subprocess.CalledProcessError:
print("FAILED")
......@@ -158,50 +207,92 @@ def installed():
return sorted([f.replace(BIN, '') for f in glob.glob('%s/*' % BIN)])
def run_all(tag, args):
go("make -C ../.. germline")
go("make -C ../.. data")
go("make -C ../.. demo")
print('==== %s ==== %s' % (tag, args))
def run_all(tag, args, retries):
print(color(ANSI.CYAN, '==== %s ==== %s' % (tag, args)))
os.system('mkdir -p %s' % RUN)
for release in installed():
print('%9s' % release, end=' ')
print(color(ANSI.MAGENTA, '%9s' % release), end=' ')
log = RUN + '/%s-%s.log' % (tag, release)
cmd = '%s/%s ' % (BIN, release) + convert(args, release)
try:
bench = go(cmd, log)
stats[tag,release] = bench
benchs = []
for i in range(retries) :
benchs.append(go(cmd, log, True))
time = min([b[0] for b in benchs])
mem = min([b[1] for b in benchs])
stats[tag,release] = (time, mem)
except subprocess.CalledProcessError:
stats[tag,release] = None
print()
def show_benchs(f):
# Write one summary row for `release` to the file-like object `f`.
#
#   f:                object with a write() method receiving the row
#   release:          release name, written left-aligned as the first column
#   stats:            mapping keyed by (tag, release); a value is either None or
#                     something indexable by `index`
#   index:            which component of the stats value to print
#   format:           %-format applied to the printed value
#   previous_release: release to compare against, or None for no comparison
#   colorize:         highlight with color() when true, with a leading '!' otherwise
#
# Returns True when at least one value differs from the previous release's value
# by a ratio of WARN_RATIO or more, in either direction.
#
# NOTE(review): indentation is not visible in this view; the pairing of the two
# `else:` branches below is inferred ('x' for a None entry, '-' for a missing entry) -- confirm.
def bench_line(f, release, stats, index, format='%8.2f', previous_release=None, colorize=True):
f.write('%-9s' % release)
warned = False
# One column per benchmark, in BENCHS order
for tag in BENCHS:
if (tag,release) in stats:
if stats[tag, release] is not None:
val = stats[tag,release][index]
b = format % val
# Highlight value when it moved by WARN_RATIO or more relative to the previous release
if previous_release:
if stats[tag, previous_release] is not None:
previous_val = stats[tag,previous_release][index]
# A zero/empty previous value is skipped, which also avoids dividing by zero
if previous_val:
if val/previous_val >= 1 + WARN_RATIO:
# Without colors, the first character of the cell is replaced by '!'
b = color(ANSI.RED, b) if colorize else '!' + b[1:]
warned = True
elif val/previous_val <= 1 - WARN_RATIO:
b = color(ANSI.GREEN, b) if colorize else '!' + b[1:]
warned = True
else:
b = '%8s' % 'x'
else:
b = '%8s' % '-'
f.write(b)
# Append the free-text note attached to this release, if any
if release in INFOS:
f.write(' ' + INFOS[release])
f.write('\n')
return warned
def show_benchs(f, watched_release=None, colorize=True):
f.write('\n')
f.write(color(ANSI.YELLOW, '\nBenchmark summary, %s\n' % datetime.datetime.now().isoformat(), colorize))
for tag, bench in BENCHS.items():
f.write('%8s: %s\n' % (tag, bench))
f.write('\n')
f.write('%9s ' % '')
for tag in BENCHS:
f.write('%8s' % tag)
f.write('\n\n')
for release in installed():
f.write('%-9s' % release)
for tag in BENCHS:
if (tag,release) in stats:
if stats[tag, release] is not None:
b = '%8.2f' % stats[tag,release]
else:
b = '%8s' % 'x'
else:
b = '%8s' % '-'
f.write(b)
f.write('\n')
warned = False
def bench_all():
for (key, index, format) in [
('Time (s)', 0, '%8.2f'),
('Memory (MB)', 1, '%8d'),
]:
f.write(color(ANSI.YELLOW, '\n%s\n' % key, colorize))
previous_release = None
for release in installed():
w = bench_line(f, release, stats, index, format, previous_release, colorize)
previous_release = release
if w and release == watched_release:
warned = True
return warned
def bench_all(retries, selected_benchs):
try:
go("make -C ../.. germline")
go("make -C ../.. data")
go("make -C ../.. demo")
print()
print()
for tag, bench in BENCHS.items():
run_all(tag, bench)
if len(selected_benchs) == 0 or tag in selected_benchs:
run_all(tag, bench, retries)
except KeyboardInterrupt:
pass
......@@ -210,15 +301,20 @@ def bench_all():
if __name__ == '__main__':
args = parser.parse_args(sys.argv[1:])
if not args.install and not args.benchmark:
if not args.release and not args.benchmark:
parser.print_help()
if args.current:
install_current()
if args.install:
install_from_archive()
if args.release or args.install_all:
install_from_archive(args.release)
if args.benchmark:
bench_all()
show_benchs(sys.stdout)
bench_all(args.retries, args.benchs)
show_benchs(sys.stdout, colorize=True)
print('\n==>', OUT)
watched_release = installed()[-1]
warned = show_benchs(open(OUT, 'w'), watched_release=watched_release, colorize=False)
sys.exit(42 if warned else 0)
......@@ -1133,7 +1133,7 @@ int main (int argc, char **argv)
windows_labels, only_labeled_windows,
keep_unsegmented_as_clone,
expected_value_kmer, nb_reads_for_evalue,
readScorer);
readScorer, &output);
windowsStorage->setIdToAll();
size_t nb_total_reads = we.getNbReads();
......@@ -1308,12 +1308,22 @@ int main (int argc, char **argv)
cout << " ==> " << out_seqdir + CLONE_FILENAME + "*" << "\t(detail, by clone)" << endl ;
cout << endl ;
global_interrupted = false;
signal(SIGINT, sigintHandler);
for (list <pair<junction,size_t> >::const_iterator it = sort_clones.begin();
it != sort_clones.end(); ++it) {
junction win = it->first;
size_t clone_nb_reads = it->second;
if (global_interrupted)
{
string msg = "Interrupted after analyzing " + string_of_int(num_clone) + " clones" ;
output.add_warning("W09", msg, LEVEL_WARN);
cout << WARNING_STRING << msg << endl ;
break;
}
++num_clone ;
bool clone_on_stdout = (num_clone <= CLONES_ON_STDOUT) || verbose;
......@@ -1537,7 +1547,8 @@ int main (int argc, char **argv)
cout << endl ;
out_clone.close();
} // end for clones
signal(SIGINT, SIG_DFL);
out_edges.close() ;
delete out_clones;
......
......@@ -28,4 +28,7 @@ quality:
../../doc/%:
$(MAKE) -C ../../doc $@
../../tools/tests/%:
$(MAKE) -C ../../tools/tests data/$(notdir $@)
.PHONY: unit functional headless all
......@@ -90,44 +90,89 @@ BMC Genomics 2014, 15:409
Jean-Sebastien Allain et al.,
*IGHV segment utilization in immunoglobulin gene rearrangement differentiates patients with anti-myelin-associated glycoprotein neuropathy from others immunoglobulin M-gammopathies*,
Haematologica, 2018, 103:e207-e210
http://dx.doi.org/10.3324/haematol.2017.177444
<http://dx.doi.org/10.3324/haematol.2017.177444>
Jack Bartram et al.,
*High throughput sequencing in acute lymphoblastic leukemia reveals clonal architecture of central nervous system and bone marrow compartments*,
Haematologica, 2018,
<https://dx.doi.org/10.3324/haematol.2017.174987>
Sébastien Bender et al.,
*Immunoglobulin variable domain high-throughput sequencing reveals specific novel mutational patterns in POEMS syndrome*,
Blood, 2020,
<https://doi.org/10.1182/blood.2019004197>
Yann Ferret et al.,
*Multi-loci diagnosis of acute lymphoblastic leukaemia with high-throughput sequencing and bioinformatics analysis*,