Commit 76f9b7d3 authored by Philippe SWARTVAGHER's avatar Philippe SWARTVAGHER
Browse files

Merge branch 'predict'

parents dca18279 026223e7
......@@ -12,3 +12,4 @@ tags
results/
uncore_get
uncore_set
attic
......@@ -4,13 +4,13 @@ OMP_FLAGS = -fopenmp -mcmodel=large -DWITH_OPENMP=1 --no-PIC # --no-PIC is requi
all: bench_openmp bench_starpu
bench_openmp: bench.c openmp_stream.c openmp_prime.c helper.c openmp.c timing.c prime.c openmp_cursor.c helper_stream.c helper_cursor.c openmp_scalar.c openmp_scalar_avx.c
bench_openmp: bench.c openmp_stream.c openmp_prime.c helper.c openmp.c timing.c prime.c openmp_cursor.c helper_stream.c helper_cursor.c openmp_scalar.c openmp_scalar_avx.c openmp_stream_weak.c
$(MPICC) $(CFLAGS) $(OMP_FLAGS) `pkg-config --cflags hwloc` $^ -o $@ `pkg-config --libs hwloc` -lm
bench_openmp_likwid : bench.c openmp_stream.c openmp_prime.c helper.c openmp.c timing.c prime.c openmp_cursor.c helper_stream.c helper_cursor.c freq_meter.c openmp_scalar.c openmp_scalar_avx.c
bench_openmp_likwid : bench.c openmp_stream.c openmp_prime.c helper.c openmp.c timing.c prime.c openmp_cursor.c helper_stream.c helper_cursor.c freq_meter.c openmp_scalar.c openmp_scalar_avx.c openmp_stream_weak.c
$(MPICC) $(CFLAGS) $(OMP_FLAGS) -DWITH_LIKWID=1 `pkg-config --cflags hwloc` $^ -o $@ `pkg-config --libs hwloc` -lm -llikwid
bench_openmp_freq : bench.c openmp_stream.c openmp_prime.c helper.c openmp.c timing.c prime.c openmp_cursor.c helper_stream.c helper_cursor.c freq_meter.c openmp_scalar.c openmp_scalar_avx.c
bench_openmp_freq : bench.c openmp_stream.c openmp_prime.c helper.c openmp.c timing.c prime.c openmp_cursor.c helper_stream.c helper_cursor.c freq_meter.c openmp_scalar.c openmp_scalar_avx.c openmp_stream_weak.c
$(MPICC) $(CFLAGS) $(OMP_FLAGS) -DWITH_DIRECT_CPU_FREQ=1 `pkg-config --cflags hwloc` $^ -o $@ `pkg-config --libs hwloc` -lm
bench_starpu: bench.c starpu_cholesky.c starpu_prime.c starpu_stream.c helper.c starpu.c prime.c helper_stream.c helper_cursor.c starpu_cursor.c
......
......@@ -80,16 +80,36 @@ can be launched.
Scripts are in the `plot` folder and require Python with Matplotlib.
`plot_comm_stream_nb_threads.py` is the main script. It plots computing
benchmark results and communication performances on the same graph, according
to mainly the number of cores.
`plot_comm_comp.py` is the main script. It plots computing benchmark results
and communication performances on the same graph, mainly as a function of the
number of cores.
For instance:
```bash
python3 plot_comm_stream_nb_threads.py bandwidth_thread_last_* --per-core --top=10000 --stream-top=15000 --o=bandwidth_thread_last.png --title="Network Bandwidth and STREAM Benchmark"
python3 plot_comm_comp.py bandwidth_thread_last_* --per-core --top=10000 --stream-top=15000 --o=bandwidth_thread_last.png --network-bandwidth --title="Network Bandwidth and STREAM Benchmark"
```
The Python module `comm_comp` provides classes to parse outputs of the benchmarking program and to generate plots. For instance:
```python
import glob
from comm_comp import *
parser = FilesParser(glob.glob("copy_*_threads.out"))
results_copy_alone = parser.flatten_results['comp']['alone']['copy']['time']['avg']
results_copy_with_comms = parser.flatten_results['comp']['with_comm']['copy']['time']['avg']
results_comm_alone = parser.flatten_results['comm']['alone']['lat']['med']
results_comm_with_comp = parser.flatten_results['comm']['with_comp']['lat']['med']
graph = CommCompGraph(parser.x_values, parser.x_type, CommCompGraphCommType.LATENCY, parser.compute_bench_type, CompMetric.TIME, "Network Latency (64 MB)", "COPY")
graph.add_comp_curve(results_copy_alone, "alone", CommCompGraphCurveType.ALONE, False)
graph.add_comp_curve(results_copy_with_comms, "while Ping-Pongs", CommCompGraphCurveType.PARALLEL, True)
graph.add_comm_curve(results_comm_alone, "alone", CommCompGraphCurveType.ALONE, False, display_line=False)
graph.add_comm_curve(results_comm_with_comp, "while Computations", CommCompGraphCurveType.PARALLEL, True)
graph.comm_top_limit = 7
graph.comp_top_limit = 45
graph.show()
```
## Measuring if a program is CPU- or memory-bound
......
......@@ -6,6 +6,7 @@
#include <hwloc/helper.h>
#include <mpi.h>
#include "bench_hwloc.h"
#include "runtime.h"
#include "cholesky.h"
#include "prime.h"
......@@ -28,6 +29,7 @@
#elif WITH_OPENMP == 1
#include "scalar.h"
#include "scalar_avx.h"
#include "stream_weak.h"
#include "timing.h"
#define get_worker_count get_nb_openmp_threads
#define timing_now(__var) PUK_GET_TICK(__var)
......@@ -46,9 +48,10 @@
#define NB_PINGPONG_BANDWIDTH 50
hwloc_topology_t topology;
static int rank = 0, other_rank = 1;
static pthread_barrier_t thread_barrier;
static hwloc_topology_t topology;
static int topo_pu_depth;
static int nb_pus;
static int ping_thread_pu;
......@@ -195,14 +198,7 @@ static void* ping_pong_thread_func(void* arg)
{
for (int j = 0; j < i; j++)
{
if (comm_bench_type == LATENCY)
{
printf("%.3lf us\n", comm_durations[j]);
}
else // bandwidth
{
printf("%.3lf MB/s\n", pingpong_size * sizeof(float) / (1024*1024*(comm_durations[j] / 1000000)));
}
printf("%.3lf MB/s - %.3lf us\n", pingpong_size * sizeof(float) / (1024*1024*(comm_durations[j] / 1000000)), comm_durations[j]);
}
fflush(stdout);
}
......@@ -264,8 +260,6 @@ int main(int argc, char* argv[])
starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
topology = starpu_get_hwloc_topology();
}
else
{
......@@ -277,6 +271,8 @@ int main(int argc, char* argv[])
}
}
topology = starpu_get_hwloc_topology();
PRINTF_RANK0("# StarPU version\n");
#elif WITH_OPENMP == 1
if (enable_comm)
......@@ -284,11 +280,11 @@ int main(int argc, char* argv[])
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
hwloc_topology_init(&topology);
hwloc_topology_load(topology);
}
hwloc_topology_init(&topology);
hwloc_topology_load(topology);
PRINTF_RANK0("# OpenMP version\n");
#endif
......@@ -359,6 +355,11 @@ int main(int argc, char* argv[])
computing = scalar_avx_get_functions();
PRINTF_RANK0("# Will do scalar avx benchmark\n");
}
else if (strcmp(argv[i], "--compute_bench=stream_weak") == 0)
{
computing = stream_weak_get_functions();
PRINTF_RANK0("# Will do weak stream benchmark\n");
}
#endif
else if (strcmp(argv[i], "--compute_bench=cursor") == 0)
{
......@@ -433,7 +434,7 @@ int main(int argc, char* argv[])
#if WITH_STARPU == 1
printf("--compute_bench={prime,stream,cursor,cholesky} to precise which computing benchmark to perform.\n");
#elif WITH_OPENMP == 1
printf("--compute_bench={prime,stream,cursor,scalar,scalar_avx} to precise which computing benchmark to perform.\n");
printf("--compute_bench={prime,stream,stream_weak,cursor,scalar,scalar_avx} to precise which computing benchmark to perform.\n");
#endif
printf("--duplex to do full-duplex communications.\n");
printf("--disable_comms to disable communication benchmark.\n");
......@@ -717,9 +718,9 @@ int main(int argc, char* argv[])
if (enable_comm)
{
ping_pong_free();
hwloc_topology_destroy(topology);
MPI_Finalize();
}
hwloc_topology_destroy(topology);
#endif
#if (WITH_LIKWID == 1) || (WITH_DIRECT_CPU_FREQ == 1)
......
#ifndef __BENCH_HWLOC_H
#define __BENCH_HWLOC_H
#include <hwloc.h>
/* Hardware topology of the machine, shared by the benchmark modules.
 * NOTE(review): this is a tentative definition, not an 'extern'
 * declaration — every translation unit including this header emits its
 * own definition of 'topology'. That links only with -fcommon (the
 * pre-GCC-10 default); under -fno-common it produces multiple-definition
 * link errors. Consider 'extern hwloc_topology_t topology;' here plus a
 * single definition in one .c file — but confirm against bench.c, which
 * this commit appears to give its own 'static' copy. */
hwloc_topology_t topology;
#endif /* __BENCH_HWLOC_H */
......@@ -10,13 +10,16 @@ volatile int* stream_comm_bench_ended = NULL;
double stream_perfs_warmup[4][3];
double stream_perfs_no_comm[4][3];
double stream_perfs_comm[4][3];
double stream_time_warmup[4][3];
double stream_time_no_comm[4][3];
double stream_time_comm[4][3];
void stream_set_comm_bench_ended_ptr(volatile int* _comm_bench_ended)
{
stream_comm_bench_ended = _comm_bench_ended;
}
static void print_stream_result_lines(char bench_to_run[4], double results[4][3])
static void print_stream_result_lines(char bench_to_run[4], double results[4][3], double times[4][3])
{
char* stream_labels[4] = {
"Copy: ",
......@@ -29,7 +32,7 @@ static void print_stream_result_lines(char bench_to_run[4], double results[4][3]
{
if (bench_to_run[i])
{
printf("# %s\t%.3f\t%.3f\t%.3f\n", stream_labels[i], results[i][0], results[i][1], results[i][2]);
printf("# %s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", stream_labels[i], results[i][0], results[i][1], results[i][2], times[i][0]*1000, times[i][1]*1000, times[i][2]*1000);
}
}
}
......@@ -71,21 +74,21 @@ void stream_man()
void stream_print_results()
{
printf("# STREAM results: Bandwidth MB/s (max, avg, min)\n");
printf("# STREAM results: Bandwidth MB/s (max, avg, min) Time ms (min, avg, max)\n");
if (stream_bench_done[WARMUP])
{
printf("# warmup:\n");
print_stream_result_lines(bench_to_run, stream_perfs_warmup);
print_stream_result_lines(bench_to_run, stream_perfs_warmup, stream_time_warmup);
}
if (stream_bench_done[WITH_COMM])
{
printf("# with communications:\n");
print_stream_result_lines(bench_to_run, stream_perfs_comm);
print_stream_result_lines(bench_to_run, stream_perfs_comm, stream_time_comm);
}
if (stream_bench_done[WITHOUT_COMM])
{
printf("# without communications:\n");
print_stream_result_lines(bench_to_run, stream_perfs_no_comm);
print_stream_result_lines(bench_to_run, stream_perfs_no_comm, stream_time_no_comm);
}
}
......
......@@ -9,6 +9,9 @@
extern double stream_perfs_warmup[4][3];
extern double stream_perfs_no_comm[4][3];
extern double stream_perfs_comm[4][3];
extern double stream_time_warmup[4][3];
extern double stream_time_no_comm[4][3];
extern double stream_time_comm[4][3];
// array indexed by the bench_type enum, telling whether a given bench was
// already launched, to avoid overwriting its previously recorded data
......
......@@ -133,18 +133,30 @@ int stream_run(int nb_runs, enum bench_type bench_type)
stream_perfs_warmup[j][0] = 1.0E-06 * bytes[j] / mintime[j];
stream_perfs_warmup[j][1] = 1.0E-06 * bytes[j] / avgtime[j];
stream_perfs_warmup[j][2] = 1.0E-06 * bytes[j] / maxtime[j];
stream_time_warmup[j][0] = mintime[j];
stream_time_warmup[j][1] = avgtime[j];
stream_time_warmup[j][2] = maxtime[j];
}
else if (bench_type == WITH_COMM)
{
stream_perfs_comm[j][0] = 1.0E-06 * bytes[j] / mintime[j];
stream_perfs_comm[j][1] = 1.0E-06 * bytes[j] / avgtime[j];
stream_perfs_comm[j][2] = 1.0E-06 * bytes[j] / maxtime[j];
stream_time_comm[j][0] = mintime[j];
stream_time_comm[j][1] = avgtime[j];
stream_time_comm[j][2] = maxtime[j];
}
else if (bench_type == WITHOUT_COMM)
{
stream_perfs_no_comm[j][0] = 1.0E-06 * bytes[j] / mintime[j];
stream_perfs_no_comm[j][1] = 1.0E-06 * bytes[j] / avgtime[j];
stream_perfs_no_comm[j][2] = 1.0E-06 * bytes[j] / maxtime[j];
stream_time_no_comm[j][0] = mintime[j];
stream_time_no_comm[j][1] = avgtime[j];
stream_time_no_comm[j][2] = maxtime[j];
}
else
{
......
#include <assert.h>
#include <float.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "runtime.h"
#include "stream.h"
#include "timing.h"
#include "helper_stream.h"
#include "hwloc.h"
static STREAM_TYPE **a, **b, **c;
static int array_size;
static int nb_threads;
extern hwloc_topology_t topology;
static inline hwloc_uint64_t get_l3_size()
{
#if HWLOC_API_VERSION >= 0x00020000
hwloc_obj_t l3_obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_L3CACHE, 0);
#else
int cache_depth = hwloc_get_cache_type_depth(topology, 3, HWLOC_OBJ_CACHE_UNIFIED);
assert(cache_depth > 0);
hwloc_obj_t l3_obj = hwloc_get_obj_by_depth(topology, cache_depth, 0);
#endif
return l3_obj->attr->cache.size;
}
/* Allocate and initialize the per-thread STREAM arrays.
 * Each OpenMP thread gets private a/b/c arrays sized to twice the L3
 * cache, so the working set never fits in cache whatever the thread
 * count ("weak scaling" variant of STREAM). First-touch initialization
 * is done inside the parallel loop so pages land on each thread's NUMA
 * node. Returns 0 on success, -1 if the pointer tables could not be
 * allocated. */
static int stream_weak_init()
{
	nb_threads = get_nb_openmp_threads();
	array_size = 2 * get_l3_size() / sizeof(STREAM_TYPE);
	/* %zu: array_size*sizeof(...)/1024 has type size_t; %ld was
	 * undefined behaviour on platforms where size_t != long. */
	printf("# Each thread will work on an array of %d items (%zu KB)\n", array_size, array_size*sizeof(STREAM_TYPE)/1024);

	a = malloc(nb_threads*sizeof(STREAM_TYPE*));
	b = malloc(nb_threads*sizeof(STREAM_TYPE*));
	c = malloc(nb_threads*sizeof(STREAM_TYPE*));
	if (a == NULL || b == NULL || c == NULL)
	{
		fprintf(stderr, "Failed to allocate STREAM pointer tables\n");
		return -1;
	}

#pragma omp parallel for
	for (int i = 0; i < nb_threads; i++)
	{
		a[i] = malloc(array_size*sizeof(STREAM_TYPE));
		b[i] = malloc(array_size*sizeof(STREAM_TYPE));
		c[i] = malloc(array_size*sizeof(STREAM_TYPE));
		/* Cannot cleanly return an error from inside an OpenMP
		 * worksharing loop; abort via assert on allocation failure. */
		assert(a[i] != NULL && b[i] != NULL && c[i] != NULL);
		for (int j = 0; j < array_size; j++)
		{
			a[i][j] = 1.0;
			b[i][j] = 2.0;
			c[i][j] = 0.0;
		}
	}

	return 0;
}
/* Run one iteration of each enabled STREAM kernel (COPY, SCALE, ADD,
 * TRIAD), each thread operating on its own private arrays, and fold the
 * measured durations into the caller's statistics:
 *   avgtime[k] accumulates the sum of durations (the caller divides by
 *              the run count afterwards),
 *   mintime[k]/maxtime[k] track the extrema.
 * Each timing spans the whole parallel loop, so it includes the OpenMP
 * fork/join overhead. Durations are divided by 1e6 — assumes
 * PUK_TIMING_DELAY returns microseconds, i.e. values stored are in
 * seconds; TODO confirm against timing.h. */
static void stream_weak_run_kernel(double avgtime[4], double maxtime[4], double mintime[4])
{
	puk_tick_t start_time, end_time;
	/* Only entries whose bench_to_run[k] flag is set are written below,
	 * and only those entries are read in the final loop. */
	double durations[4];
	STREAM_TYPE scalar = 3.0;

	/* COPY: c = a */
	if (bench_to_run[COPY])
	{
		PUK_GET_TICK(start_time);
#pragma omp parallel for
		for (int i = 0; i < nb_threads; i++)
			for (int j = 0; j < array_size; j++)
				c[i][j] = a[i][j];
		PUK_GET_TICK(end_time);
		durations[COPY] = PUK_TIMING_DELAY(start_time, end_time) / 1000000.0f;
	}

	/* SCALE: b = scalar * c */
	if (bench_to_run[SCALE])
	{
		PUK_GET_TICK(start_time);
#pragma omp parallel for
		for (int i = 0; i < nb_threads; i++)
			for (int j = 0; j < array_size; j++)
				b[i][j] = scalar*c[i][j];
		PUK_GET_TICK(end_time);
		durations[SCALE] = PUK_TIMING_DELAY(start_time, end_time) / 1000000.0f;
	}

	/* ADD: c = a + b */
	if (bench_to_run[ADD])
	{
		PUK_GET_TICK(start_time);
#pragma omp parallel for
		for (int i = 0; i < nb_threads; i++)
			for (int j = 0; j < array_size; j++)
				c[i][j] = a[i][j]+b[i][j];
		PUK_GET_TICK(end_time);
		durations[ADD] = PUK_TIMING_DELAY(start_time, end_time) / 1000000.0f;
	}

	/* TRIAD: a = b + scalar * c */
	if (bench_to_run[TRIAD])
	{
		PUK_GET_TICK(start_time);
#pragma omp parallel for
		for (int i = 0; i < nb_threads; i++)
			for (int j = 0; j < array_size; j++)
				a[i][j] = b[i][j]+scalar*c[i][j];
		PUK_GET_TICK(end_time);
		durations[TRIAD] = PUK_TIMING_DELAY(start_time, end_time) / 1000000.0f;
	}

	/* Merge this iteration's durations into the running statistics. */
	for (int j = 0; j < 4; j++)
	{
		if (bench_to_run[j])
		{
			avgtime[j] = avgtime[j] + durations[j];
			mintime[j] = MIN(mintime[j], durations[j]);
			maxtime[j] = MAX(maxtime[j], durations[j]);
		}
	}
}
/* Run the weak STREAM kernels nb_runs times (plus extra iterations while
 * the communication benchmark is still running, for WITH_COMM), then
 * store bandwidth (MB/s) and time statistics into the global result
 * arrays matching bench_type. Always returns 0. */
static int stream_weak_run(int nb_runs, enum bench_type bench_type)
{
	if (stream_bench_done[bench_type])
	{
		printf("Warning: this bench was already done.\n");
	}

	double avgtime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
	/* Bytes moved per iteration: COPY/SCALE touch two arrays per
	 * element, ADD/TRIAD touch three. */
	double bytes[4] = {
		2 * sizeof(STREAM_TYPE) * array_size * nb_threads,
		2 * sizeof(STREAM_TYPE) * array_size * nb_threads,
		3 * sizeof(STREAM_TYPE) * array_size * nb_threads,
		3 * sizeof(STREAM_TYPE) * array_size * nb_threads
	};

	int real_nb_runs = nb_runs;
	for (int run = 0; run < nb_runs; run++)
	{
		stream_weak_run_kernel(avgtime, maxtime, mintime);
	}

	if (bench_type == WITH_COMM && stream_comm_bench_ended != NULL)
	{
		/* Keep computing while we need more pingpongs: */
		while (!*stream_comm_bench_ended)
		{
			stream_weak_run_kernel(avgtime, maxtime, mintime);
			real_nb_runs++;
		}
	}

	/* Pick the destination arrays once, instead of re-testing
	 * bench_type for every kernel. NULL flags an unknown bench type. */
	double (*perfs)[3] = NULL;
	double (*times)[3] = NULL;
	if (bench_type == WARMUP)
	{
		perfs = stream_perfs_warmup;
		times = stream_time_warmup;
	}
	else if (bench_type == WITH_COMM)
	{
		perfs = stream_perfs_comm;
		times = stream_time_comm;
	}
	else if (bench_type == WITHOUT_COMM)
	{
		perfs = stream_perfs_no_comm;
		times = stream_time_no_comm;
	}

	for (int j = 0; j < 4; j++)
	{
		if (!bench_to_run[j])
		{
			continue;
		}
		if (perfs == NULL)
		{
			/* Unknown bench type while a kernel is enabled:
			 * nowhere to store its results. */
			abort();
		}
		avgtime[j] = avgtime[j]/(double)(real_nb_runs);
		perfs[j][0] = 1.0E-06 * bytes[j] / mintime[j];
		perfs[j][1] = 1.0E-06 * bytes[j] / avgtime[j];
		perfs[j][2] = 1.0E-06 * bytes[j] / maxtime[j];
		times[j][0] = mintime[j];
		times[j][1] = avgtime[j];
		times[j][2] = maxtime[j];
	}

	stream_bench_done[bench_type] = 1;
	return 0;
}
/* Free all per-thread arrays allocated by stream_weak_init().
 * Guards against a failed or missing init (the pointer tables would be
 * NULL and the loop would dereference them), and resets the tables to
 * NULL so that a second call is a harmless no-op instead of a
 * double-free. */
static void stream_weak_release()
{
	if (a == NULL || b == NULL || c == NULL)
	{
		return; /* init failed or was never called */
	}
	for (int i = 0; i < nb_threads; i++)
	{
		free(a[i]);
		free(b[i]);
		free(c[i]);
	}
	free(a);
	free(b);
	free(c);
	a = NULL;
	b = NULL;
	c = NULL;
}
struct computing_functions stream_weak_get_functions()
{
struct computing_functions s = {
.init = &stream_weak_init,
.run = &stream_weak_run,
.print_results = &stream_print_results,
.release = &stream_weak_release,
.parse_arg = &stream_parse_arg,
.man = &stream_man,
.get_nb_runs = &stream_get_nb_runs,
.set_comm_bench_ended_ptr = &stream_set_comm_bench_ended_ptr
};
return s;
}
This diff is collapsed.
import os
import re
import math
import numpy as np
from matplotlib.ticker import FuncFormatter
......@@ -13,6 +14,10 @@ TOP_OPT = "--top="
BOTTOM_OPT = "--bottom="
STREAM_TOP_OPT = "--stream-top="
STREAM_BOTTOM_OPT = "--stream-bottom="
PER_CORE_OPT = "--per-core"
STREAM_BENCH_DISPLAY_OPT = "--stream="
TITLE_OPT = "--title="
OUTPUT_FILENAME_OPT = "--o="
REMOVE_IN_PATH = ["/home/philippe/Documents/resultats/plafrim/exploited/", "/home/philippe/Documents/resultats/memory-contention/", "/home/philippe/Documents/resultats/"]
......@@ -23,6 +28,10 @@ top_limit = None
bottom_limit = None
stream_top_limit = None
stream_bottom_limit = None
per_core = False
user_per_core_nb = None
user_title = None
user_figname = None
def get_title_prefix(working_directory):
......@@ -48,46 +57,22 @@ def get_working_directory(path):
return working_directory
def get_stream_results(lines, nb_threads=None):
def apply_scale(values):
if nb_threads is not None:
return list(map(lambda v: v / nb_threads, values))
else:
return values
def parse_line(line):
stream_type = line.split()[1].lower()[:-1] # exclude the last character (a semi-column)
values = apply_scale([float(v) for v in line.split()[2:]])
def get_compute_bench_type(lines):
    """Detect which computing benchmark produced the given output lines.

    Scans only the first 20 lines (the benchmark banner is printed at the
    top of the output). Raises Exception if no known banner is found.
    """
    for line in lines[:20]:
        stripped = line.strip()
        if stripped.startswith("# Will do CPU intensive benchmark"):
            return "prime"
        elif stripped.startswith("# Will do Cholesky benchmark"):
            return "cholesky"
        # Bug fix: the second operand was the bare string "weak stream",
        # which is always truthy, so every file classified as "stream".
        elif "STREAM" in line or "weak stream" in line:
            return "stream"
        # "scalar avx" must be tested before plain "scalar".
        elif stripped.startswith("# Will do scalar avx benchmark"):
            return "scalar avx"
        elif stripped.startswith("# Will do scalar benchmark"):
            return "scalar"
        elif stripped.startswith("# Will do Cursor benchmark"):
            return "cursor"
    raise Exception("Unable to find compute bench type.")
def get_time_results(lines, compute_bench_type=None):
......@@ -128,52 +113,63 @@ def get_cholesky_results(lines, nb_threads=None):
def compute_stats(l):
assert(len(l) > 0)
if len(l) == 1:
return [l[0], l[0], l[0]]
sorted_l = sorted(l)
if len(l) == 2:
return [l[0], (l[0]+l[1])/2, l[1]]
# Case when len(l) < 10 can be problematic:
d1 = max(len(l) // 10, 1)
med = len(l) // 2
d9 = min(math.ceil(len(l) / 10) * 9, len(l)-2)
assert(d1 <= med)