Commit 997ccfb6 authored by Philippe SWARTVAGHER
Browse files

Add a memset computing kernel

parent d9baafc6
......@@ -38,6 +38,7 @@ openmp_sources = \
openmp_cursor.c \
openmp_scalar.c \
openmp_stream_weak.c \
openmp_memset.c \
malloc.c \
timing.c \
$(common_sources)
......
......@@ -27,7 +27,7 @@ static int rank = 0, other_rank = 1;
static pthread_barrier_t thread_barrier;
static volatile int compute_bench_ended = 0;
static volatile int comm_bench_ended = 0;
static struct params_s params;
struct params_s params;
struct machine_s machine;
......
......@@ -21,9 +21,9 @@
#include "stream_weak.h"
#ifdef HAVE_SIMD_AVX
#include "scalar_avx.h"
#define NB_KERNELS 6
#define NB_KERNELS 7
#else
#define NB_KERNELS 5
#define NB_KERNELS 6
#endif
#endif
......@@ -54,8 +54,9 @@ void init_params()
#elif defined(_OPENMP)
computing_kernels[3] = stream_weak_get_functions();
computing_kernels[4] = scalar_get_functions();
computing_kernels[5] = memset_get_functions();
#ifdef HAVE_SIMD_AVX
computing_kernels[5] = scalar_avx_get_functions();
computing_kernels[6] = scalar_avx_get_functions();
#endif
#endif
}
......
......@@ -78,6 +78,8 @@ void fill_machine(struct machine_s* machine);
#if defined(_OPENMP)
int get_nb_openmp_threads();
struct computing_functions memset_get_functions();
#endif
#if WITH_STARPU == 1
......
......@@ -9,7 +9,6 @@
#include "helper_cursor.h"
#define ARRAY_SIZE 100000000 // * 8 Bytes (double) =~ 763 MB
#define NB_RUNS 20
#ifdef ALLOC_STATIC
......
#include <float.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <omp.h>
#include "cli.h"
#include "timing.h"
/* Element type of the buffers filled by the kernel. */
#define MEMSET_TYPE int
/* Table of buffers, one entry per OpenMP thread; allocated in memset_init(). */
static MEMSET_TYPE **a;
/* Number of MEMSET_TYPE items in each buffer (taken from params.pingpong_size in memset_init()). */
static int array_size;
/* Number of buffers in `a`, as returned by get_nb_openmp_threads(). */
static int nb_threads;
/* Defined in other translation units. */
extern struct machine_s machine;
extern struct params_s params;
/* Indexed by bench type (WARMUP, WITH_COMM, WITHOUT_COMM): non-zero once the
 * corresponding results have been recorded by memset_run(). */
char memset_bench_done[] = {0, 0, 0};
/* Recorded timings for each bench type, laid out as {min, avg, max}.
 * The minimum starts at FLT_MAX so that any measured duration replaces it. */
double memset_perfs_warmup[] = {FLT_MAX, 0, 0};
double memset_perfs_no_comm[] = {FLT_MAX, 0, 0};
double memset_perfs_comm[] = {FLT_MAX, 0, 0};
/* Flag polled by memset_run() during WITH_COMM runs; NULL until
 * memset_set_comm_bench_ended_ptr() is called. */
volatile int* memset_comm_bench_ended = NULL;
static void memset_set_comm_bench_ended_ptr(volatile int* _comm_bench_ended)
{
memset_comm_bench_ended = _comm_bench_ended;
}
/*
 * Returns the number of kernel runs to perform. The value does not depend
 * on the kind of communication benchmark.
 */
static int memset_get_nb_runs(enum comm_bench_type comm_bench_type)
{
    enum { MEMSET_DEFAULT_NB_RUNS = 20 };

    (void) comm_bench_type; /* same run count for every benchmark type */
    return MEMSET_DEFAULT_NB_RUNS;
}
/*
 * Prints the recorded (min, avg, max) timings of every bench type that has
 * been run. Stored values are divided by 1000 before printing; the header
 * labels the printed values as milliseconds.
 */
static void memset_print_results()
{
    const double divisor = 1000.0f;
    const double *t;

    printf("# memset results: Time ms (min, avg, max)\n");

    if (memset_bench_done[WARMUP])
    {
        t = memset_perfs_warmup;
        printf("# warmup %.5f\t%.5f\t%.5f\n", t[0] / divisor, t[1] / divisor, t[2] / divisor);
    }

    if (memset_bench_done[WITH_COMM])
    {
        t = memset_perfs_comm;
        printf("# with communications %.5f\t%.5f\t%.5f\n", t[0] / divisor, t[1] / divisor, t[2] / divisor);
    }

    if (memset_bench_done[WITHOUT_COMM])
    {
        t = memset_perfs_no_comm;
        printf("# without communications %.5f\t%.5f\t%.5f\n", t[0] / divisor, t[1] / divisor, t[2] / divisor);
    }
}
/*
 * Allocates and initialises the kernel's working set: one buffer of
 * array_size MEMSET_TYPE items per OpenMP thread, every item set to 1.
 *
 * The buffers are allocated and first written inside the parallel loop.
 *
 * Returns 0 on success, 1 if the table of buffer pointers cannot be
 * allocated.
 */
static int memset_init()
{
    nb_threads = get_nb_openmp_threads();
    array_size = params.pingpong_size;

    const size_t buffer_bytes = array_size * sizeof(MEMSET_TYPE);

    /* The byte count is a size_t: it must be printed with %zu, not %ld. */
    printf("# Each thread will work on an array of %d items (%zu KB)\n", array_size, buffer_bytes / 1024);

    a = malloc(nb_threads * sizeof *a);
    if (a == NULL && nb_threads > 0)
    {
        fprintf(stderr, "memset: cannot allocate the table of %d buffers\n", nb_threads);
        return 1;
    }

    #pragma omp parallel for
    for (int i = 0; i < nb_threads; i++)
    {
        /* NOTE(review): comp_malloc() failure handling is not visible here — confirm it never returns NULL. */
        a[i] = comp_malloc(buffer_bytes);
        for (int j = 0; j < array_size; j++)
        {
            a[i][j] = 1;
        }
    }

    return 0;
}
/*
 * Executes the kernel once: every buffer of `a` is filled with memset()
 * inside an OpenMP parallel loop (one iteration per buffer).
 *
 * memset() converts the fill value to unsigned char, so each byte of the
 * buffers is set to 3.
 *
 * Returns the elapsed time between the two ticks, as computed by
 * PUK_TIMING_DELAY.
 */
static double memset_run_kernel()
{
    const MEMSET_TYPE fill_value = 3;
    const size_t bytes_per_buffer = array_size * sizeof(MEMSET_TYPE);
    puk_tick_t tick_begin, tick_end;

    PUK_GET_TICK(tick_begin);
    #pragma omp parallel for
    for (int t = 0; t < nb_threads; t++)
    {
        memset(a[t], fill_value, bytes_per_buffer);
    }
    PUK_GET_TICK(tick_end);

    return PUK_TIMING_DELAY(tick_begin, tick_end);
}
/* Folds one measured duration into the running minimum, sum and maximum. */
static void memset_record_duration(double duration, double *mintime, double *sumtime, double *maxtime)
{
    *mintime = MIN(*mintime, duration);
    *sumtime += duration;
    *maxtime = MAX(*maxtime, duration);
}

/*
 * Runs the memset kernel and records its (min, avg, max) duration.
 *
 * nb_runs:    number of kernel executions to perform unconditionally.
 * bench_type: WARMUP, WITH_COMM or WITHOUT_COMM; any other value aborts.
 *
 * For WITH_COMM, once the nb_runs executions are done, the kernel keeps
 * running until the flag registered through
 * memset_set_comm_bench_ended_ptr() becomes non-zero (if one was registered).
 *
 * Results are stored only the first time a given bench type is run; later
 * calls still execute the kernel but leave the recorded values untouched
 * (a warning is printed, except for WARMUP).
 *
 * Always returns 0.
 */
static int memset_run(int nb_runs, enum bench_type bench_type)
{
    /* Resolve the destination before touching memset_bench_done[], so that
     * an unknown bench type is rejected before it is used as an index. */
    double *perfs = NULL;
    if (bench_type == WARMUP)
    {
        perfs = memset_perfs_warmup;
    }
    else if (bench_type == WITH_COMM)
    {
        perfs = memset_perfs_comm;
    }
    else if (bench_type == WITHOUT_COMM)
    {
        perfs = memset_perfs_no_comm;
    }
    else
    {
        abort();
    }

    const int already_done = memset_bench_done[bench_type];
    if (bench_type != WARMUP && already_done)
    {
        printf("Warning: this bench was already done.\n");
    }

    double sumtime = 0, maxtime = 0, mintime = FLT_MAX;
    int real_nb_runs = 0;

    for (int k = 0; k < nb_runs; k++)
    {
        memset_record_duration(memset_run_kernel(), &mintime, &sumtime, &maxtime);
        real_nb_runs++;
    }

    if (bench_type == WITH_COMM && memset_comm_bench_ended != NULL)
    {
        /* Keep computing while we need more pingpongs: */
        while (!*memset_comm_bench_ended)
        {
            memset_record_duration(memset_run_kernel(), &mintime, &sumtime, &maxtime);
            real_nb_runs++;
        }
    }

    if (already_done)
    {
        return 0;
    }

    perfs[0] = mintime;
    /* No run at all: keep the average at 0 instead of computing 0/0. */
    perfs[1] = (real_nb_runs > 0) ? sumtime / (double) real_nb_runs : 0;
    perfs[2] = maxtime;

    memset_bench_done[bench_type] = 1;

    return 0;
}
/*
 * Releases everything allocated by memset_init(): each buffer is handed
 * back with comp_free(), then the table of buffer pointers is freed.
 */
static void memset_release()
{
    const size_t bytes_per_buffer = array_size * sizeof(MEMSET_TYPE);
    int t = 0;

    while (t < nb_threads)
    {
        comp_free(a[t], bytes_per_buffer);
        t++;
    }

    free(a);
}
struct computing_functions memset_get_functions()
{
struct computing_functions s = {
.init = &memset_init,
.run = &memset_run,
.print_results = &memset_print_results,
.print_params = NULL,
.release = &memset_release,
.parse_arg = NULL,
.man = NULL,
.get_nb_runs = &memset_get_nb_runs,
.set_comm_bench_ended_ptr = &memset_set_comm_bench_ended_ptr,
.name = "memset"
};
return s;
}
......@@ -13,7 +13,7 @@ static int rank = 0, other_rank = 1;
static pthread_barrier_t thread_barrier;
static int nb_comm_todo;
static double med_comm_duration, seq_comm_time, parallel_comm_time;
static struct params_s params;
struct params_s params;
struct machine_s machine;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment