/* openmp_memset.c */
#include <float.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <omp.h>

7
8
9
10
11
12
#include "config.h"

#if defined(HAVE_DECL__MM_STREAM_SI32)
#include <x86intrin.h>
#endif

13
14
15
16
#include "cli.h"
#include "timing.h"

/* Element type of the per-thread arrays filled by this benchmark. */
#define MEMSET_TYPE int
17
/*
 * Number of back-to-back fills of each array per kernel call in the default
 * (non-throughput) mode; kernel durations are divided by this value.
 */
#define NB_ITER 3
18
/*
 * In throughput mode each thread keeps filling its array until the delay
 * since the kernel start reaches this value.
 * NOTE(review): the unit is whatever PUK_TIMING_DELAY() returns, presumably
 * microseconds given the "3 sec" note below -- confirm.
 */
#define THROUGHPUT_DURATION 3000000 // 3 sec
19
20
21
22

/* One array per worker thread: a[i] is the buffer filled by thread i. */
static MEMSET_TYPE **a;
/* Number of MEMSET_TYPE elements in each a[i]; chosen in memset_init(). */
static int array_size;
/* Number of worker threads, read from get_nb_openmp_threads() in memset_init(). */
static int nb_threads;
23
static int use_non_temporal = 0;
24
static int measure_throughput = 0;
25
26
27
28
29
/* Defined elsewhere in the project; the visible code only reads machine.topology. */
extern struct machine_s machine;
/* Run-time parameters defined elsewhere in the project. */
extern struct params_s params;

/*
 * Completion flags indexed by bench type (WARMUP, WITH_COMM, WITHOUT_COMM);
 * memset_run() sets an entry to 1 once the results of that bench are stored.
 */
char memset_bench_done[] = {0, 0, 0};

30
// All in microseconds:
31
32
33
34
35
/* Whole-kernel durations per bench type, stored as {min, avg, max} by memset_run(). */
static double memset_perfs_warmup[] = {FLT_MAX, 0, 0};
static double memset_perfs_no_comm[] = {FLT_MAX, 0, 0};
static double memset_perfs_comm[] = {FLT_MAX, 0, 0};
/*
 * Per-thread durations: nb_threads consecutive {min, avg, max} triples
 * (thread i uses indices i*3 .. i*3+2), allocated in memset_init().
 */
static double* memset_perfs_per_thread_no_comm;
static double* memset_perfs_per_thread_comm;
36
37
38
39
40
41
42
43
44
45
46

/*
 * Flag polled by memset_run() for the WITH_COMM bench: while it reads 0 the
 * kernel keeps being re-run. Stays NULL until
 * memset_set_comm_bench_ended_ptr() is called.
 */
volatile int* memset_comm_bench_ended = NULL;


static void memset_set_comm_bench_ended_ptr(volatile int* _comm_bench_ended)
{
    memset_comm_bench_ended = _comm_bench_ended;
}

static int memset_get_nb_runs(enum comm_bench_type comm_bench_type)
{
47
    return measure_throughput ? 3 : 20;
48
49
}

50
51
static inline double time_to_bw(double t) // microsec
{
52
    return (sizeof(MEMSET_TYPE) * array_size * nb_threads) / t; // MB/s
53
54
}

55
56
static inline double time_to_bw_per_thread(double t) // microsec
{
57
    return (sizeof(MEMSET_TYPE) * array_size) / t; // MB/s
58
59
}

60
61
static void memset_print_results()
{
62
63
    if (params.per_thread_perf)
    {
64
65
66
67
	int* thread_binding = malloc(nb_threads * sizeof(int));
	memset(thread_binding, 0, nb_threads * sizeof(int));
	get_worker_binding_ids(machine.topology, nb_threads, thread_binding);

68
69
	if (memset_bench_done[WITH_COMM])
	{
70
	    printf("# memset results per thread (with comm): Bandwidth MB/s (max, avg, min) Time ms (min, avg, max)\n");
71
72
	    for (int i = 0; i < nb_threads; i++)
	    {
73
74
75
		printf("Thread #%d\t%d\t"COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\n",
			i, thread_binding[i],
			time_to_bw_per_thread(memset_perfs_per_thread_comm[i*3]), time_to_bw_per_thread(memset_perfs_per_thread_comm[i*3+1]), time_to_bw_per_thread(memset_perfs_per_thread_comm[i*3+2]),
76
77
78
79
80
			memset_perfs_per_thread_comm[i*3] / 1000.0f, memset_perfs_per_thread_comm[i*3+1] / 1000.0f, memset_perfs_per_thread_comm[i*3+2] / 1000.0f);
	    }
	}
	if (memset_bench_done[WITHOUT_COMM])
	{
81
	    printf("# memset results per thread (without comm): Bandwidth MB/s (max, avg, min) Time ms (min, avg, max)\n");
82
83
	    for (int i = 0; i < nb_threads; i++)
	    {
84
85
86
		printf("Thread #%d\t%d\t"COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\n",
			i, thread_binding[i],
			time_to_bw_per_thread(memset_perfs_per_thread_no_comm[i*3]), time_to_bw_per_thread(memset_perfs_per_thread_no_comm[i*3+1]), time_to_bw_per_thread(memset_perfs_per_thread_no_comm[i*3+2]),
87
88
89
			memset_perfs_per_thread_no_comm[i*3] / 1000.0f, memset_perfs_per_thread_no_comm[i*3+1] / 1000.0f, memset_perfs_per_thread_no_comm[i*3+2] / 1000.0f);
	    }
	}
90
91

	free(thread_binding);
92
93
    }

94
    printf("# memset results: Bandwidth MB/s (max, avg, min) Time ms (min, avg, max)\n");
95
96
    if (memset_bench_done[WARMUP])
    {
97
98
99
100
101
102
103
104
105
106
	if (measure_throughput)
	{
	    printf("# warmup                 -\t-\t-\t-\t-\t-\n");
	}
	else
	{
	    printf("# warmup                 "COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\n",
		    time_to_bw(memset_perfs_warmup[0]), time_to_bw(memset_perfs_warmup[1]), time_to_bw(memset_perfs_warmup[2]),
		    memset_perfs_warmup[0] / 1000.0f, memset_perfs_warmup[1] / 1000.0f, memset_perfs_warmup[2] / 1000.0f);
	}
107
108
109
    }
    if (memset_bench_done[WITH_COMM])
    {
110
111
112
113
114
115
116
117
118
119
	if (measure_throughput)
	{
	    printf("# with communications    -\t-\t-\t-\t-\t-\n");
	}
	else
	{
	    printf("# with communications    "COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\n",
		    time_to_bw(memset_perfs_comm[0]), time_to_bw(memset_perfs_comm[1]), time_to_bw(memset_perfs_comm[2]),
		    memset_perfs_comm[0] / 1000.0f, memset_perfs_comm[1] / 1000.0f, memset_perfs_comm[2] / 1000.0f);
	}
120
121
122
    }
    if (memset_bench_done[WITHOUT_COMM])
    {
123
124
125
126
127
128
129
130
131
132
	if (measure_throughput)
	{
	    printf("# without communications -\t-\t-\t-\t-\t-\n");
	}
	else
	{
	    printf("# without communications "COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_BW_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\t"COMP_TIME_FORMAT"\n",
		    time_to_bw(memset_perfs_no_comm[0]), time_to_bw(memset_perfs_no_comm[1]), time_to_bw(memset_perfs_no_comm[2]),
		    memset_perfs_no_comm[0] / 1000.0f, memset_perfs_no_comm[1] / 1000.0f, memset_perfs_no_comm[2] / 1000.0f);
	}
133
134
135
136
137
138
    }
}

static int memset_init()
{
    nb_threads = get_nb_openmp_threads();
139
    array_size = params.enable_comm ? params.pingpong_size : COMM_BANDWIDTH_BUFFER_SIZE;
140
141
142
143
144

    printf("# Each thread will work on an array of %d items (%ld KB)\n", array_size, array_size*sizeof(MEMSET_TYPE)/1024);

    a = malloc(nb_threads*sizeof(MEMSET_TYPE*));

145
146
147
    memset_perfs_per_thread_no_comm = malloc(nb_threads*3*sizeof(double));
    memset_perfs_per_thread_comm = malloc(nb_threads*3*sizeof(double));

148
149
150
    #pragma omp parallel for
    for (int i = 0; i < nb_threads; i++)
    {
151
152
153
	a[i] = data_malloc(
	    array_size*sizeof(MEMSET_TYPE),
	    params.memory_comp_numa_nodes_nb == 0 ? -1 : get_numa_node_comp_for_id(i % params.memory_comp_numa_nodes_nb));
154
155
156
157
158

	for (int j = 0; j < array_size; j++)
	{
	    a[i][j] = 1;
	}
159
160
161
162
163
164
165

	memset_perfs_per_thread_no_comm[i*3] = FLT_MAX;
	memset_perfs_per_thread_no_comm[i*3+1] = 0.0f;
	memset_perfs_per_thread_no_comm[i*3+2] = 0.0f;
	memset_perfs_per_thread_comm[i*3] = FLT_MAX;
	memset_perfs_per_thread_comm[i*3+1] = 0.0f;
	memset_perfs_per_thread_comm[i*3+2] = 0.0f;
166
167
168
169
170
    }

    return 0;
}

171
static double memset_run_kernel(enum bench_type bench_type)
172
173
174
{
    puk_tick_t start_time, end_time;
    MEMSET_TYPE scalar = 3;
175
176
177
178
179
180
181
182
183
184
    double* per_thread_perfs = NULL;

    if (bench_type == WITH_COMM)
    {
	per_thread_perfs = memset_perfs_per_thread_comm;
    }
    else if (bench_type == WITHOUT_COMM)
    {
	per_thread_perfs = memset_perfs_per_thread_no_comm;
    }
185

186
187
    puk_tick_t* thread_start_times = malloc(nb_threads * sizeof(puk_tick_t));
    puk_tick_t* thread_end_times = malloc(nb_threads * sizeof(puk_tick_t));
188
189
    double* thread_durations = malloc(nb_threads * sizeof(double));
    unsigned* nb_iter_per_thread = malloc(nb_threads * sizeof(unsigned));
190

191
    PUK_GET_TICK(start_time);
192
    if (measure_throughput)
193
    {
194
195
196
197
198
	#pragma omp parallel for
	for (int i = 0; i < nb_threads; i++)
	{
	    nb_iter_per_thread[i] = 0;
	    thread_durations[i] = 0;
199

200
201
202
203
	    puk_tick_t start_iter_time, end_iter_time;
	    double last_iter_duration = 0;
	    do
	    {
204
#if defined(HAVE_DECL__MM_STREAM_SI32)
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
		if (use_non_temporal)
		{
		    PUK_GET_TICK(start_iter_time);
		    int value = scalar * (nb_iter_per_thread[i]+1);
		    for (int k = 0; k < array_size; k++)
		    {
			_mm_stream_si32(a[i]+k, value);
		    }
		    PUK_GET_TICK(end_iter_time);
		}
		else
#endif
		{
		    PUK_GET_TICK(start_iter_time);
		    memset(a[i], scalar * (nb_iter_per_thread[i]+1), array_size*sizeof(MEMSET_TYPE));
		    PUK_GET_TICK(end_iter_time);
		}
		if (nb_iter_per_thread[i] > 0)
		{
		    last_iter_duration = PUK_TIMING_DELAY(start_iter_time, end_iter_time);
		    thread_durations[i] += last_iter_duration;
		}
		nb_iter_per_thread[i]++;
	    } while (PUK_TIMING_DELAY(start_time, end_iter_time) < THROUGHPUT_DURATION);
	    nb_iter_per_thread[i] -= 2;
	    thread_durations[i] -= last_iter_duration;
	    thread_durations[i] /= nb_iter_per_thread[i];
	}
    }
    else
    {
	#pragma omp parallel for
	for (int i = 0; i < nb_threads; i++)
238
	{
239
240
#if defined(HAVE_DECL__MM_STREAM_SI32)
	    if (use_non_temporal)
241
	    {
242
243
		PUK_GET_TICK(thread_start_times[i]);
		for (int j = 1; j <= NB_ITER; j++)
244
		{
245
246
247
248
249
		    int value = scalar * j;
		    for (int k = 0; k < array_size; k++)
		    {
			_mm_stream_si32(a[i]+k, value);
		    }
250
		}
251
		PUK_GET_TICK(thread_end_times[i]);
252
	    }
253
	    else
254
255
#endif
	    {
256
257
258
259
260
261
		PUK_GET_TICK(thread_start_times[i]);
		for (int j = 1; j <= NB_ITER; j++)
		{
		    memset(a[i], scalar*j, array_size*sizeof(MEMSET_TYPE));
		}
		PUK_GET_TICK(thread_end_times[i]);
262
	    }
263
	}
264
265
    }
    PUK_GET_TICK(end_time);
266

267
268
269
    if (params.per_thread_perf && (bench_type == WITH_COMM || bench_type == WITHOUT_COMM))
    {
	for (int i = 0; i < nb_threads; i++)
270
	{
271
272
273
274
275
276
277
278
279
	    double thread_duration;
	    if (measure_throughput)
	    {
		thread_duration = thread_durations[i];
	    }
	    else
	    {
		thread_duration = PUK_TIMING_DELAY(thread_start_times[i], thread_end_times[i]) / NB_ITER;
	    }
280
281
282
	    per_thread_perfs[i*3] = MIN(per_thread_perfs[i*3], thread_duration);
	    per_thread_perfs[i*3+1] += thread_duration;
	    per_thread_perfs[i*3+2] = MAX(per_thread_perfs[i*3+2], thread_duration);
283
	}
284
    }
285

286
287
288
289
    if (measure_throughput)
    {
	for (int i = 0; i < nb_threads; i++)
	{
290
	    if (nb_iter_per_thread[i] < NB_ITER)
291
292
293
294
295
296
	    {
		fprintf(stderr, "# Warning: thread %d did less than %d iterations (%u).\n", i, NB_ITER, nb_iter_per_thread[i]);
	    }
	}
    }

297
298
    free(thread_start_times);
    free(thread_end_times);
299
    free(nb_iter_per_thread);
300

301
    return PUK_TIMING_DELAY(start_time, end_time) / NB_ITER;
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
}

static int memset_run(int nb_runs, enum bench_type bench_type)
{
    if (bench_type != WARMUP && memset_bench_done[bench_type])
    {
	printf("Warning: this bench was already done.\n");
    }

    double avgtime = 0, maxtime = 0, mintime = FLT_MAX;
    int real_nb_runs = nb_runs;
    double duration;

    for (int k = 0; k < nb_runs; k++)
    {
317
	duration = memset_run_kernel(bench_type);
318
319
320
321
322
323
324
325
326
327
328

	mintime = MIN(mintime, duration);
	avgtime += duration;
	maxtime = MAX(maxtime, duration);
    }

    if (bench_type == WITH_COMM && memset_comm_bench_ended != NULL)
    {
	/* Keep computing while we need more pingpongs: */
	while (!*memset_comm_bench_ended)
	{
329
	    duration = memset_run_kernel(bench_type);
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354

	    mintime = MIN(mintime, duration);
	    avgtime += duration;
	    maxtime = MAX(maxtime, duration);

	    real_nb_runs++;
	}
    }

    if (memset_bench_done[bench_type])
    {
	return 0;
    }

    if (bench_type == WARMUP)
    {
	memset_perfs_warmup[0] = mintime;
	memset_perfs_warmup[1] = avgtime / (double) (real_nb_runs);
	memset_perfs_warmup[2] = maxtime;
    }
    else if (bench_type == WITH_COMM)
    {
	memset_perfs_comm[0] = mintime;
	memset_perfs_comm[1] = avgtime / (double) (real_nb_runs);
	memset_perfs_comm[2] = maxtime;
355
356
357
358
359
360
361
362

	if (params.per_thread_perf)
	{
	    for (int i = 0; i < nb_threads; i++)
	    {
		memset_perfs_per_thread_comm[i*3+1] /= (double) (real_nb_runs);
	    }
	}
363
364
365
366
367
368
    }
    else if (bench_type == WITHOUT_COMM)
    {
	memset_perfs_no_comm[0] = mintime;
	memset_perfs_no_comm[1] = avgtime / (double) (real_nb_runs);
	memset_perfs_no_comm[2] = maxtime;
369
370
371
372
373
374
375
376

	if (params.per_thread_perf)
	{
	    for (int i = 0; i < nb_threads; i++)
	    {
		memset_perfs_per_thread_no_comm[i*3+1] /= (double) (real_nb_runs);
	    }
	}
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
    }
    else
    {
	abort();
    }

    memset_bench_done[bench_type] = 1;

    return 0;
}

static void memset_release()
{
    for (int i = 0; i < nb_threads; i++)
    {
392
393
394
395
	data_free(
	    a[i],
	    array_size*sizeof(MEMSET_TYPE),
	    params.memory_comp_numa_nodes_nb == 0 ? -1 : get_numa_node_comp_for_id(i % params.memory_comp_numa_nodes_nb));
396
397
398
    }

    free(a);
399
400
    free(memset_perfs_per_thread_no_comm);
    free(memset_perfs_per_thread_comm);
401
402
}

403
404
405
/*
 * Print the help text for the memset-specific command-line options.
 * The "--nt" line is only compiled in when _mm_stream_si32 is available.
 */
static void memset_man()
{
    static const char title[] = "Memset-related options:\n";
#if defined(HAVE_DECL__MM_STREAM_SI32)
    static const char nt_help[] = "\t--nt\tuse non-temporal stores to bypass the LLC\n";
#endif
    static const char throughput_help[] = "\t--throughput\tmeasure memory throughput\n";

    printf("%s", title);
#if defined(HAVE_DECL__MM_STREAM_SI32)
    printf("%s", nt_help);
#endif
    printf("%s", throughput_help);
}

static void memset_print_params()
{
    printf("# Will %suse non-temporal memset\n", use_non_temporal ? "" : "not ");
415
    printf("# Will %smeasure memory bandwidth with throughput\n", measure_throughput ? "" : "not ");
416
417
418
419
420
421
422
423
424
425
426
}

static int memset_parse_arg(char* arg)
{
#if defined(HAVE_DECL__MM_STREAM_SI32)
    if (strcmp(arg, "--nt") == 0)
    {
	use_non_temporal = 1;
	return 1;
    }
#endif
427
428
429
430
431
    if (strcmp(arg, "--throughput") == 0)
    {
	measure_throughput = 1;
	return 1;
    }
432
433
434
435

    return 0;
}

436
437
438
439
440
441
struct computing_functions memset_get_functions()
{
    struct computing_functions s = {
	.init = &memset_init,
	.run = &memset_run,
	.print_results = &memset_print_results,
442
	.print_params = &memset_print_params,
443
	.release = &memset_release,
444
445
	.parse_arg = &memset_parse_arg,
	.man = &memset_man,
446
447
448
449
450
451
452
	.get_nb_runs = &memset_get_nb_runs,
	.set_comm_bench_ended_ptr = &memset_set_comm_bench_ended_ptr,
	.name = "memset"
    };

    return s;
}