Commit 316d5061 authored by Philippe SWARTVAGHER's avatar Philippe SWARTVAGHER
Browse files

Use non-temporal instructions also on ARM processors

parent 48401f44
......@@ -101,6 +101,18 @@ fi
AC_CHECK_DECLS([_mm_stream_si32], [have_mm_stream_si32=yes], [have_mm_stream_si32=no], [[#include <x86intrin.h>]])
AM_CONDITIONAL(HAVE_MM_STREAM_SI32, test x$have_mm_stream_si32 = xyes)
AC_CHECK_DECLS([vst1q_lane_s32, vdupq_n_s32], [have_arm_nt_store=yes], [have_arm_nt_store=no], [[#include <arm_neon.h>]])
AM_CONDITIONAL(HAVE_ARM_NT_STORE, test x$have_arm_nt_store = xyes)
if test x$have_arm_nt_store = xyes; then
AC_DEFINE([HAVE_ARM_NT_STORE], [1], [Define to 1 if you have non-temporal store instructions on ARM architecture])
fi
have_non_temporal_store=no
if test x$have_mm_stream_si32 = xyes -o x$have_arm_nt_store = xyes; then
AC_DEFINE([HAVE_NON_TEMPORAL_STORE], [1], [Define to 1 if you have non-temporal store instructions])
have_non_temporal_store=yes
fi
# Compiler
AC_PROG_CC
......@@ -165,11 +177,12 @@ AC_OUTPUT
AC_MSG_NOTICE([
Configuration summary:
CPU: $host_cpu
MPI: $have_mpi ($mpicc_path)
likwid: $have_likwid
StarPU: $have_starpu
MKL: $have_mkl
AVX: $available_avx
Python: $install_python_package
CPU: $host_cpu
MPI: $have_mpi ($mpicc_path)
likwid: $have_likwid
StarPU: $have_starpu
MKL: $have_mkl
AVX: $available_avx
Python: $install_python_package
nt store: $have_non_temporal_store
])
......@@ -6,8 +6,12 @@
#include "config.h"
#if defined(HAVE_DECL__MM_STREAM_SI32)
#if HAVE_NON_TEMPORAL_STORE
#if HAVE_DECL__MM_STREAM_SI32
#include <x86intrin.h>
#elif HAVE_ARM_NT_STORE
#include <arm_neon.h>
#endif
#endif
#include "cli.h"
......@@ -37,6 +41,18 @@ static double* memset_perfs_per_thread_comm;
volatile int* memset_comm_bench_ended = NULL;
#if HAVE_NON_TEMPORAL_STORE
static inline __attribute__((always_inline)) void non_temporal_store(int *p, int a)
{
#if HAVE_DECL__MM_STREAM_SI32
_mm_stream_si32(p, a);
#elif HAVE_ARM_NT_STORE
vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
#endif
}
#endif
static void memset_set_comm_bench_ended_ptr(volatile int* _comm_bench_ended)
{
memset_comm_bench_ended = _comm_bench_ended;
......@@ -201,14 +217,14 @@ static double memset_run_kernel(enum bench_type bench_type)
double last_iter_duration = 0;
do
{
#if defined(HAVE_DECL__MM_STREAM_SI32)
#if HAVE_NON_TEMPORAL_STORE
if (use_non_temporal)
{
PUK_GET_TICK(start_iter_time);
int value = scalar * (nb_iter_per_thread[i]+1);
for (int k = 0; k < array_size; k++)
{
_mm_stream_si32(a[i]+k, value);
non_temporal_store(a[i]+k, value);
}
PUK_GET_TICK(end_iter_time);
}
......@@ -236,7 +252,7 @@ static double memset_run_kernel(enum bench_type bench_type)
#pragma omp parallel for
for (int i = 0; i < nb_threads; i++)
{
#if defined(HAVE_DECL__MM_STREAM_SI32)
#if HAVE_NON_TEMPORAL_STORE
if (use_non_temporal)
{
PUK_GET_TICK(thread_start_times[i]);
......@@ -245,7 +261,7 @@ static double memset_run_kernel(enum bench_type bench_type)
int value = scalar * j;
for (int k = 0; k < array_size; k++)
{
_mm_stream_si32(a[i]+k, value);
non_temporal_store(a[i]+k, value);
}
}
PUK_GET_TICK(thread_end_times[i]);
......@@ -403,7 +419,7 @@ static void memset_release()
static void memset_man()
{
printf("Memset-related options:\n");
#if defined(HAVE_DECL__MM_STREAM_SI32)
#if HAVE_NON_TEMPORAL_STORE
printf("\t--nt\tuse non-temporal stores to bypass the LLC\n");
#endif
printf("\t--throughput\tmeasure memory throughput\n");
......@@ -417,7 +433,7 @@ static void memset_print_params()
static int memset_parse_arg(char* arg)
{
#if defined(HAVE_DECL__MM_STREAM_SI32)
#if HAVE_NON_TEMPORAL_STORE
if (strcmp(arg, "--nt") == 0)
{
use_non_temporal = 1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment