Mentions légales du service

Skip to content
Snippets Groups Projects
Commit b5782fef authored by DERUMIGNY Nicolas
Browse files

Merge branch 'nderumig/mem_area_from_args' into 'master'

benchmark/common: using argument-provided location for MEM_AREAs

See merge request !3
parents 4e621f03 f3a9a831
No related branches found
No related tags found
1 merge request: !3 benchmark/common: using argument-provided location for MEM_AREAs
...@@ -209,6 +209,8 @@ class Benchmark_Lib: ...@@ -209,6 +209,8 @@ class Benchmark_Lib:
(1, "num_papi_events", 0), (1, "num_papi_events", 0),
(1, "papi_result_array", None), (1, "papi_result_array", None),
(1, "reg_values", None), (1, "reg_values", None),
(1, "load_area", None),
(1, "store_area", None),
), ),
) )
fn.__name__ = fn_name fn.__name__ = fn_name
...@@ -239,6 +241,8 @@ class Benchmark_Lib: ...@@ -239,6 +241,8 @@ class Benchmark_Lib:
ctypes.c_ssize_t, ctypes.c_ssize_t,
ctypes.POINTER(ctypes.c_longlong), ctypes.POINTER(ctypes.c_longlong),
ctypes.POINTER(ctypes.c_byte), ctypes.POINTER(ctypes.c_byte),
ctypes.POINTER(ctypes.c_byte),
ctypes.POINTER(ctypes.c_byte),
) )
def __del__(self): def __del__(self):
...@@ -1158,6 +1162,8 @@ class _Benchmark_Runner: ...@@ -1158,6 +1162,8 @@ class _Benchmark_Runner:
num_events = out.allocate_argument(1) num_events = out.allocate_argument(1)
results = out.allocate_argument(2) results = out.allocate_argument(2)
reg_values = out.allocate_argument(3) reg_values = out.allocate_argument(3)
mem_load_area = out.allocate_argument(4)
mem_store_area = out.allocate_argument(5)
out.newline() out.newline()
out.comment("*" * 70) out.comment("*" * 70)
...@@ -1174,11 +1180,21 @@ class _Benchmark_Runner: ...@@ -1174,11 +1180,21 @@ class _Benchmark_Runner:
out.comment("ARG num_events ", num_events) out.comment("ARG num_events ", num_events)
out.comment("ARG results ", results) out.comment("ARG results ", results)
out.comment("ARG reg_values ", reg_values) out.comment("ARG reg_values ", reg_values)
out.comment("ARG mem_load_area ", mem_load_area)
out.comment("ARG mem_store_area ", mem_store_area)
out.comment("free callee-saves for kernel") out.comment("free callee-saves for kernel")
out.push_callee_saves() out.push_callee_saves()
out.newline() out.newline()
need_memory: bool = any(i.has_memory_operand() for i in benchmark.instructions)
if need_memory:
out.comment("saving memory areas")
out.push_to_stack(mem_load_area)
out.push_to_stack(mem_store_area)
else:
out.free_reg(mem_load_area)
out.free_reg(mem_store_area)
SCRATCH_REG_1 = out.scratch_register(0) SCRATCH_REG_1 = out.scratch_register(0)
SCRATCH_REG_2 = out.scratch_register(1) SCRATCH_REG_2 = out.scratch_register(1)
...@@ -1186,9 +1202,8 @@ class _Benchmark_Runner: ...@@ -1186,9 +1202,8 @@ class _Benchmark_Runner:
SCRATCH_REG_4 = out.scratch_register(3) SCRATCH_REG_4 = out.scratch_register(3)
SCRATCH_REG_5 = out.scratch_register(4) SCRATCH_REG_5 = out.scratch_register(4)
# Set of registers that are currently used, but will not during the execution # Set of registers that are currently used, but will not during the execution
# of the kernel (used during generation), and reverse # of the kernel (=> used only during initialisation)
unused_registers_kernel = set() unused_registers_kernel = set()
used_registers_kernel = set()
from pipedream.asm.x86 import RDX from pipedream.asm.x86 import RDX
...@@ -1213,57 +1228,74 @@ class _Benchmark_Runner: ...@@ -1213,57 +1228,74 @@ class _Benchmark_Runner:
out.comment("size of one row of results table in bytes") out.comment("size of one row of results table in bytes")
STRIDE = out.mul_reg_with_const(num_events, 8) STRIDE = out.mul_reg_with_const(num_events, 8)
need_memory: bool = any(i.has_memory_operand() for i in benchmark.instructions)
if need_memory: if need_memory:
MEMORY_ARENA_LD = ir.Label(self._MEMORY_ARENA_LD + "@GOTPCREL(%rip)")
MEMORY_ARENA_ST = ir.Label(self._MEMORY_ARENA_ST + "@GOTPCREL(%rip)")
out.newline() out.newline()
out.comment("clear memory arena") out.comment("clear mem_store_area")
# FIXME: other address sizes
## void *memset(void *s, int c, size_t n); ## void *memset(void *s, int c, size_t n);
s = out.get_argument_register(0) s = out.get_argument_register(0)
c = out.get_argument_register(1) c = out.get_argument_register(1)
n = out.get_argument_register(2) n = out.get_argument_register(2)
out.put_const_in_register(MEMORY_ARENA_LD, s) MEMORY_REG_STORE = out.ir_builder.select_memory_base_register(
benchmark.instructions,
set(out.iter_free_registers()),
64,
)
out.comment("mem_store -> ", MEMORY_REG_STORE)
# popping mem_store_area from the stack
out.pop_from_stack(s)
# 1) saves `mem_store_area` 2) frees `s` and 3) takes `MEMORY_REG_STORE`
out.move_to(s, MEMORY_REG_STORE)
# Saving `MEMORY_REG_STORE` and `reg_values` as they may be overwritten by
# the call
out.push_to_stack(MEMORY_REG_STORE)
out.push_to_stack(reg_values)
out.put_const_in_register(0, c) out.put_const_in_register(0, c)
out.put_const_in_register(self.memory_size(benchmark), n) out.put_const_in_register(self.memory_size(benchmark), n)
out.call("memset@PLT", s, c, n) out.call("memset@PLT", s, c, n)
out.free_reg(s) out.free_reg(s)
out.free_reg(c) out.free_reg(c)
out.free_reg(n) out.free_reg(n)
out.put_const_in_register(MEMORY_ARENA_ST, s) # Restoring `reg_values` to leave the stack unchanged
out.pop_from_stack(reg_values)
out.pop_from_stack(MEMORY_REG_STORE)
out.newline()
out.comment("clear mem_load_area")
MEMORY_REG_LOAD = out.ir_builder.select_memory_base_register(
benchmark.instructions,
set(out.iter_free_registers()) - set([MEMORY_REG_STORE]),
64,
)
out.comment("mem_store -> ", MEMORY_REG_LOAD)
# Popping `mem_load_area` from the stack
out.pop_from_stack(s)
# 1) saves `mem_load_area` 2) frees `s` and 3) takes `MEMORY_REG_LOAD`
out.move_to(s, MEMORY_REG_LOAD)
# saving `MEM_REG_LOAD`, `MEM_REG_STORE` and `reg_values` as they may be
# overwritten by the call
out.push_to_stack(MEMORY_REG_LOAD)
out.push_to_stack(MEMORY_REG_STORE)
out.push_to_stack(reg_values)
out.put_const_in_register(0, c) out.put_const_in_register(0, c)
out.put_const_in_register(self.memory_size(benchmark), n) out.put_const_in_register(self.memory_size(benchmark), n)
out.call("memset@PLT", s, c, n) out.call("memset@PLT", s, c, n)
out.free_reg(s) out.free_reg(s)
out.free_reg(c) out.free_reg(c)
out.free_reg(n) out.free_reg(n)
# Restoring `reg_values` to leave the stack unchanged
out.pop_from_stack(reg_values)
out.newline()
else:
MEMORY_REG_LOAD = None
MEMORY_REG_STORE = None
with out.counting_loop("measurement", LOOP_COUNTER, num_iterations) as loop: with out.counting_loop("measurement", LOOP_COUNTER, num_iterations) as loop:
out.comment("push loop counter") out.comment("push loop counter")
out.push_to_stack(LOOP_COUNTER)
## Reserve registers for future use
if need_memory: if need_memory:
# FIXME: other address sizes out.pop_from_stack(MEMORY_REG_STORE)
## We may use `reg_values` as it will be free before codegen out.pop_from_stack(MEMORY_REG_LOAD)
out.free_reg(reg_values) out.push_to_stack(LOOP_COUNTER)
MEMORY_REG_LOAD = out.ir_builder.select_memory_base_register(
benchmark.instructions,
set(out.iter_free_registers()),
64,
)
MEMORY_REG_STORE = out.ir_builder.select_memory_base_register(
benchmark.instructions,
set(out.iter_free_registers()) - set([MEMORY_REG_LOAD]),
64,
)
used_registers_kernel.add(MEMORY_REG_LOAD)
used_registers_kernel.add(MEMORY_REG_STORE)
out.take_reg(reg_values)
else:
MEMORY_REG_LOAD = None
MEMORY_REG_STORE = None
## Pre-allocate kernel and its related variables, as some register information may ## Pre-allocate kernel and its related variables, as some register information may
## be used by prologue and/or pre-prologue ## be used by prologue and/or pre-prologue
...@@ -1282,7 +1314,7 @@ class _Benchmark_Runner: ...@@ -1282,7 +1314,7 @@ class _Benchmark_Runner:
MEMORY_REG_LOAD, MEMORY_REG_LOAD,
MEMORY_REG_STORE, MEMORY_REG_STORE,
unused_registers_kernel, unused_registers_kernel,
used_registers_kernel, set(),
gen_iaca_markers=gen_iaca_markers, gen_iaca_markers=gen_iaca_markers,
) )
...@@ -1298,6 +1330,9 @@ class _Benchmark_Runner: ...@@ -1298,6 +1330,9 @@ class _Benchmark_Runner:
if gen_papi_calls: if gen_papi_calls:
out.comment("push loop stride") out.comment("push loop stride")
out.push_to_stack(STRIDE) out.push_to_stack(STRIDE)
if need_memory:
out.push_to_stack(MEMORY_REG_LOAD)
out.push_to_stack(MEMORY_REG_STORE)
out.sequentialize_cpu() out.sequentialize_cpu()
...@@ -1309,26 +1344,30 @@ class _Benchmark_Runner: ...@@ -1309,26 +1344,30 @@ class _Benchmark_Runner:
out.branch_if_not_zero(ret, loop.exit) out.branch_if_not_zero(ret, loop.exit)
# TODO: test this # TODO: test this
out.comment("pop papi_event_set") out.comment("push papi_event_set and papi_results")
if need_memory:
out.comment("poping first mem_area regs")
out.pop_from_stack(MEMORY_REG_STORE)
out.pop_from_stack(MEMORY_REG_LOAD)
out.comment('"real" push')
out.push_to_stack(papi_event_set) out.push_to_stack(papi_event_set)
out.comment("push papi_results")
out.push_to_stack(results) out.push_to_stack(results)
if need_memory:
out.comment("pushing back mem_area regs")
out.push_to_stack(MEMORY_REG_LOAD)
out.push_to_stack(MEMORY_REG_STORE)
out.sequentialize_cpu() out.sequentialize_cpu()
## allow prologue generator to see real instructions with allocated registers, etc. ## allow prologue generator to see real instructions with allocated registers, etc.
if need_memory:
out.comment("pop load/store regions")
out.pop_from_stack(MEMORY_REG_STORE)
out.pop_from_stack(MEMORY_REG_LOAD)
out.emit_benchmark_prologue( out.emit_benchmark_prologue(
fully_allocated_kernel, fully_allocated_kernel,
) )
## Actually use reserved registers
if need_memory:
out.comment("init pointers location")
out.take_reg(MEMORY_REG_LOAD)
out.take_reg(MEMORY_REG_STORE)
out.put_const_in_register(MEMORY_ARENA_LD, MEMORY_REG_LOAD)
out.put_const_in_register(MEMORY_ARENA_ST, MEMORY_REG_STORE)
## free registers stolen by backend ## free registers stolen by backend
out.free_stolen_benchmark_registers(stolen_regs) out.free_stolen_benchmark_registers(stolen_regs)
...@@ -1338,10 +1377,6 @@ class _Benchmark_Runner: ...@@ -1338,10 +1377,6 @@ class _Benchmark_Runner:
## actually emit kernel ## actually emit kernel
out.splice_in_code(kernel_code) out.splice_in_code(kernel_code)
if MEMORY_REG_LOAD is not None and MEMORY_REG_STORE is not None:
out.free_reg(MEMORY_REG_LOAD)
out.free_reg(MEMORY_REG_STORE)
out.comment("END BENCHMARK") out.comment("END BENCHMARK")
out.comment("*" * 40) out.comment("*" * 40)
...@@ -1354,6 +1389,9 @@ class _Benchmark_Runner: ...@@ -1354,6 +1389,9 @@ class _Benchmark_Runner:
out.pop_from_stack(results) out.pop_from_stack(results)
out.comment("pop papi_event_set") out.comment("pop papi_event_set")
out.pop_from_stack(papi_event_set) out.pop_from_stack(papi_event_set)
if need_memory:
out.push_to_stack(MEMORY_REG_LOAD)
out.push_to_stack(MEMORY_REG_STORE)
out.comment("stop & read PAPI counters") out.comment("stop & read PAPI counters")
ret = out.call("PAPI_stop@PLT", papi_event_set, results) ret = out.call("PAPI_stop@PLT", papi_event_set, results)
...@@ -1365,15 +1403,34 @@ class _Benchmark_Runner: ...@@ -1365,15 +1403,34 @@ class _Benchmark_Runner:
out.sequentialize_cpu() out.sequentialize_cpu()
out.comment("pop stride") out.comment("pop stride")
if need_memory:
out.pop_from_stack(MEMORY_REG_STORE)
out.pop_from_stack(MEMORY_REG_LOAD)
out.pop_from_stack(STRIDE) out.pop_from_stack(STRIDE)
if need_memory:
out.push_to_stack(MEMORY_REG_LOAD)
out.push_to_stack(MEMORY_REG_STORE)
SCRATCH_REG_5 = out.scratch_register(4) SCRATCH_REG_5 = out.scratch_register(4)
if need_memory:
out.pop_from_stack(MEMORY_REG_STORE)
out.pop_from_stack(MEMORY_REG_LOAD)
out.emit_loop_epilogue(fully_allocated_kernel, SCRATCH_REG_5) out.emit_loop_epilogue(fully_allocated_kernel, SCRATCH_REG_5)
if need_memory:
out.push_to_stack(MEMORY_REG_LOAD)
out.push_to_stack(MEMORY_REG_STORE)
out.comment("pop loop counter") out.comment("pop loop counter")
if need_memory:
out.pop_from_stack(MEMORY_REG_STORE)
out.pop_from_stack(MEMORY_REG_LOAD)
out.pop_from_stack(LOOP_COUNTER) out.pop_from_stack(LOOP_COUNTER)
out.newline() if need_memory:
out.push_to_stack(MEMORY_REG_LOAD)
out.push_to_stack(MEMORY_REG_STORE)
out.newline()
out.add_registers(STRIDE, results) out.add_registers(STRIDE, results)
out.newline() out.newline()
...@@ -1381,6 +1438,12 @@ class _Benchmark_Runner: ...@@ -1381,6 +1438,12 @@ class _Benchmark_Runner:
with out.with_register(out.return_register()) as ret: with out.with_register(out.return_register()) as ret:
out.put_const_in_register(0, ret) out.put_const_in_register(0, ret)
if need_memory:
out.comment("Memory area are not needed anymore, discarding")
out.pop_from_stack(MEMORY_REG_STORE)
out.pop_from_stack(MEMORY_REG_LOAD)
out.free_reg(MEMORY_REG_LOAD, MEMORY_REG_STORE)
out.free_reg( out.free_reg(
papi_event_set, papi_event_set,
results, results,
...@@ -1492,8 +1555,6 @@ class _Benchmark_Runner: ...@@ -1492,8 +1555,6 @@ class _Benchmark_Runner:
return out.take_code(), kernel_instructions return out.take_code(), kernel_instructions
_MEMORY_ARENA_LD = "_memory_arena_ld"
_MEMORY_ARENA_ST = "_memory_arena_st"
_PAGE_SIZE = 4096 _PAGE_SIZE = 4096
def _gen_benchmark_lib( def _gen_benchmark_lib(
...@@ -1572,50 +1633,7 @@ class _Benchmark_Runner: ...@@ -1572,50 +1633,7 @@ class _Benchmark_Runner:
benchmark_functions[benchmark] = fn_name benchmark_functions[benchmark] = fn_name
## calculate of size of memory arena asm_writer.end_file(asm_file)
memory_arena_size = 0
for b in benchmark_specs:
memory_size = (
allocator.Maximize_Deps_Register_Allocator.memory_arena_size(
b.instructions
)
* b.unroll_factor
)
memory_arena_size = max(memory_arena_size, memory_size)
PAGE_SIZE = self._PAGE_SIZE
## round up to a multiple of page size
memory_arena_size = (
memory_arena_size + PAGE_SIZE - memory_arena_size % PAGE_SIZE
)
## add a padding page
memory_arena_size += PAGE_SIZE
## why not
memory_arena_size *= 2
asm_writer.global_byte_array(
self._MEMORY_ARENA_LD + "pad_before_", memory_arena_size, 4096
)
asm_writer.global_byte_array(self._MEMORY_ARENA_LD, memory_arena_size, 4096)
asm_writer.global_byte_array(
self._MEMORY_ARENA_LD + "pad_after_", memory_arena_size, 4096
)
asm_writer.global_byte_array(
self._MEMORY_ARENA_ST + "pad_before_", memory_arena_size, 4096
)
asm_writer.global_byte_array(self._MEMORY_ARENA_ST, memory_arena_size, 4096)
asm_writer.global_byte_array(
self._MEMORY_ARENA_ST + "pad_after_", memory_arena_size, 4096
)
asm_writer.end_file(asm_file)
self.info("assemble benchmark library") self.info("assemble benchmark library")
...@@ -1729,6 +1747,22 @@ class _Benchmark_Runner: ...@@ -1729,6 +1747,22 @@ class _Benchmark_Runner:
dtype=ctypes.c_byte, dtype=ctypes.c_byte,
order="C", order="C",
) )
load_area = numpy.ndarray(
shape=[
self.memory_size(benchmark),
1,
],
dtype=ctypes.c_byte,
order="C",
)
store_area = numpy.ndarray(
shape=[
self.memory_size(benchmark),
1,
],
dtype=ctypes.c_byte,
order="C",
)
random.seed(42) random.seed(42)
for i in range(benchmark.arch.nb_vector_reg * ALIGNEMENT - 1): for i in range(benchmark.arch.nb_vector_reg * ALIGNEMENT - 1):
...@@ -1736,6 +1770,8 @@ class _Benchmark_Runner: ...@@ -1736,6 +1770,8 @@ class _Benchmark_Runner:
init_values[i, j] = random.getrandbits(8) init_values[i, j] = random.getrandbits(8)
data = init_values.ctypes.data_as(ctypes.POINTER(ctypes.c_byte)) data = init_values.ctypes.data_as(ctypes.POINTER(ctypes.c_byte))
load_area_data = load_area.ctypes.data_as(ctypes.POINTER(ctypes.c_byte))
store_area_data = store_area.ctypes.data_as(ctypes.POINTER(ctypes.c_byte))
addr = ctypes.addressof(data) addr = ctypes.addressof(data)
offset = 0 if addr % ALIGNEMENT == 0 else ALIGNEMENT - (addr % ALIGNEMENT) offset = 0 if addr % ALIGNEMENT == 0 else ALIGNEMENT - (addr % ALIGNEMENT)
aligned_init_values = ctypes.POINTER(ctypes.c_byte).from_address( aligned_init_values = ctypes.POINTER(ctypes.c_byte).from_address(
...@@ -1748,6 +1784,8 @@ class _Benchmark_Runner: ...@@ -1748,6 +1784,8 @@ class _Benchmark_Runner:
num_events, num_events,
result_array.ctypes.data_as(ctypes.POINTER(ctypes.c_longlong)), result_array.ctypes.data_as(ctypes.POINTER(ctypes.c_longlong)),
aligned_init_values, aligned_init_values,
load_area_data,
store_area_data,
) )
time_after = time.perf_counter() time_after = time.perf_counter()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment