Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 3813e316 authored by GRUBER Fabian's avatar GRUBER Fabian
Browse files

XXX: did too many things again.

parent 424f870a
No related branches found
No related tags found
No related merge requests found
Showing
with 1700 additions and 947 deletions
......@@ -54,6 +54,10 @@ class ASM_Writer(abc.ABC):
def end_function(self, function_name: str):
""" Emit code to end a function """
@abc.abstractmethod
def global_byte_array(self, name: str, size: int, alignment: int):
""" Emit directives to declare a global array with given name and size. """
def print(self, indent, *args):
if args:
if indent:
......
......@@ -88,7 +88,7 @@ class X86_Architecture(Architecture):
## muops
muops_init = 1
# the subq+jne at the end of the loop are fused
# the subq+jne at the end of the loop are macro-fused
muops_per_iteration = num_iterations
muops_total = muops_init + muops_per_iteration
......@@ -125,7 +125,7 @@ class X86_IR_Builder(IR_Builder):
@abc.override
def get_scratch_register(self, idx: int) -> 'Register':
return {
0: R13, 1: R14, 2: R15,
0: R12, 1: R13, 2: R14, 3: R15,
}[idx]
@abc.override
......@@ -139,24 +139,22 @@ class X86_IR_Builder(IR_Builder):
if address_width == 32:
assert EBP in free_regs
candidates = BASE_REGISTER_32
candidates = set([EBP])
candidates = set([EBP, EDI, ESI])
elif address_width == 64:
assert RBP in free_regs
candidates = BASE_REGISTER_64
candidates = set([RBP])
candidates = set([RBP, RDI, RSI])
else:
raise NotImplementedError(f'TODO: {address_width}-bit addressing')
candidates = set(candidates) & free_regs
# used in sequentialize CPU
candidates -= set([AL, AH, AX, EAX, RAX,
BL, BH, BX, EBX, RBX,
CL, CH, CX, ECX, RCX,
DL, DH, DX, EDX, RDX,
])
# scratch registers
candidates -= set([R13, R14, R15])
# # used in sequentialize CPU
# candidates -= set([AL, AH, AX, EAX, RAX,
# BL, BH, BX, EBX, RBX,
# CL, CH, CX, ECX, RCX,
# DL, DH, DX, EDX, RDX,
# ])
# do not clobber stack
candidates -= set([SPL, SP, ESP, RSP])
......@@ -173,7 +171,7 @@ class X86_IR_Builder(IR_Builder):
candidates -= set([reg])
assert candidates
assert candidates, [candidates, sorted(set([r.widest for r in free_regs]))]
reg = sorted(candidates)[0]
......@@ -303,14 +301,22 @@ class X86_IR_Builder(IR_Builder):
def emit_put_const_in_register(self, const: int, reg: Register) -> ty.List[Instruction]:
assert reg in GPR, f'can only do GPR for now ({reg})'
assert type(const) in (int, Label)
assert type(const) != int or 0 <= const <= (2 ** 64) - 1
reg32 = reg.as_width(32)
reg64 = reg.as_width(64)
if const == 0:
i = self._set_registers(Harness.XOR_GPR32, 'src', reg32, 'src-dst', reg32)
else:
elif type(const) is int and const <= 2 ** 32 - 1:
i = Harness.MOV_IMM32_GPR32
i = self._set_value(i, 'src', const)
i = self._set_register(i, 'dst', reg32)
else:
i = Harness.MOV_IMM64_GPR64
i = self._set_value(i, 'src', const)
i = self._set_register(i, 'dst', reg64)
return [i]
@abc.override
......
......@@ -6,6 +6,7 @@ from pipedream.asm.x86.registers import *
from pipedream.asm.x86.operands import *
import functools
import math
import typing as ty
......@@ -68,6 +69,32 @@ class X86_ASM_Writer(ASM_Writer):
print('#', inst)
raise
@abc.override
def global_byte_array(self, name: str, size: int, alignment: int):
    """
    Emit directives to declare a zero-filled global array with given name and size.

    The array is emitted into the BSS section. Afterwards the writer switches
    back to the text section, which is assumed to be the section that was
    active before the call.

    :param name: symbol name of the array; must be a non-empty printable string.
    :param size: value passed to the '.size' and '.zero' directives; must be >= 0.
    :param alignment: value passed to the '.align' directive;
                      must be a positive power of two.
    :raises AssertionError: if any argument violates the constraints above.
    """
    ## this is just copy/adapted from ASM emitted by GCC
    assert type(name) is str
    assert type(size) is int
    assert type(alignment) is int
    assert name and name.isprintable()
    assert size >= 0
    ## exact integer power-of-two test:
    ## math.log2(0) raises ValueError, and the float result of math.log2 is
    ## rounded for large values, so e.g. 2**60 + 1 would wrongly be accepted.
    assert alignment > 0 and alignment & (alignment - 1) == 0, alignment

    ## assume we are in text section
    # self.print(1, '.text')

    ## switch to BSS & emit array
    self.print(1, '.bss')
    self.print(1, '.global', name)
    self.print(1, '.align', alignment)
    self.print(1, '.type', name + ',', '@object')
    self.print(1, '.size', name + ',', size)
    self.print(0, name + ':')
    self.print(1, '.zero', size)

    ## switch back to text section
    self.print(1, '.text')
@abc.override
def comment(self, *args):
self.print(1, '#', *args)
......
......@@ -144,6 +144,7 @@ class X86_Instruction_Set(ir.Instruction_Set):
i64 = 'i64'
i32 = 'i32'
i8 = 'i8'
USE, DEF, USE_DEF = ir.Use_Def
R, W, RW = USE, DEF, USE_DEF
......@@ -174,7 +175,7 @@ INTEL_MNEMONIC_CANNOT_BENCHMARK = frozenset([
def mk_inst(*, name: str, att_mnemonic: str, intel_mnemonic: str,
operands: ty.List, tags: ty.Set[str],
isa_set: str, isa_extension: str = None,
can_benchmark = True):
can_benchmark = None):
# print('mk_inst', name, att_mnemonic, intel_mnemonic,
# list(op() for op in operands),
# tags, isa_set, isa_extension,
......@@ -187,7 +188,8 @@ def mk_inst(*, name: str, att_mnemonic: str, intel_mnemonic: str,
operands = tuple(mk_op() for mk_op in operands)
## filter out instructions we currently do not support
can_benchmark = pipedream_asm_backend_can_handle(isa_set, intel_mnemonic, att_mnemonic, tags, operands)
if can_benchmark is None:
can_benchmark = pipedream_asm_backend_can_handle(isa_set, intel_mnemonic, att_mnemonic, tags, operands)
inst = X86_Instruction(name, att_mnemonic, intel_mnemonic, isa_set, operands, tags, can_benchmark)
ATT_MNEMONICS[name] = att_mnemonic
......@@ -364,7 +366,7 @@ def make_reg_op(*, name: str, reg_class: ir.Register_Class, reg: X86_Register =
def make_imm_op(*, name: str, imm_bits: int,
type, elems: int, visibility: ir.Operand_Visibility):
type, elems: int, value: int = None, visibility: ir.Operand_Visibility):
if type[0] == 'i':
clss = {
8: Imm8,
......@@ -382,7 +384,7 @@ def make_imm_op(*, name: str, imm_bits: int,
else:
raise ValueError('invalid type for immediate operand: ' + repr(type))
return lambda: clss(name, visibility)
return lambda: clss(name, visibility, value)
def make_flags_op(*, name: str, reg: X86_Register, read: X86_Flags, write: X86_Flags,
......@@ -509,6 +511,7 @@ class Harness:
IMUL_IMM_GPR64 = INSTRUCTIONS['IMUL_GPR64i64_GPR64i64_IMMi32']
MOV_GPR64 = INSTRUCTIONS['MOV_GPR64i64_GPR64i64']
MOV_IMM32_GPR32 = INSTRUCTIONS['MOV_GPR32i32_IMMi32']
MOV_IMM64_GPR64 = INSTRUCTIONS['MOV_GPR64i64_IMMi64']
SUB_IMM8_GPR64 = INSTRUCTIONS['SUB_GPR64i64_IMMi8']
TEST_GPR64 = INSTRUCTIONS['TEST_GPR64i64_GPR64i64']
XOR_GPR32 = INSTRUCTIONS['XOR_GPR32i32_GPR32i32']
......@@ -601,6 +604,22 @@ class Harness:
can_benchmark = False,
)
# zero byte jump
JMP_E9_0 = mk_inst(
name = 'JMP_0',
# att_mnemonic = '.byte 0xeb, 0',
att_mnemonic = '.byte 0xe9, 0, 0, 0, 0',
intel_mnemonic = 'XXX',
isa_set = instructions_xed.ISA.I86,
operands = [
# TODO: relbr operand type
# make_imm_op(name='dst', imm_bits=8, type=i8, elems=1, value=ir.Label('0'), visibility=EXPLICIT),
make_reg_op(name='ip', reg=RIP, reg_class=RC_RIP, action=RW, type=i64, elems=1, visibility=SUPPRESSED),
],
tags = ['branch', 'conditional-branch', 'relative-branch'],
can_benchmark = True,
)
tmp = collections.OrderedDict()
for inst in sorted(INSTRUCTIONS.values(), key=lambda i: i.name):
......
......@@ -114,23 +114,6 @@ class _Benchmark_Runner:
tmp_dir = tmp_dir,
)
max_memory_size = 0
for b in benchmarks:
memory_size = pseudoalloc.MaximizeDepsPseudoAllocator.memory_arena_size(b.instructions) * b.unroll_factor
max_memory_size = max(max_memory_size, memory_size)
memory_arena_size = 2 * max_memory_size + 2 * 4096
memory_arena_base = (ctypes.c_char * memory_arena_size)()
## make sure there is some space before the arena
offset = 4096
## align to 64
offset = offset + 64 - ctypes.addressof(memory_arena_base) % 64
memory_arena_ptr = ctypes.cast(ctypes.byref(memory_arena_base, offset), ctypes.POINTER(ctypes.c_char))
for i, benchmark in enumerate(bench_lib.benchmarks, 1):
result = self._run_benchmark(
benchmark_index = i,
......@@ -139,7 +122,6 @@ class _Benchmark_Runner:
perf_counters = perf_counters,
benchmark_lib = bench_lib,
benchmark = benchmark,
memory_arena = memory_arena_ptr,
num_iterations = num_iterations,
num_warmup_iterations = num_warmup_iterations,
outlier_low = outlier_low,
......@@ -504,7 +486,6 @@ class _Benchmark_Runner:
papi_event_set = out.allocate_argument(0)
num_events = out.allocate_argument(1)
results = out.allocate_argument(2)
memory_arena = out.allocate_argument(3)
out.newline()
out.comment('*' * 70)
......@@ -516,37 +497,52 @@ class _Benchmark_Runner:
out.comment('ARG papi_event_set ', papi_event_set)
out.comment('ARG num_events ', num_events)
out.comment('ARG results ', results)
out.comment('ARG memory_arena ', memory_arena)
out.comment('free callee-saves for kernel')
out.push_callee_saves()
out.newline()
out.comment('papi_event_set and results are alive in the kernel')
SCRATCH_REG_1 = out.scratch_register(0)
SCRATCH_REG_2 = out.scratch_register(1)
SCRATCH_REG_3 = out.scratch_register(2)
SCRATCH_REG_4 = out.scratch_register(3)
from pipedream.asm.x86 import RDX
assert results is RDX
out.comment('papi_event_set -> ', SCRATCH_REG_1)
papi_event_set = out.move_to(papi_event_set, SCRATCH_REG_1)
out.comment('results -> ', SCRATCH_REG_2)
results = out.move_to(results, SCRATCH_REG_2)
out.comment('num_events -> ', SCRATCH_REG_2)
num_events = out.move_to(num_events, SCRATCH_REG_2)
out.comment('results -> ', SCRATCH_REG_3)
results = out.move_to(results, SCRATCH_REG_3)
LOOP_COUNTER = SCRATCH_REG_3
LOOP_COUNTER = SCRATCH_REG_4
out.comment('size of one row of results table in bytes')
STRIDE = out.mul_reg_with_const(num_events, 8)
if any(i.has_memory_operand() for i in benchmark.instructions):
# FIXME: other address sizes
MEMORY_REG = out.ir_builder.select_memory_base_register(benchmark.instructions, set(out.free_registers()), 64)
need_memory: bool = any(i.has_memory_operand() for i in benchmark.instructions)
if need_memory:
MEMORY_ARENA = ir.Label(self._MEMORY_ARENA + '@GOTPCREL(%rip)')
out.comment('clear memory arena')
## void *memset(void *s, int c, size_t n);
s = out.get_argument_register(0)
c = out.get_argument_register(1)
n = out.get_argument_register(2)
out.comment('memory_arena -> ', MEMORY_REG)
out.move_to(memory_arena, MEMORY_REG)
else:
out.free_reg(memory_arena)
MEMORY_REG = None
out.put_const_in_register(MEMORY_ARENA, s)
out.put_const_in_register(0, c)
out.put_const_in_register(self.memory_size(benchmark), n)
out.call('memset@PLT', s, c, n)
out.free_reg(s)
out.free_reg(c)
out.free_reg(n)
with out.counting_loop('measurement', LOOP_COUNTER, num_iterations) as loop:
out.comment('push loop counter')
......@@ -571,6 +567,14 @@ class _Benchmark_Runner:
out.sequentialize_cpu()
if need_memory:
# FIXME: other address sizes
MEMORY_REG = out.ir_builder.select_memory_base_register(benchmark.instructions, set(out.free_registers()), 64)
out.take_reg(MEMORY_REG)
out.put_const_in_register(MEMORY_ARENA, MEMORY_REG)
else:
MEMORY_REG = None
## allow backend to reserve some registers.
out.steal_benchmark_registers(benchmark.instructions)
......@@ -688,6 +692,9 @@ class _Benchmark_Runner:
return out.take_code(), kernel_instructions
_MEMORY_ARENA = '_memory_arena_'
_PAGE_SIZE = 4096
def _gen_benchmark_lib(self, *,
benchmark_specs: ty.List[Benchmark_Spec],
architecture: ir.Architecture,
......@@ -736,6 +743,30 @@ class _Benchmark_Runner:
benchmark_functions[benchmark] = fn_name
## calculate size of memory arena
memory_arena_size = 0
for b in benchmark_specs:
memory_size = pseudoalloc.MaximizeDepsPseudoAllocator.memory_arena_size(b.instructions) * b.unroll_factor
memory_arena_size = max(memory_arena_size, memory_size)
PAGE_SIZE = self._PAGE_SIZE
## round up to a multiple of page size
memory_arena_size = memory_arena_size + PAGE_SIZE - memory_arena_size % PAGE_SIZE
## add a padding page
memory_arena_size += PAGE_SIZE
## why not
memory_arena_size *= 2
asm_writer.global_byte_array(self._MEMORY_ARENA + 'pad_before_', memory_arena_size, 4096)
asm_writer.global_byte_array(self._MEMORY_ARENA, memory_arena_size, 4096)
asm_writer.global_byte_array(self._MEMORY_ARENA + 'pad_after_', memory_arena_size, 4096)
asm_writer.end_file(asm_file)
self.info('assemble benchmark library')
......@@ -748,7 +779,7 @@ class _Benchmark_Runner:
)
lib = ctypes.cdll.LoadLibrary(lib_file)
os.unlink(lib_file)
# os.unlink(lib_file)
return _Benchmark_Lib(
lib,
......@@ -762,7 +793,6 @@ class _Benchmark_Runner:
perf_counters: 'Perf_Counter_Spec',
benchmark_lib: '_Benchmark_Lib',
benchmark: Benchmark_Spec,
memory_arena: ctypes.c_char_p,
num_iterations: int, num_warmup_iterations: int,
outlier_low: int = 0, outlier_high: int = 100) -> Benchmark_Run:
assert type(outlier_low) is int and 0 <= outlier_low <= 100
......@@ -772,15 +802,18 @@ class _Benchmark_Runner:
outlier_low = fractions.Fraction(outlier_low) / 100
outlier_high = fractions.Fraction(outlier_high) / 100
total_muops = []
total_insts = []
total_cycles = []
total_cycles = []
total_insts = []
total_fused_muops = []
total_unfused_muops = []
total_clean_IPC = []
total_clean_MPC = []
total_clean_IPC = []
total_clean_fMPC = []
total_clean_uMPC = []
total_IPC = []
total_MPC = []
total_IPC = []
total_fMPC = []
total_uMPC = []
benchmark_fn = benchmark_lib.benchmark_function(benchmark)
event_sets = []
......@@ -809,7 +842,6 @@ class _Benchmark_Runner:
event_set_id,
num_events,
result_array.ctypes.data_as(ctypes.POINTER(ctypes.c_longlong)),
memory_arena,
)
time_after = time.perf_counter()
......@@ -835,7 +867,6 @@ class _Benchmark_Runner:
result_array = result_array[index_array]
# drop outlier values (below/above lo/hi percentiles)
lo = round(len(result_array) * outlier_low)
hi = round(len(result_array) * outlier_high)
......@@ -845,42 +876,95 @@ class _Benchmark_Runner:
assert len(result_array)
muops = column_by_name(perf_counters.uop_counter(), result_array)
insts = column_by_name(perf_counters.instruction_counter(), result_array)
cycles = column_by_name(perf_counters.cycle_counter(), result_array)
assert cycles.max() > 0
total_cycles.append(cycles)
overhead = benchmark.arch.loop_overhead(benchmark.kernel_iterations)
# this assumes that the loop control muops
# do not contend significantly with the benchmark kernel.
clean_IPC = (insts - overhead.instructions) / cycles
clean_MPC = (muops - overhead.muops) / cycles
try:
muops_idx = evt_set.index_of(perf_counters.unfused_uop_counter())
except IndexError:
unfused_muops = None
uMPC = None
clean_uMPC = None
else:
muops = result_array[:, muops_idx]
IPC = insts / cycles
MPC = muops / cycles
# this assumes that the loop control muops
# do not contend significantly with the benchmark kernel.
clean_MPC = (muops - overhead.muops) / cycles
total_muops.append(muops)
total_insts.append(insts)
total_cycles.append(cycles)
MPC = muops / cycles
total_clean_IPC.append(clean_IPC)
total_clean_MPC.append(clean_MPC)
total_unfused_muops.append(muops)
total_clean_uMPC.append(clean_MPC)
total_uMPC.append(MPC)
unfused_muops = Statistics.from_array(muops)
uMPC = Statistics.from_array(MPC)
clean_uMPC = Statistics.from_array(clean_MPC)
try:
muops_idx = evt_set.index_of(perf_counters.fused_uop_counter())
except IndexError:
fused_muops = None
fMPC = None
clean_fMPC = None
else:
muops = result_array[:, muops_idx]
total_IPC.append(IPC)
total_MPC.append(MPC)
# this assumes that the loop control muops
# do not contend significantly with the benchmark kernel.
clean_MPC = (muops - overhead.muops) / cycles
MPC = muops / cycles
total_fused_muops.append(muops)
total_clean_fMPC.append(clean_MPC)
total_fMPC.append(MPC)
fused_muops = Statistics.from_array(muops)
fMPC = Statistics.from_array(MPC)
clean_fMPC = Statistics.from_array(clean_MPC)
try:
insts_idx = evt_set.index_of(perf_counters.instruction_counter())
except IndexError:
instructions = None
IPC = None
clean_IPC = None
else:
insts = result_array[:, insts_idx]
clean_IPC = (insts - overhead.instructions) / cycles
IPC = insts / cycles
total_insts.append(insts)
total_clean_IPC.append(clean_IPC)
total_IPC.append(IPC)
instructions = Statistics.from_array(insts)
IPC = Statistics.from_array(IPC)
clean_IPC = Statistics.from_array(clean_IPC)
measurement = Event_Set_Run(
cycles = Statistics.from_array(cycles),
instructions = Statistics.from_array(insts),
muops = Statistics.from_array(muops),
cycles = Statistics.from_array(cycles),
clean_IPC = Statistics.from_array(clean_IPC),
clean_MPC = Statistics.from_array(clean_MPC),
unfused_muops = unfused_muops,
uMPC = uMPC,
clean_uMPC = clean_uMPC,
IPC=Statistics.from_array(IPC),
MPC=Statistics.from_array(MPC),
fused_muops = fused_muops,
fMPC = fMPC,
clean_fMPC = clean_fMPC,
instructions = instructions,
IPC = IPC,
clean_IPC = clean_IPC,
)
for port, evt in perf_counters.port_counters.items():
......@@ -896,7 +980,7 @@ class _Benchmark_Runner:
measurement.port_muops[port] = mean_and_stddev
for evt in evt_set.event_names:
if evt == perf_counters.uop_counter():
if evt == perf_counters.fused_uop_counter():
continue
if evt == perf_counters.instruction_counter():
continue
......@@ -905,6 +989,7 @@ class _Benchmark_Runner:
if evt in perf_counters.port_counters.values():
continue
assert type(evt) is str, evt
measurement.misc[evt] = Statistics.from_array(column_by_name(evt, result_array))
event_sets.append(measurement)
......@@ -912,27 +997,46 @@ class _Benchmark_Runner:
timestamp_end = datetime.datetime.now()
def total(arrays):
array = numpy.hstack(arrays)
return Statistics.from_array(array)
if True or arrays:
array = numpy.hstack(arrays)
return Statistics.from_array(array)
else:
return None
## we are running with SCHED_FIFO.
## so the kernel won't preempt us, give other processes some room to breathe
# os.sched_yield()
return Benchmark_Run(
benchmark = benchmark,
timestamp = timestamp_bgn,
runtime = timestamp_end - timestamp_bgn,
cycles = total(total_cycles),
instructions = total(total_insts),
muops = total(total_muops),
clean_IPC = total(total_clean_IPC),
clean_MPC = total(total_clean_MPC),
IPC = total(total_IPC),
MPC = total(total_MPC),
event_sets = event_sets,
benchmark = benchmark,
timestamp = timestamp_bgn,
runtime = timestamp_end - timestamp_bgn,
cycles = total(total_cycles),
instructions = total(total_insts),
fused_muops = total(total_fused_muops),
unfused_muops = total(total_unfused_muops),
clean_IPC = total(total_clean_IPC),
clean_fMPC = total(total_clean_fMPC),
clean_uMPC = total(total_clean_uMPC),
IPC = total(total_IPC),
fMPC = total(total_fMPC),
uMPC = total(total_uMPC),
event_sets = event_sets,
)
def memory_size(self, bench: Benchmark_Spec) -> int:
    """
    Page-granular amount of memory needed by the benchmark :bench:.

    The raw requirement is the allocator's arena size for the kernel
    instructions multiplied by the unroll factor. It is then advanced to the
    next page boundary and one extra padding page is added.
    """
    raw_size = pseudoalloc.MaximizeDepsPseudoAllocator.memory_arena_size(bench.instructions) * bench.unroll_factor

    page = self._PAGE_SIZE

    ## round up to a multiple of page size
    ## NOTE: a raw size that already is a multiple of the page size still grows by one full page
    page_aligned = raw_size + page - raw_size % page

    ## add a padding page
    return page_aligned + page
def _die(self, *msg):
self.info.clear()
print('error:', *msg, file=sys.stderr)
......@@ -1346,6 +1450,10 @@ class _ASM_Builder:
def branch_if_not_zero(self, reg: ir.Register, dst: ir.Label):
self.insts(self._irb.emit_branch_if_not_zero(reg, dst))
def get_argument_register(self, idx: int) -> ir.Register:
    """ Ask the IR builder for the register holding argument number :idx:; nothing is taken from the allocator here. """
    return self._irb.get_argument_register(idx)
def allocate_argument(self, idx: int) -> ir.Register:
reg = self._irb.get_argument_register(idx)
self._alloc.take(reg)
......@@ -1632,18 +1740,33 @@ class Perf_Counter_Spec:
def make_throughput_counters(clss, extra_events: ty.List[str] = ()) -> 'Perf_Counter_Spec':
papi = clss._the_papi()
return Perf_Counter_Spec(papi, {}, extra_events, clss.throughput_counters())
events = [
*clss.throughput_counters(),
*extra_events,
]
return Perf_Counter_Spec(papi, {}, events, [clss.cycle_counter()])
@classmethod
def make_latency_counters(clss, extra_events: ty.List[str] = ()) -> 'Perf_Counter_Spec':
papi = clss._the_papi()
return Perf_Counter_Spec(papi, {}, extra_events, clss.throughput_counters())
events = [
*clss.throughput_counters(),
*extra_events,
]
return Perf_Counter_Spec(papi, {}, events, [clss.cycle_counter()])
@classmethod
def make_port_counters(clss, extra_events: ty.List[str] = ()) -> 'Perf_Counter_Spec':
papi = clss._the_papi()
events = [
*clss.throughput_counters(),
*extra_events,
]
# FIXME: recent Intel CPUs only
ports = {
port: evt
......@@ -1652,7 +1775,7 @@ class Perf_Counter_Spec:
if papi.can_count_event(evt)
}
return Perf_Counter_Spec(papi, ports, extra_events, clss.throughput_counters())
return Perf_Counter_Spec(papi, ports, events, [clss.cycle_counter()])
##### access registered PAPI event set
......@@ -1685,21 +1808,35 @@ class Perf_Counter_Spec:
return "INSTRUCTIONS_RETIRED"
@classmethod
def uop_counter(clss) -> ty.Optional[str]:
def unfused_uop_counter(clss) -> ty.Optional[str]:
""" Count muops in the unfused domain (after fission of fused micro ops) """
# FIXME: recent Intel CPUs only
# return "UOPS_EXECUTED"
return "UOPS_RETIRED"
return "UOPS_RETIRED:ALL"
# return "UOPS_RETIRED:RETIRE_SLOTS"
# return "UOPS_ISSUED"
@classmethod
def fused_uop_counter(clss) -> ty.Optional[str]:
""" Count muops in the fused domain (fused micro ops in ROB, etc.) """
# FIXME: recent Intel CPUs only
return "UOPS_RETIRED:RETIRE_SLOTS"
# return "UOPS_ISSUED"
@classmethod
def throughput_counters(clss) -> ty.Sequence[str]:
return tuple([
clss.cycle_counter(),
clss.instruction_counter(),
clss.uop_counter(),
clss.fused_uop_counter(),
clss.unfused_uop_counter(),
])
@classmethod
def uop_counters_per_port(clss, papi: pypapi.Papi) -> ty.Sequence[ty.Tuple[int, str]]:
""" Count unfused muops executed per execution port """
if papi.can_count_event("UOPS_EXECUTED_PORT:PORT_0"):
return tuple(enumerate([
"UOPS_EXECUTED_PORT:PORT_0",
......@@ -1770,7 +1907,6 @@ class _Benchmark_Lib:
ctypes.c_int,
ctypes.c_ssize_t,
ctypes.POINTER(ctypes.c_longlong),
ctypes.POINTER(ctypes.c_char),
]
fn.restype = ctypes.c_int
......
......@@ -186,33 +186,39 @@ class Pretty_Printer:
'unroll-factor', '%10d' % benchmark.unroll_factor,
'kernel-iterations', '%10d' % benchmark.kernel_iterations,
)
# self.write_parameter_two_columns(
# 'IPC (max)', self.round_result(run.clean_IPC.max),
# 'MPC (max)', self.round_result(run.clean_MPC.max),
# )
# self.write_parameter_two_columns(
# 'IPC (p90)', self.round_result(run.clean_IPC.p90),
# 'MPC (p90)', self.round_result(run.clean_MPC.p90),
# )
i = run.instructions.max - benchmark.loop_overhead.instructions
fm = run.fused_muops.max - benchmark.loop_overhead.muops
um = run.unfused_muops.max - benchmark.loop_overhead.muops
self.write_parameter_two_columns(
'IPC (max)', self.round_result(run.clean_IPC.max),
'MPC (max)', self.round_result(run.clean_MPC.max),
)
self.write_parameter_two_columns(
'IPC (p90)', self.round_result(run.clean_IPC.p90),
'MPC (p90)', self.round_result(run.clean_MPC.p90),
'fused muops', '%10s' % round(fm / i),
'unfused muops', '%10s' % round(um / i),
)
if len(benchmark.instructions) == 1:
i = run.instructions.max - benchmark.loop_overhead.instructions
m = run.muops.max - benchmark.loop_overhead.muops
self.write_result('num-muops', fractions.Fraction(m) / fractions.Fraction(i))
self.write_statistic('clean-IPC', run.clean_IPC)
self.write_statistic('clean-MPC', run.clean_MPC)
self.write_statistic('IPC', run.IPC)
self.write_statistic('MPC', run.MPC)
self.write_statistic('cycles', run.cycles)
self.write_statistic('instructions', run.instructions)
self.write_statistic('muops', run.muops)
self.write_statistic('clean-IPC', run.clean_IPC)
self.write_statistic('clean-fMPC', run.clean_fMPC)
self.write_statistic('clean-uMPC', run.clean_uMPC)
self.write_statistic('IPC', run.IPC)
self.write_statistic('fMPC', run.fMPC)
self.write_statistic('uMPC', run.uMPC)
self.write_statistic('cycles', run.cycles)
self.write_statistic('instructions', run.instructions)
self.write_statistic('fused muops', run.fused_muops)
self.write_statistic('unfused muops', run.unfused_muops)
for m in run.event_sets:
for port, stat in m.port_muops.items():
# ignore ports that take <.1% of the instructions in the bench
if stat.mean < (run.muops.mean / 1000):
if stat.mean < (run.unfused_muops.mean / 1000):
continue
self.write_statistic('port-' + str(port), stat)
......@@ -222,44 +228,6 @@ class Pretty_Printer:
print()
def pretty_print_benchmark_run_summary(self, run: Benchmark_Run_Summary):
print()
self.header1(run.name, ':')
self.write_parameter_one_column('runtime', run.runtime)
self.write_parameter_two_columns(
'IPC (max)', self.round_result(run.ipc.max),
'MPC (max)', self.round_result(run.mpc.max),
)
self.write_parameter_two_columns(
'IPC (p90)', self.round_result(run.ipc.p90),
'MPC (p90)', self.round_result(run.mpc.p90),
)
if len(run.kernel) == 1:
i = run.instructions.max
m = run.muops.max
if math.isnan(i) or math.isnan(m):
self.write_result('num-muops', math.nan)
else:
self.write_result('num-muops', fractions.Fraction(m) / fractions.Fraction(i))
self.write_statistic('IPC', run.ipc)
self.write_statistic('MPC', run.mpc)
self.write_statistic('cycles', run.cycles)
self.write_statistic('instructions', run.instructions)
self.write_statistic('muops', run.muops)
for port, stat in run.port_muops.items():
# ignore ports that take <.1% of the instructions in the bench
if stat.mean < (run.muops.mean / 1000):
continue
self.write_statistic('port-' + str(port), stat)
print()
def main(argv):
# arch = ir.Architecture.for_name('x86')
......@@ -327,7 +295,7 @@ def main(argv):
with open(input_file) as fd:
try:
for lineno, line in enumerate(fd, 1):
run = Benchmark_Run_Summary.from_json(line)
run = Benchmark_Run.from_json(line)
if args.short:
# if len(run.kernel) > 1:
......@@ -339,21 +307,28 @@ def main(argv):
# continue
try:
num_muops = round(run.muops.mean / run.instructions.mean)
num_fused_muops = round(run.fused_muops.mean / run.instructions.mean)
except ZeroDivisionError:
num_fused_muops = math.nan
try:
num_unfused_muops = round(run.unfused_muops.mean / run.instructions.mean)
except ZeroDivisionError:
num_muops = math.nan
num_unfused_muops = math.nan
print(
run.name.ljust(100),
f'IPC={round(run.ipc.mean,1)}~{round(run.ipc.stddev,1)}',
f'MPC={round(run.mpc.mean,1)}~{round(run.mpc.stddev,1)}',
f'IPC={round(run.clean_IPC.mean,1)}~{round(run.clean_IPC.stddev,1)}',
f'fMPC={round(run.clean_fMPC.mean,1)}~{round(run.clean_fMPC.stddev,1)}',
f'uMPC={round(run.clean_uMPC.mean,1)}~{round(run.clean_uMPC.stddev,1)}',
'-',
'%3s MUOPS' % num_muops,
'%3s FUSED MUOPS' % num_fused_muops,
'%3s UNFUSED MUOPS' % num_unfused_muops,
'-',
' '.join(f'P{p}' for p in sorted(ports)),
)
else:
p.pretty_print_benchmark_run_summary(run)
p.pretty_print_benchmark_run(run)
except json.JSONDecodeError as e:
print(Pretty_Printer.C.red('JSON syntax error:'), input_file + f':{lineno}:', e)
......
......@@ -70,7 +70,7 @@ class Benchmark_Spec(yaml.YAML_Serializable):
self.kernel_iterations = kernel_iterations
self.arch = arch
self.instructions = instructions
self.loop_overhead = loop_overhead
self.loop_overhead = loop_overhead
@classmethod
def from_instructions(clss, *,
......@@ -273,30 +273,41 @@ class Event_Set_Run(yaml.YAML_Struct):
Run one event set over a benchmark
"""
cycles = yaml.Slot(Statistics)
instructions = yaml.Slot(Statistics)
muops = yaml.Slot(Statistics)
cycles = yaml.Slot(Statistics)
instructions = yaml.Slot(ty.Optional[Statistics])
fused_muops = yaml.Slot(ty.Optional[Statistics])
unfused_muops = yaml.Slot(ty.Optional[Statistics])
## IPC not counting instructions for the kernel loop
clean_IPC = yaml.Slot(Statistics)
clean_MPC = yaml.Slot(Statistics)
clean_IPC = yaml.Slot(ty.Optional[Statistics])
clean_fMPC = yaml.Slot(ty.Optional[Statistics])
clean_uMPC = yaml.Slot(ty.Optional[Statistics])
IPC = yaml.Slot(Statistics)
MPC = yaml.Slot(Statistics)
IPC = yaml.Slot(ty.Optional[Statistics])
fMPC = yaml.Slot(ty.Optional[Statistics])
uMPC = yaml.Slot(ty.Optional[Statistics])
port_muops = yaml.Slot(ty.Dict[int, Statistics], default={})
misc = yaml.Slot(ty.Dict[str, Statistics], default={})
def __init__(self, *, cycles, instructions, muops, clean_IPC, clean_MPC, IPC, MPC, port_muops = None, misc = None):
self.cycles = cycles
self.instructions = instructions
self.muops = muops
def __init__(self, *,
cycles,
instructions = None, IPC = None, clean_IPC = None,
fused_muops = None, clean_fMPC = None, fMPC = None,
unfused_muops = None, clean_uMPC = None, uMPC = None,
port_muops = None, misc = None):
self.cycles = cycles
self.instructions = instructions
self.fused_muops = fused_muops
self.unfused_muops = unfused_muops
self.clean_IPC = clean_IPC
self.clean_MPC = clean_MPC
self.clean_fMPC = clean_fMPC
self.clean_uMPC = clean_uMPC
self.IPC = IPC
self.MPC = MPC
self.fMPC = fMPC
self.uMPC = uMPC
self.port_muops = port_muops or {}
self.misc = misc or {}
......@@ -311,178 +322,250 @@ class Benchmark_Run(yaml.YAML_Struct):
timestamp = yaml.Slot(datetime.datetime)
runtime = yaml.Slot(datetime.timedelta)
cycles = yaml.Slot(Statistics)
instructions = yaml.Slot(Statistics)
muops = yaml.Slot(Statistics)
cycles = yaml.Slot(Statistics)
instructions = yaml.Slot(Statistics)
fused_muops = yaml.Slot(Statistics)
unfused_muops = yaml.Slot(Statistics)
## IPC not counting instructions for the kernel loop
clean_IPC = yaml.Slot(Statistics)
clean_MPC = yaml.Slot(Statistics)
## fused muops -//-
clean_fMPC = yaml.Slot(Statistics)
## unfused muops -//-
clean_uMPC = yaml.Slot(Statistics)
## instructions per cycle
IPC = yaml.Slot(Statistics)
MPC = yaml.Slot(Statistics)
## fused muops -//-
fMPC = yaml.Slot(Statistics)
## unfused muops -//-
uMPC = yaml.Slot(Statistics)
event_sets = yaml.Slot(ty.List[Event_Set_Run])
def __init__(self, *, benchmark, timestamp, runtime = datetime.timedelta(seconds = 0),
cycles, instructions, muops, clean_IPC, clean_MPC, IPC, MPC,
cycles,
instructions, IPC, clean_IPC,
fused_muops, clean_fMPC, fMPC,
unfused_muops, clean_uMPC, uMPC,
event_sets):
self.benchmark = benchmark
self.timestamp = timestamp
self.runtime = runtime
self.cycles = cycles
self.instructions = instructions
self.muops = muops
self.instructions = instructions
self.clean_IPC = clean_IPC
self.clean_MPC = clean_MPC
self.IPC = IPC
self.MPC = MPC
self.fused_muops = fused_muops
self.clean_fMPC = clean_fMPC
self.fMPC = fMPC
self.unfused_muops = unfused_muops
self.clean_uMPC = clean_uMPC
self.uMPC = uMPC
self.event_sets = event_sets
# @property
# def port_usage(self, port: int):
# """
# muops:port / total_muops
# """
## XXX
# return self.ports[port] / self.total_muops
@property
def name(self):
return self.benchmark.name
@property
def kernel(self):
return tuple(sorted(i.name for i in self.benchmark.instructions))
@dataclasses.dataclass(frozen=True)
class Benchmark_Run_Summary:
"""
Smaller version of benchmark run containing less information.
Intended for serialization from/to JSON for faster reading.
"""
@property
def ipc(self):
return self.clean_IPC
kernel: ty.Tuple[str, ...]
@property
def fmpc(self):
return self.clean_fMPC
ipc: Statistics
mpc: Statistics
instructions: Statistics
muops: Statistics
cycles: Statistics
@property
def umpc(self):
return self.clean_uMPC
port_muops: ty.Dict[int, Statistics] = dataclasses.field(default_factory=dict)
misc: ty.Dict[str, Statistics] = dataclasses.field(default_factory=dict)
@property
def port_muops(self):
return self.ports_usage()
runtime: float = math.nan
## XXX
@property
def name(self) -> str:
return Benchmark_Spec.name_from_instruction_names(self.kernel)
def num_fused_muops(self) -> int:
return round(self.clean_fMPC.mean / self.clean_IPC.mean)
@property
def num_muops(self) -> int:
return round(self.mpc.mean / self.ipc.mean)
def num_unfused_muops(self) -> int:
return round(self.clean_uMPC.mean / self.clean_IPC.mean)
def ports_usage(self, min_usage: float = 0.05) -> ty.Dict[int, Statistics]:
    """
    Collect the per-port muop statistics of all event sets of this run.

    A port is dropped when the ratio of its mean to the mean of the unfused
    muops of the whole run is below :min_usage: (a fraction in (0, 1]).
    """
    assert 0 < min_usage <= 1, min_usage

    usage = {}

    for event_set in self.event_sets:
        for port, stat in event_set.port_muops.items():
            assert port not in usage, port

            share = stat.mean / self.unfused_muops.mean

            ## written as a negated '<' so that a NaN share is kept, exactly like a skip-if-below test
            if not (share < min_usage):
                usage[port] = stat

    return usage
def ports_used(self, min_usage: float = 0.05) -> ty.FrozenSet[int]:
"""
Return set of ports where at least a fraction :min_usage: of all muops were executed.
"""
assert 0 < min_usage <= 1, min_usage
return frozenset(self.ports_usage(min_usage).keys())
return frozenset(p for p, stat in self.port_muops.items() if (stat.mean / self.muops.mean) >= min_usage)
def to_json(self) -> str:
    """
    Serialize this object to a JSON string.

    Every YAML slot is read from :self: and converted according to the declared
    type of the slot; the resulting dict is keyed by the YAML names of the slots.

    Raises TypeError for a slot type that has no conversion here.
    """
    record = {}

    for slot in self._yaml_slots_:
        key = slot.yaml_name
        assert key not in record

        value = getattr(self, slot.py_name)
        kind = slot.type

        if kind is Statistics:
            converted = value.to_jsonish()
        elif kind is Benchmark_Spec:
            converted = self._Benchmark_Spec_to_json(value)
        elif kind is datetime.datetime:
            converted = self._datetime_to_json(value)
        elif kind is datetime.timedelta:
            # durations are stored as a plain number of seconds
            converted = value.total_seconds()
        elif kind is ty.List[Event_Set_Run]:
            converted = [self._Event_Set_Run_to_json(run) for run in value]
        else:
            raise TypeError(kind)

        record[key] = converted

    return json.dumps(record)
@staticmethod
def from_benchmark_run(run: Benchmark_Run) -> 'Benchmark_Run_Summary':
port_muops = {}
misc = {}
def from_json(txt) -> 'Benchmark_Run':
data = json.loads(txt)
assert type(data) is dict
for m in run.event_sets:
for port, stat in m.port_muops.items():
assert port not in port_muops, port
out = {}
for slot in Benchmark_Run._yaml_slots_:
val = data[slot.yaml_name]
if slot.type is Statistics:
val = Statistics.from_jsonish(val)
elif slot.type is Benchmark_Spec:
val = Benchmark_Run._Benchmark_Spec_from_json(val)
elif slot.type is datetime.datetime:
val = Benchmark_Run._datetime_from_json(val)
elif slot.type is datetime.timedelta:
val = datetime.timedelta(seconds=val)
elif slot.type is ty.List[Event_Set_Run]:
val = [Benchmark_Run._Event_Set_Run_from_json(e) for e in val]
else:
raise TypeError(slot.type)
port_muops[port] = stat
out[slot.py_name] = val
for name, stat in m.misc.items():
assert name not in misc, name
return Benchmark_Run(**out)
misc[name] = stat
@staticmethod
def _Benchmark_Spec_to_json(spec) -> dict:
out = {
'name': spec.name,
'kind': spec.kind.name,
'unroll_factor': spec.unroll_factor,
'kernel_iterations': spec.kernel_iterations,
'arch': spec.arch.name,
'instructions': [i.name for i in spec.instructions],
'loop_overhead': {'instructions': spec.loop_overhead.instructions,
'muops': spec.loop_overhead.muops}
}
return out
return Benchmark_Run_Summary(
kernel = tuple(sorted(i.name for i in run.benchmark.instructions)),
@staticmethod
def _Benchmark_Spec_from_json(obj: dict) -> Benchmark_Spec:
arch = ir.Architecture.for_name(obj['arch'])
instruction_set = arch.instruction_set()
instructions = [instruction_set[i] for i in obj['instructions']]
runtime = run.runtime.total_seconds(),
ipc = run.clean_IPC,
mpc = run.clean_MPC,
instructions = run.instructions,
muops = run.muops,
cycles = run.cycles,
port_muops = port_muops,
misc = misc,
return Benchmark_Spec(
name = obj['name'],
kind = Benchmark_Kind[obj['kind']],
unroll_factor = obj['unroll_factor'],
kernel_iterations = obj['kernel_iterations'],
arch = arch,
instructions = instructions,
loop_overhead = Loop_Overhead(obj['loop_overhead']['instructions'],
obj['loop_overhead']['muops'],)
)
@staticmethod
def from_json(txt: str) -> 'Benchmark_Run_Summary':
data = json.loads(txt)
def _Event_Set_Run_to_json(event_set_run) -> dict:
d = {}
assert type(data) is dict
for slot in Event_Set_Run._yaml_slots_:
val = getattr(event_set_run, slot.py_name)
runtime = data.get('runtime', math.nan)
port_muops = {}
port_json = data.get('ports')
if port_json:
for port in range(8):
port_muops[port] = Statistics.from_json_summary(f'port{port}-muops', port_json)
misc = {}
misc_json = data.get('misc')
if misc_json:
for name, stat in misc_json.items():
misc[port] = Statistics.from_json_summary('', stat)
return Benchmark_Run_Summary(
kernel = tuple(data['kernel']),
runtime = runtime,
ipc = Statistics.from_json_summary('ipc', data),
mpc = Statistics.from_json_summary('mpc', data),
instructions = Statistics.from_json_summary('instructions', data),
muops = Statistics.from_json_summary('muops', data),
cycles = Statistics.from_json_summary('cycles', data),
port_muops = port_muops,
misc = misc,
)
if slot.type is Statistics:
val = val.to_jsonish()
elif slot.type is ty.Optional[Statistics]:
if val is not None:
val = val.to_jsonish()
elif slot.type is ty.Dict[int, Statistics]:
val = {k: s.to_jsonish() for k, s in val.items()}
elif slot.type is ty.Dict[str, Statistics]:
val = {k: s.to_jsonish() for k, s in val.items()}
def to_json(self) -> str:
out = {'kernel': self.kernel}
if not math.isnan(self.runtime):
out['runtime'] = self.runtime
self.ipc.to_json_summary('ipc', out)
self.mpc.to_json_summary('mpc', out)
self.instructions.to_json_summary('instructions', out)
self.muops.to_json_summary('muops', out)
self.cycles.to_json_summary('cycles', out)
if self.port_muops:
out['ports'] = ports = {}
for port, stat in self.port_muops.items():
stat.to_json_summary(f'port{port}-muops', ports)
if self.misc:
out['misc'] = misc = {}
for name, stat in self.misc.items():
assert type(name) is str, name
data = {}
stat.to_json_summary('', data)
misc[name] = data
d[slot.yaml_name] = val
return json.dumps(out)
return d
@staticmethod
def _Event_Set_Run_from_json(d: dict) -> Event_Set_Run:
out = {}
for slot in Event_Set_Run._yaml_slots_:
val = d.get(slot.yaml_name)
if slot.type is Statistics:
val = Statistics.from_jsonish(val)
elif slot.type is ty.Optional[Statistics]:
if val is not None:
val = Statistics.from_jsonish(val)
elif slot.type is ty.Dict[int, Statistics]:
val = {int(k): Statistics.from_jsonish(s) for k, s in val.items()}
elif slot.type is ty.Dict[str, Statistics]:
val = {k: Statistics.from_jsonish(s) for k, s in val.items()}
else:
raise TypeError(slot.type)
def __hash__(self):
return object.__hash__(self)
out[slot.py_name] = val
return Event_Set_Run(**out)
class Benchmark_Run_Summary_Aggregator:
@staticmethod
def _datetime_to_json(date) -> str:
return date.strftime('%Y-%m-%d %H:%M:%S.%f')
@staticmethod
def _datetime_from_json(date_str) -> datetime.datetime:
return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S.%f')
class Benchmark_Run_Aggregator:
"""
Helper for filtering & aggregating Benchmark_Run_Summary objects.
Helper for filtering & aggregating Benchmark_Run objects.
"""
DEFAULT_MIN_STDDEV = 0.0
......@@ -501,17 +584,18 @@ class Benchmark_Run_Summary_Aggregator:
self._measurements = collections.defaultdict(list)
def add_measurement(self, run: Benchmark_Run_Summary) -> bool:
assert type(run) is Benchmark_Run_Summary, type(run)
def add_measurement(self, run: Benchmark_Run) -> bool:
assert type(run) is Benchmark_Run, type(run)
## user provided filters
if run.ipc.num_samples < self.min_samples or run.mpc.num_samples < self.min_samples:
if run.ipc.num_samples < self.min_samples or run.fmpc.num_samples < self.min_samples or \
run.umpc.num_samples < self.min_samples:
return False
if run.ipc.stddev < self.min_stddev or run.mpc.stddev < self.min_stddev:
if run.ipc.stddev < self.min_stddev or run.fmpc.stddev < self.min_stddev or run.umpc.stddev < self.min_stddev:
return False
if run.ipc.stddev > self.max_stddev or run.mpc.stddev > self.max_stddev:
if run.ipc.stddev > self.max_stddev or run.fmpc.stddev > self.max_stddev or run.umpc.stddev > self.max_stddev:
return False
if self.predicate and self.predicate(run.kernel):
......@@ -521,8 +605,8 @@ class Benchmark_Run_Summary_Aggregator:
heapq.heappush(self._measurements[key], self.Item(run))
return True
def force_add(self, run: Benchmark_Run_Summary) -> bool:
assert type(run) is Benchmark_Run_Summary, type(run)
def force_add(self, run: Benchmark_Run) -> bool:
assert type(run) is Benchmark_Run, type(run)
key = self._normalize_key(run.kernel)
heapq.heappush(self._measurements[key], self.Item(run))
......@@ -536,11 +620,14 @@ class Benchmark_Run_Summary_Aggregator:
key = self._normalize_key(insts)
self._measurements.pop(key, None)
def __getitem__(self, key: ty.Sequence[str]) -> Benchmark_Run_Summary:
def __getitem__(self, key: ty.Sequence[str]) -> Benchmark_Run:
key = self._normalize_key(key)
return self._measurements[key][0].run
try:
return self._measurements[key][0].run
except IndexError:
raise KeyError(key)
def get(self, key: ty.Sequence[str]) -> Benchmark_Run_Summary:
def get(self, key: ty.Sequence[str]) -> Benchmark_Run:
key = self._normalize_key(key)
heap = self._measurements[key]
try:
......@@ -557,12 +644,12 @@ class Benchmark_Run_Summary_Aggregator:
key = self._normalize_key(key)
return key in self._measurements
def __iter__(self) -> ty.Iterable[Benchmark_Run_Summary]:
def __iter__(self) -> ty.Iterable[Benchmark_Run]:
for heap in self._measurements.values():
if heap:
yield heap[0].run
def all_measurements(self) -> ty.Iterable[Benchmark_Run_Summary]:
def all_measurements(self) -> ty.Iterable[Benchmark_Run]:
for heap in self._measurements.values():
for item in heap:
yield item.run
......@@ -589,9 +676,9 @@ class Benchmark_Run_Summary_Aggregator:
@dataclasses.dataclass(frozen=True)
class Item:
run: Benchmark_Run_Summary
run: Benchmark_Run
def __lt__(self, that):
# reverse comparison
return self.run.mpc.mean >= that.run.mpc.mean
return self.run.umpc.mean >= that.run.umpc.mean
This diff is collapsed.
import collections
import dataclasses
import enum
import math
import itertools
import functools
import typing as ty
class Kernel:
def __init__(self, ipc: float, mpc: float, insts: ty.Tuple['Inst', ...]):
def __init__(self, ipc: float, fmpc: float, umpc: float, insts: ty.Tuple['Inst', ...]):
assert all(type(i) is Inst for i in insts), insts
self._ipc = float(ipc)
self._mpc = float(mpc)
self._umpc = float(umpc)
self._fmpc = float(fmpc)
self._insts = insts
counts = collections.Counter(insts)
self._gcd = min(math.gcd(a, b) for a, b in itertools.product(counts.values(), repeat=2))
if not insts:
self._gcd = 1
else:
self._gcd = functools.reduce(math.gcd, counts.values(), counts[insts[0]])
@property
def ipc(self) -> float:
return self._ipc
@property
def mpc(self) -> float:
return self._mpc
def fmpc(self) -> float:
return self._fmpc
@property
def umpc(self) -> float:
return self._umpc
def count(self, inst) -> int:
cnt = self._insts.count(inst) / self._gcd
......@@ -35,7 +45,7 @@ class Kernel:
return iter(self._insts)
def __len__(self):
return len(self._insts)
return len(self._insts) // self._gcd
def __getitem__(self, idx):
return self._insts[idx]
......@@ -47,7 +57,11 @@ class Kernel:
return self._insts.items() < that._insts.items()
def __repr__(self) -> str:
return f'Kernel(ipc={self.ipc}, mpc={self.mpc}, insts=[' + ', '.join(repr(i) for i in self._insts) + '])'
return ''.join([
f'Kernel(ipc={self.ipc}, fmpc={self.fmpc}, ',
f'umpc={self.umpc}, ',
'insts=[' + ', '.join(repr(i) for i in self._insts) + '])'
])
def __str__(self) -> str:
seq = []
......@@ -67,39 +81,26 @@ class Kernel:
return 'K{' + ' '.join(x(i) for i in seq) + '}'
@dataclasses.dataclass(frozen=True, order=True)
class Inst:
def __init__(self, name, num_muops):
self._name = name
self._num_muops = num_muops
@property
def name(self) -> str:
return self._name
@property
def num_muops(self) -> int:
return self._num_muops
def __eq__(self, that):
if type(self) is not type(that):
return False
return self.name == that.name
def __hash__(self):
return hash(self.name)
def __lt__(self, that):
if type(self) is not type(that):
return NotImplemented
name: str
num_fused_muops: int
num_unfused_muops: int
return self.name < that.name
def __post_init__(self):
assert self.num_fused_muops <= self.num_unfused_muops
def __str__(self) -> str:
return f'I{{{self.name}}}'
def __repr__(self) -> str:
return f'{type(self).__name__}(name={self.name!r}, num_muops={self.num_muops!r})'
return ''.join([
type(self).__name__, '(',
f'name={self.name!r}, ',
f'num_fused_muops={self.num_fused_muops!r}, ',
f'num_unfused_muops={self.num_unfused_muops!r}',
')'
])
class Muop:
......@@ -142,9 +143,13 @@ class Port_Type(enum.Enum):
SLOWDOWN = 1
SPEEDUP = 2
@property
def is_real(self) -> bool:
return self is Port_Type.REAL
@property
def is_virtual(self) -> bool:
return bool(self.value)
return not self.is_real
class Port(ty.NamedTuple):
......@@ -152,6 +157,10 @@ class Port(ty.NamedTuple):
type: Port_Type = Port_Type.REAL
max_throughput: float = 1.0
@property
def is_real(self) -> bool:
return self.type.is_real
@property
def is_virtual(self) -> bool:
return self.type.is_virtual
......
import json
import re
__all__ = [
'load',
'remove_comments',
]
# Finds a '#' or '//' marker (and everything after it) anywhere in a line.
RE_COMMENT = re.compile(r'\s*(#|//).*$')
# Matches a line that consists of optional whitespace followed by a comment.
RE_COMMENT_LINE = re.compile(r'^\s*(#|//).*$')
# Matches the tail of a line; group 1 is the part that is kept, the remainder
# is optional whitespace plus an optional '#' / '//' comment up to the line end.
RE_INLINE_COMMENT = re.compile(r'(:?(?:\s)*([A-Za-z\d\.{}]*)|((?<=\").*\"),?)(?:\s)*(((#|(//)).*)|)$')


def remove_comments(txt: str) -> str:
    """
    Strip '#' and '//' comments from :txt:, working line by line.

    A line that only holds a comment is replaced by an empty line instead of
    being dropped, so the remaining lines keep their line numbers.
    The processed lines are joined with single newline characters.
    """
    def strip_line(line: str) -> str:
        # lines without any comment marker are passed through untouched
        if not RE_COMMENT.search(line):
            return line
        if RE_COMMENT_LINE.match(line):
            return ""
        if RE_INLINE_COMMENT.search(line):
            return RE_INLINE_COMMENT.sub(r'\1', line)
        return line

    return '\n'.join(strip_line(line) for line in txt.splitlines())
def load(fp, *, allow_comments: bool = True, **kwargs) -> object:
    """
    Load JSON from the file object :fp:.

    With :allow_comments: set (the default) the whole file is read, run
    through remove_comments() and parsed with json.loads().
    Otherwise :fp: is handed to json.load() unchanged.
    Any extra keyword arguments are forwarded to the JSON parser.
    """
    if not allow_comments:
        return json.load(fp, **kwargs)

    return json.loads(remove_comments(fp.read()), **kwargs)
......@@ -37,8 +37,8 @@ class Statistics(yaml.YAML_Struct):
MAD = yaml.Slot(float, math.nan)
min = yaml.Slot(float, math.nan)
max = yaml.Slot(float, math.nan)
percentiles = yaml.Slot(ty.Dict[int, float])
histogram = yaml.Slot(ty.List[int])
percentiles = yaml.Slot(ty.Dict[int, float], dict)
histogram = yaml.Slot(ty.List[int], list)
def __init__(self, *,
mean: float, stddev: float, variance: float = math.nan,
......@@ -341,52 +341,27 @@ class Statistics(yaml.YAML_Struct):
return hist
def to_json_summary(self, prefix: str, dst: dict):
"""
Extract information for JSON serialization in a Benchmark_Run_Summary.
Add fields from :self: into dict :dst:.
If :prefix: is non-empty keys are prefixed with :prefix: + '-'.
"""
def to_jsonish(self) -> dict:
d = {}
for slot in self._yaml_slots_:
assert slot.yaml_name not in d
if prefix:
prefix += '-'
dst[prefix + 'num-samples'] = self.num_samples
dst[prefix + 'mean'] = self.mean
dst[prefix + 'stddev'] = self.stddev
dst[prefix + 'variance'] = self.variance
dst[prefix + 'min'] = self.min
dst[prefix + 'max'] = self.max
dst[prefix + 'p10'] = self.p10
dst[prefix + 'p25'] = self.p25
dst[prefix + 'p50'] = self.p50
dst[prefix + 'p75'] = self.p75
dst[prefix + 'p90'] = self.p90
d[slot.yaml_name] = getattr(self, slot.py_name)
return d
@staticmethod
def from_json_summary(prefix: str, src: dict) -> 'Statistics':
"""
Create Statistics object from JSON serialized data.
def from_jsonish(src: dict) -> 'Statistics':
out = {}
Reads keys prefixed with :prefix: + '-' (if prefix is non-empty).
"""
for slot in Statistics._yaml_slots_:
try:
val = src[slot.yaml_name]
if prefix:
prefix += '-'
if slot.type is ty.Dict[int, float]:
val = {int(k): v for k, v in val.items()}
except KeyError:
val = slot.default
return Statistics(
num_samples = src.get(prefix + 'num-samples', math.nan),
mean = src.get(prefix + 'mean', math.nan),
stddev = src.get(prefix + 'stddev', math.nan),
variance = src.get(prefix + 'variance', math.nan),
min = src.get(prefix + 'min', math.nan),
max = src.get(prefix + 'max', math.nan),
percentiles = {
10: src.get(prefix + 'p10', math.nan),
25: src.get(prefix + 'p25', math.nan),
50: src.get(prefix + 'p50', math.nan),
75: src.get(prefix + 'p75', math.nan),
90: src.get(prefix + 'p90', math.nan),
},
)
out[slot.py_name] = val
return Statistics(**out)
......@@ -72,6 +72,16 @@ class YAML_Serializer(abc.ABC, ty.Generic[T]):
clss.for_type(args[1]),
)
if origin is ty.Union and len(want.__args__) == 2:
args = want.__args__
assert args[0] is not type(None)
assert args[1] is type(None)
return Optional_Serializer(
clss.for_type(args[0])
)
# base types
if want is str:
return Str_Serializer()
......@@ -166,6 +176,28 @@ class Dict_Serializer(ty.Generic[K, V], YAML_Serializer[ty.Dict[K, V]]):
return construct_dict(node, self.key, self.val)
class Optional_Serializer(ty.Generic[E], YAML_Serializer[ty.Optional[E]]):
    """
    Serializer for a typing.Optional[E] value (i.e. either None or an E).

    None is written as a YAML null node; every other value is delegated to
    the wrapped serializer for E.
    """

    def __init__(self, value: YAML_Serializer[E]):
        # serializer used for all values that are not None
        self.value = value

    def to_yaml(self, obj):
        """ Represent :obj: as a YAML node (a null node for None). """
        if obj is None:
            return Representer().represent_none(None)
        return self.value.to_yaml(obj)

    def from_yaml(self, node):
        """ Return None for a null scalar node, else the value parsed from :node:. """
        # Decide by tag only. The YAML resolver tags '~', 'Null', 'NULL' and the
        # empty scalar as null as well, so insisting on the literal text 'null'
        # would reject valid hand-written null values.
        if type(node) is yaml.ScalarNode and node.tag == 'tag:yaml.org,2002:null':
            return None
        return self.value.from_yaml(node)
class Enum_Serializer(ty.Generic[E], YAML_Serializer[E]):
"""
Serializer for a enum.Enum enumeration class.
......@@ -266,7 +298,7 @@ class YAML_Struct_Serializer(YAML_Serializer):
def yaml_items():
for slot in self.struct._yaml_slots_:
key = represent_str(slot.yaml_name)
val = slot.type.to_yaml(slot.__get__(obj))
val = slot.serializer.to_yaml(slot.__get__(obj))
assert val is not None, repr(slot.type)
......@@ -291,7 +323,7 @@ class YAML_Struct_Serializer(YAML_Serializer):
if slot is None:
raise yaml.constructor.ConstructorError('invalid field ' + repr(k) + ' in ' + self.struct.__name__)
v = slot.type.from_yaml(v)
v = slot.serializer.from_yaml(v)
kwargs[slot.py_name] = v
......@@ -305,9 +337,19 @@ class Slot:
NO_DEFAULT = object()
def __init__(self, type, default = NO_DEFAULT):
self._type = YAML_Serializer.for_type(type)
self._default = default
def __init__(self, type_, default = NO_DEFAULT):
self._type = type_
self._serializer = YAML_Serializer.for_type(type_)
if default is self.NO_DEFAULT:
self._default = self._fail_no_default
self._has_default = False
else:
if type(default) is type:
self._default = default
else:
self._default = lambda: default
self._has_default = True
def __set_name__(self, owner, name):
self._py_name = name
......@@ -322,12 +364,13 @@ class Slot:
def __set__(self, instance, value):
instance.__dict__[self._py_name] = value
@property
def has_default(self) -> bool:
"""
Check if this descriptor has a default value.
"""
return self.default is not self.NO_DEFAULT
return self._has_default
def set_default(self, instance):
"""
......@@ -351,9 +394,16 @@ class Slot:
def type(self):
return self._type
@property
def serializer(self):
return self._serializer
@property
def default(self):
return self._default
return self._default()
def _fail_no_default(self):
raise ValueError(f'Slot {self.yaml_name!r} has no default')
def load(serializer: YAML_Serializer, stream: ty.IO[str]) -> YAML_Serializable:
......
......@@ -5,7 +5,7 @@
into a condensed JSON format that is MUCH faster to parse.
Prints one JSON dict on a single line per benchmark run.
Use pipedream.benchmark.types.Benchmark_Run_Summary to read/write this JSON format.
Use pipedream.benchmark.types.Benchmark_Run to read/write this JSON format.
"""
import argparse
......@@ -18,7 +18,7 @@ except ImportError:
sys.path.append(str(pathlib.Path(__file__).parent.parent / 'src'))
import pipedream.utils.yaml as yaml
from pipedream.benchmark.types import Benchmark_Run, Benchmark_Run_Summary
from pipedream.benchmark.types import Benchmark_Run
def main():
......@@ -38,9 +38,7 @@ def main():
with open(F) as fd:
try:
for run in yaml.load_all(Benchmark_Run.yaml_serializer(), fd):
summary = Benchmark_Run_Summary.from_benchmark_run(run)
print(summary.to_json(), file=out)
print(run.to_json(), file=out)
except yaml.YAMLError as e:
print('error: malformed file', repr(F) + ':', e, file=sys.stderr)
......
......@@ -3,7 +3,7 @@
"""
Aggregate & filter a stream on benchmark measurements.
Reads & writes Benchmark_Run_Summary records in JSONl format.
Reads & writes Benchmark_Run records in JSONl format.
"""
import argparse
......@@ -18,7 +18,7 @@ except ImportError:
import pipedream.benchmark.common as common
import pipedream.asm.ir as ir
from pipedream.benchmark.types import Benchmark_Run_Summary, Benchmark_Run_Summary_Aggregator
from pipedream.benchmark.types import Benchmark_Run, Benchmark_Run_Aggregator
def main():
......@@ -31,21 +31,21 @@ def main():
parser.add_argument(
'--min-stddev',
type=float, default=Benchmark_Run_Summary_Aggregator.DEFAULT_MIN_STDDEV,
type=float, default=Benchmark_Run_Aggregator.DEFAULT_MIN_STDDEV,
help="""
Benchmark runs with an IPC/MPC stddev lower than this are ignored.
""",
)
parser.add_argument(
'--max-stddev',
type=float, default=Benchmark_Run_Summary_Aggregator.DEFAULT_MAX_STDDEV,
type=float, default=Benchmark_Run_Aggregator.DEFAULT_MAX_STDDEV,
help="""
Benchmark runs with an IPC/MPC stddev higher than this are ignored.
""",
)
parser.add_argument(
'--min-samples',
type=float, default=Benchmark_Run_Summary_Aggregator.DEFAULT_MIN_SAMPLES,
type=float, default=Benchmark_Run_Aggregator.DEFAULT_MIN_SAMPLES,
help="""
Benchmark runs with an IPC/MPC sample count lower than this are ignored.
""",
......@@ -75,7 +75,7 @@ def main():
else:
predicate = lambda kernel: False
measurements = Benchmark_Run_Summary_Aggregator(
measurements = Benchmark_Run_Aggregator(
min_stddev=args.min_stddev,
max_stddev=args.max_stddev,
min_samples=args.min_samples,
......@@ -85,7 +85,7 @@ def main():
for file in args.FILE:
with open(file) as fd:
for line in fd:
run = Benchmark_Run_Summary.from_json(line)
run = Benchmark_Run.from_json(line)
measurements.add_measurement(run)
......
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment