XXX: did too many things again.

3813e316 · GRUBER Fabian · 424f870a · 3813e316 · 3813e316 · 3813e316
Commit 3813e316 authored 6 years ago by GRUBER Fabian
--- a/src/pipedream/asm/asmwriter.py
+++ b/src/pipedream/asm/asmwriter.py
@@ -54,6 +54,10 @@ class ASM_Writer(abc.ABC):
  def end_function(self, function_name: str):
    """ Emit code to end a function """

+  @abc.abstractmethod
+  def global_byte_array(self, name: str, size: int, alignment: int):
+    """ Emit directives to declare a global byte array with the given name, size and alignment. """
+
  def print(self, indent, *args):
    if args:
      if indent:

--- a/src/pipedream/asm/x86/__init__.py
+++ b/src/pipedream/asm/x86/__init__.py
@@ -88,7 +88,7 @@ class X86_Architecture(Architecture):

    ## muops
    muops_init          = 1
-    # the subq+jne at the end of the loop are fused
+    # the subq+jne at the end of the loop are macro-fused
    muops_per_iteration = num_iterations
    muops_total         = muops_init + muops_per_iteration

@@ -125,7 +125,7 @@ class X86_IR_Builder(IR_Builder):
  @abc.override
  def get_scratch_register(self, idx: int) -> 'Register':
    return {
-      0: R13, 1: R14, 2: R15,
+      0: R12, 1: R13, 2: R14, 3: R15,
    }[idx]

  @abc.override
@@ -139,24 +139,22 @@ class X86_IR_Builder(IR_Builder):
    if address_width == 32:
      assert EBP in free_regs
      candidates = BASE_REGISTER_32
-      candidates = set([EBP])
+      candidates = set([EBP, EDI, ESI])
    elif address_width == 64:
      assert RBP in free_regs
      candidates = BASE_REGISTER_64
-      candidates = set([RBP])
+      candidates = set([RBP, RDI, RSI])
    else:
      raise NotImplementedError(f'TODO: {address_width}-bit addressing')

    candidates = set(candidates) & free_regs

-    # used in sequentialize CPU
-    candidates -= set([AL, AH, AX, EAX, RAX,
-                       BL, BH, BX, EBX, RBX,
-                       CL, CH, CX, ECX, RCX,
-                       DL, DH, DX, EDX, RDX,
-                       ])
-    # scratch registers
-    candidates -= set([R13, R14, R15])
+    # # used in sequentialize CPU
+    # candidates -= set([AL, AH, AX, EAX, RAX,
+    #                    BL, BH, BX, EBX, RBX,
+    #                    CL, CH, CX, ECX, RCX,
+    #                    DL, DH, DX, EDX, RDX,
+    #                    ])
    # do not clobber stack
    candidates -= set([SPL, SP, ESP, RSP])

@@ -173,7 +171,7 @@ class X86_IR_Builder(IR_Builder):

        candidates -= set([reg])

-    assert candidates
+    assert candidates, [candidates, sorted(set([r.widest for r in free_regs]))]

    reg = sorted(candidates)[0]

@@ -303,14 +301,22 @@ class X86_IR_Builder(IR_Builder):
  def emit_put_const_in_register(self, const: int, reg: Register) -> ty.List[Instruction]:
    assert reg in GPR, f'can only do GPR for now ({reg})'

+    assert type(const) in (int, Label)
+    assert type(const) != int or 0 <= const <= (2 ** 64) - 1
+
    reg32 = reg.as_width(32)
+    reg64 = reg.as_width(64)

    if const == 0:
      i = self._set_registers(Harness.XOR_GPR32, 'src', reg32, 'src-dst', reg32)
-    else:
+    elif type(const) is int and const <= 2 ** 32 - 1:
      i = Harness.MOV_IMM32_GPR32
      i = self._set_value(i, 'src', const)
      i = self._set_register(i, 'dst', reg32)
+    else:
+      i = Harness.MOV_IMM64_GPR64
+      i = self._set_value(i, 'src', const)
+      i = self._set_register(i, 'dst', reg64)
    return [i]

  @abc.override

--- a/src/pipedream/asm/x86/asmwriter.py
+++ b/src/pipedream/asm/x86/asmwriter.py
@@ -6,6 +6,7 @@ from pipedream.asm.x86.registers import *
 from pipedream.asm.x86.operands  import *

 import functools
+import math
 import typing as ty


@@ -68,6 +69,32 @@ class X86_ASM_Writer(ASM_Writer):
        print('#', inst)
        raise

+  @abc.override
+  def global_byte_array(self, name: str, size: int, alignment: int):
+    ## this is just copied/adapted from the ASM emitted by GCC
+
+    assert type(name) is str
+    assert type(size) is int
+    assert type(alignment) is int
+
+    assert name and name.isprintable()
+    assert size >= 0
+    assert alignment >= 0
+    assert math.log2(alignment).is_integer()
+
+    ## assume we are in text section
+    # self.print(1, '.text')
+    ## switch to BSS & emit array
+    self.print(1, '.bss')
+    self.print(1, '.global', name)
+    self.print(1, '.align', alignment)
+    self.print(1, '.type', name + ',', '@object')
+    self.print(1, '.size', name + ',', size)
+    self.print(0, name + ':')
+    self.print(1, '.zero', size)
+    ## switch back to text section
+    self.print(1, '.text')
+
  @abc.override
  def comment(self, *args):
    self.print(1, '#', *args)

--- a/src/pipedream/asm/x86/instructions.py
+++ b/src/pipedream/asm/x86/instructions.py
@@ -144,6 +144,7 @@ class X86_Instruction_Set(ir.Instruction_Set):

 i64 = 'i64'
 i32 = 'i32'
+i8  = 'i8'
 USE, DEF, USE_DEF = ir.Use_Def
 R, W, RW = USE, DEF, USE_DEF

@@ -174,7 +175,7 @@ INTEL_MNEMONIC_CANNOT_BENCHMARK = frozenset([
 def mk_inst(*, name: str, att_mnemonic: str, intel_mnemonic: str,
            operands: ty.List, tags: ty.Set[str],
            isa_set: str, isa_extension: str = None,
-            can_benchmark = True):
+            can_benchmark = None):
  # print('mk_inst', name, att_mnemonic, intel_mnemonic,
  #       list(op() for op in operands),
  #       tags, isa_set, isa_extension,
@@ -187,7 +188,8 @@ def mk_inst(*, name: str, att_mnemonic: str, intel_mnemonic: str,
  operands = tuple(mk_op() for mk_op in operands)

  ## filter out instructions we currently do not support
-  can_benchmark = pipedream_asm_backend_can_handle(isa_set, intel_mnemonic, att_mnemonic, tags, operands)
+  if can_benchmark is None:
+    can_benchmark = pipedream_asm_backend_can_handle(isa_set, intel_mnemonic, att_mnemonic, tags, operands)

  inst = X86_Instruction(name, att_mnemonic, intel_mnemonic, isa_set, operands, tags, can_benchmark)
  ATT_MNEMONICS[name] = att_mnemonic
@@ -364,7 +366,7 @@ def make_reg_op(*, name: str, reg_class: ir.Register_Class, reg: X86_Register =


 def make_imm_op(*, name: str, imm_bits: int,
-                type, elems: int, visibility: ir.Operand_Visibility):
+                type, elems: int, value: int = None, visibility: ir.Operand_Visibility):
  if type[0] == 'i':
    clss = {
      8: Imm8,
@@ -382,7 +384,7 @@ def make_imm_op(*, name: str, imm_bits: int,
  else:
    raise ValueError('invalid type for immediate operand: ' + repr(type))

-  return lambda: clss(name, visibility)
+  return lambda: clss(name, visibility, value)


 def make_flags_op(*, name: str, reg: X86_Register, read: X86_Flags, write: X86_Flags,
@@ -509,6 +511,7 @@ class Harness:
  IMUL_IMM_GPR64  = INSTRUCTIONS['IMUL_GPR64i64_GPR64i64_IMMi32']
  MOV_GPR64       = INSTRUCTIONS['MOV_GPR64i64_GPR64i64']
  MOV_IMM32_GPR32 = INSTRUCTIONS['MOV_GPR32i32_IMMi32']
+  MOV_IMM64_GPR64 = INSTRUCTIONS['MOV_GPR64i64_IMMi64']
  SUB_IMM8_GPR64  = INSTRUCTIONS['SUB_GPR64i64_IMMi8']
  TEST_GPR64      = INSTRUCTIONS['TEST_GPR64i64_GPR64i64']
  XOR_GPR32       = INSTRUCTIONS['XOR_GPR32i32_GPR32i32']
@@ -601,6 +604,22 @@ class Harness:
    can_benchmark  = False,
  )

+  # zero byte jump
+  JMP_E9_0 = mk_inst(
+    name           = 'JMP_0',
+    # att_mnemonic   = '.byte 0xeb, 0',
+    att_mnemonic   = '.byte 0xe9, 0, 0, 0, 0',
+    intel_mnemonic = 'XXX',
+    isa_set        = instructions_xed.ISA.I86,
+    operands       = [
+      # TODO: relbr operand type
+      # make_imm_op(name='dst', imm_bits=8, type=i8, elems=1, value=ir.Label('0'), visibility=EXPLICIT),
+      make_reg_op(name='ip', reg=RIP, reg_class=RC_RIP, action=RW, type=i64, elems=1, visibility=SUPPRESSED),
+    ],
+    tags           = ['branch', 'conditional-branch', 'relative-branch'],
+    can_benchmark  = True,
+  )
+

 tmp = collections.OrderedDict()
 for inst in sorted(INSTRUCTIONS.values(), key=lambda i: i.name):

--- a/src/pipedream/benchmark/common.py
+++ b/src/pipedream/benchmark/common.py
@@ -114,23 +114,6 @@ class _Benchmark_Runner:
      tmp_dir         = tmp_dir,
    )

-    max_memory_size = 0
-
-    for b in benchmarks:
-      memory_size = pseudoalloc.MaximizeDepsPseudoAllocator.memory_arena_size(b.instructions) * b.unroll_factor
-
-      max_memory_size = max(max_memory_size, memory_size)
-
-    memory_arena_size = 2 * max_memory_size + 2 * 4096
-    memory_arena_base = (ctypes.c_char * memory_arena_size)()
-
-    ## make sure there is some space before the arena
-    offset = 4096
-    ## align to 64
-    offset = offset + 64 - ctypes.addressof(memory_arena_base) % 64
-
-    memory_arena_ptr = ctypes.cast(ctypes.byref(memory_arena_base, offset), ctypes.POINTER(ctypes.c_char))
-
    for i, benchmark in enumerate(bench_lib.benchmarks, 1):
      result = self._run_benchmark(
        benchmark_index       = i,
@@ -139,7 +122,6 @@ class _Benchmark_Runner:
        perf_counters         = perf_counters,
        benchmark_lib         = bench_lib,
        benchmark             = benchmark,
-        memory_arena          = memory_arena_ptr,
        num_iterations        = num_iterations,
        num_warmup_iterations = num_warmup_iterations,
        outlier_low           = outlier_low,
@@ -504,7 +486,6 @@ class _Benchmark_Runner:
    papi_event_set = out.allocate_argument(0)
    num_events     = out.allocate_argument(1)
    results        = out.allocate_argument(2)
-    memory_arena   = out.allocate_argument(3)

    out.newline()
    out.comment('*' * 70)
@@ -516,37 +497,52 @@ class _Benchmark_Runner:
    out.comment('ARG papi_event_set ', papi_event_set)
    out.comment('ARG num_events     ', num_events)
    out.comment('ARG results        ', results)
-    out.comment('ARG memory_arena   ', memory_arena)

    out.comment('free callee-saves for kernel')
    out.push_callee_saves()

    out.newline()
-    out.comment('papi_event_set and results are alive in the kernel')

    SCRATCH_REG_1 = out.scratch_register(0)
    SCRATCH_REG_2 = out.scratch_register(1)
    SCRATCH_REG_3 = out.scratch_register(2)
+    SCRATCH_REG_4 = out.scratch_register(3)
+
+    from pipedream.asm.x86 import RDX
+    assert results is RDX

    out.comment('papi_event_set -> ', SCRATCH_REG_1)
    papi_event_set = out.move_to(papi_event_set, SCRATCH_REG_1)
-    out.comment('results -> ', SCRATCH_REG_2)
-    results        = out.move_to(results,        SCRATCH_REG_2)
+    out.comment('num_events     -> ', SCRATCH_REG_2)
+    num_events     = out.move_to(num_events, SCRATCH_REG_2)
+    out.comment('results        -> ', SCRATCH_REG_3)
+    results        = out.move_to(results, SCRATCH_REG_3)

-    LOOP_COUNTER = SCRATCH_REG_3
+    LOOP_COUNTER = SCRATCH_REG_4

    out.comment('size of one row of results table in bytes')
    STRIDE = out.mul_reg_with_const(num_events, 8)

-    if any(i.has_memory_operand() for i in benchmark.instructions):
-      # FIXME: other address sizes
-      MEMORY_REG = out.ir_builder.select_memory_base_register(benchmark.instructions, set(out.free_registers()), 64)
+    need_memory: bool = any(i.has_memory_operand() for i in benchmark.instructions)
+
+    if need_memory:
+      MEMORY_ARENA = ir.Label(self._MEMORY_ARENA + '@GOTPCREL(%rip)')
+
+      out.comment('clear memory arena')
+      ## void *memset(void *s, int c, size_t n);
+      s = out.get_argument_register(0)
+      c = out.get_argument_register(1)
+      n = out.get_argument_register(2)

-      out.comment('memory_arena -> ', MEMORY_REG)
-      out.move_to(memory_arena, MEMORY_REG)
-    else:
-      out.free_reg(memory_arena)
-      MEMORY_REG = None
+      out.put_const_in_register(MEMORY_ARENA, s)
+      out.put_const_in_register(0, c)
+      out.put_const_in_register(self.memory_size(benchmark), n)
+
+      out.call('memset@PLT', s, c, n)
+
+      out.free_reg(s)
+      out.free_reg(c)
+      out.free_reg(n)

    with out.counting_loop('measurement', LOOP_COUNTER, num_iterations) as loop:
      out.comment('push loop counter')
@@ -571,6 +567,14 @@ class _Benchmark_Runner:

      out.sequentialize_cpu()

+      if need_memory:
+        # FIXME: other address sizes
+        MEMORY_REG = out.ir_builder.select_memory_base_register(benchmark.instructions, set(out.free_registers()), 64)
+        out.take_reg(MEMORY_REG)
+        out.put_const_in_register(MEMORY_ARENA, MEMORY_REG)
+      else:
+        MEMORY_REG = None
+
      ## allow backend to reserve some registers.
      out.steal_benchmark_registers(benchmark.instructions)

@@ -688,6 +692,9 @@ class _Benchmark_Runner:

    return out.take_code(), kernel_instructions

+  _MEMORY_ARENA = '_memory_arena_'
+  _PAGE_SIZE    = 4096
+
  def _gen_benchmark_lib(self, *,
                         benchmark_specs: ty.List[Benchmark_Spec],
                         architecture: ir.Architecture,
@@ -736,6 +743,30 @@ class _Benchmark_Runner:

        benchmark_functions[benchmark] = fn_name

+      ## calculate size of memory arena
+
+      memory_arena_size = 0
+
+      for b in benchmark_specs:
+        memory_size = pseudoalloc.MaximizeDepsPseudoAllocator.memory_arena_size(b.instructions) * b.unroll_factor
+
+        memory_arena_size = max(memory_arena_size, memory_size)
+
+      PAGE_SIZE = self._PAGE_SIZE
+
+      ## round up to a multiple of page size
+      memory_arena_size = memory_arena_size + PAGE_SIZE - memory_arena_size % PAGE_SIZE
+
+      ## add a padding page
+      memory_arena_size += PAGE_SIZE
+
+      ## why not
+      memory_arena_size *= 2
+
+      asm_writer.global_byte_array(self._MEMORY_ARENA + 'pad_before_', memory_arena_size, 4096)
+      asm_writer.global_byte_array(self._MEMORY_ARENA, memory_arena_size, 4096)
+      asm_writer.global_byte_array(self._MEMORY_ARENA + 'pad_after_', memory_arena_size, 4096)
+
      asm_writer.end_file(asm_file)

    self.info('assemble benchmark library')
@@ -748,7 +779,7 @@ class _Benchmark_Runner:
    )

    lib = ctypes.cdll.LoadLibrary(lib_file)
-    os.unlink(lib_file)
+    # os.unlink(lib_file)

    return _Benchmark_Lib(
      lib,
@@ -762,7 +793,6 @@ class _Benchmark_Runner:
                     perf_counters: 'Perf_Counter_Spec',
                     benchmark_lib: '_Benchmark_Lib',
                     benchmark: Benchmark_Spec,
-                     memory_arena: ctypes.c_char_p,
                     num_iterations: int, num_warmup_iterations: int,
                     outlier_low: int = 0, outlier_high: int = 100) -> Benchmark_Run:
    assert type(outlier_low) is int and 0 <= outlier_low <= 100
@@ -772,15 +802,18 @@ class _Benchmark_Runner:
    outlier_low  = fractions.Fraction(outlier_low) / 100
    outlier_high = fractions.Fraction(outlier_high) / 100

-    total_muops  = []
-    total_insts  = []
-    total_cycles = []
+    total_cycles         = []
+    total_insts          = []
+    total_fused_muops    = []
+    total_unfused_muops  = []

-    total_clean_IPC = []
-    total_clean_MPC = []
+    total_clean_IPC  = []
+    total_clean_fMPC = []
+    total_clean_uMPC = []

-    total_IPC = []
-    total_MPC = []
+    total_IPC  = []
+    total_fMPC = []
+    total_uMPC = []

    benchmark_fn = benchmark_lib.benchmark_function(benchmark)
    event_sets   = []
@@ -809,7 +842,6 @@ class _Benchmark_Runner:
        event_set_id,
        num_events,
        result_array.ctypes.data_as(ctypes.POINTER(ctypes.c_longlong)),
-        memory_arena,
      )
      time_after = time.perf_counter()

@@ -835,7 +867,6 @@ class _Benchmark_Runner:
      result_array = result_array[index_array]

      # drop outlier values (below/above lo/hi percentiles)
-
      lo = round(len(result_array) * outlier_low)
      hi = round(len(result_array) * outlier_high)

@@ -845,42 +876,95 @@ class _Benchmark_Runner:

      assert len(result_array)

-      muops  = column_by_name(perf_counters.uop_counter(), result_array)
-      insts  = column_by_name(perf_counters.instruction_counter(), result_array)
      cycles = column_by_name(perf_counters.cycle_counter(), result_array)

      assert cycles.max() > 0

+      total_cycles.append(cycles)
+
      overhead = benchmark.arch.loop_overhead(benchmark.kernel_iterations)

-      # this assumes that the loop control muops
-      # do not contend significantly with the benchmark kernel.
-      clean_IPC = (insts - overhead.instructions) / cycles
-      clean_MPC = (muops - overhead.muops) / cycles
+      try:
+        muops_idx = evt_set.index_of(perf_counters.unfused_uop_counter())
+      except IndexError:
+        unfused_muops = None
+        uMPC          = None
+        clean_uMPC    = None
+      else:
+        muops = result_array[:, muops_idx]

-      IPC = insts / cycles
-      MPC = muops / cycles
+        # this assumes that the loop control muops
+        # do not contend significantly with the benchmark kernel.
+        clean_MPC = (muops - overhead.muops) / cycles

-      total_muops.append(muops)
-      total_insts.append(insts)
-      total_cycles.append(cycles)
+        MPC = muops / cycles

-      total_clean_IPC.append(clean_IPC)
-      total_clean_MPC.append(clean_MPC)
+        total_unfused_muops.append(muops)
+        total_clean_uMPC.append(clean_MPC)
+        total_uMPC.append(MPC)
+
+        unfused_muops = Statistics.from_array(muops)
+        uMPC          = Statistics.from_array(MPC)
+        clean_uMPC    = Statistics.from_array(clean_MPC)
+
+      try:
+        muops_idx = evt_set.index_of(perf_counters.fused_uop_counter())
+      except IndexError:
+        fused_muops = None
+        fMPC        = None
+        clean_fMPC  = None
+      else:
+        muops = result_array[:, muops_idx]

-      total_IPC.append(IPC)
-      total_MPC.append(MPC)
+        # this assumes that the loop control muops
+        # do not contend significantly with the benchmark kernel.
+        clean_MPC = (muops - overhead.muops) / cycles
+
+        MPC = muops / cycles
+
+        total_fused_muops.append(muops)
+        total_clean_fMPC.append(clean_MPC)
+        total_fMPC.append(MPC)
+
+        fused_muops = Statistics.from_array(muops)
+        fMPC        = Statistics.from_array(MPC)
+        clean_fMPC  = Statistics.from_array(clean_MPC)
+
+      try:
+        insts_idx = evt_set.index_of(perf_counters.instruction_counter())
+      except IndexError:
+        instructions = None
+        IPC          = None
+        clean_IPC    = None
+      else:
+        insts = result_array[:, insts_idx]
+
+        clean_IPC = (insts - overhead.instructions) / cycles
+
+        IPC = insts / cycles
+
+        total_insts.append(insts)
+        total_clean_IPC.append(clean_IPC)
+        total_IPC.append(IPC)
+
+        instructions = Statistics.from_array(insts)
+        IPC          = Statistics.from_array(IPC)
+        clean_IPC    = Statistics.from_array(clean_IPC)

      measurement = Event_Set_Run(
-        cycles       = Statistics.from_array(cycles),
-        instructions = Statistics.from_array(insts),
-        muops        = Statistics.from_array(muops),
+        cycles = Statistics.from_array(cycles),

-        clean_IPC = Statistics.from_array(clean_IPC),
-        clean_MPC = Statistics.from_array(clean_MPC),
+        unfused_muops = unfused_muops,
+        uMPC          = uMPC,
+        clean_uMPC    = clean_uMPC,

-        IPC=Statistics.from_array(IPC),
-        MPC=Statistics.from_array(MPC),
+        fused_muops  = fused_muops,
+        fMPC         = fMPC,
+        clean_fMPC   = clean_fMPC,
+
+        instructions = instructions,
+        IPC          = IPC,
+        clean_IPC    = clean_IPC,
      )

      for port, evt in perf_counters.port_counters.items():
@@ -896,7 +980,7 @@ class _Benchmark_Runner:
        measurement.port_muops[port] = mean_and_stddev

      for evt in evt_set.event_names:
-        if evt == perf_counters.uop_counter():
+        if evt == perf_counters.fused_uop_counter():
          continue
        if evt == perf_counters.instruction_counter():
          continue
@@ -905,6 +989,7 @@ class _Benchmark_Runner:
        if evt in perf_counters.port_counters.values():
          continue

+        assert type(evt) is str, evt
        measurement.misc[evt] = Statistics.from_array(column_by_name(evt, result_array))

      event_sets.append(measurement)
@@ -912,27 +997,46 @@ class _Benchmark_Runner:
    timestamp_end = datetime.datetime.now()

    def total(arrays):
-      array = numpy.hstack(arrays)
-      return Statistics.from_array(array)
+      if True or arrays:
+        array = numpy.hstack(arrays)
+        return Statistics.from_array(array)
+      else:
+        return None

    ## we are running with SCHED_FIFO.
    ## so the kernel won't preempt us, give other processes some room to breathe
    # os.sched_yield()

    return Benchmark_Run(
-      benchmark    = benchmark,
-      timestamp    = timestamp_bgn,
-      runtime      = timestamp_end - timestamp_bgn,
-      cycles       = total(total_cycles),
-      instructions = total(total_insts),
-      muops        = total(total_muops),
-      clean_IPC    = total(total_clean_IPC),
-      clean_MPC    = total(total_clean_MPC),
-      IPC          = total(total_IPC),
-      MPC          = total(total_MPC),
-      event_sets   = event_sets,
+      benchmark     = benchmark,
+      timestamp     = timestamp_bgn,
+      runtime       = timestamp_end - timestamp_bgn,
+      cycles        = total(total_cycles),
+      instructions  = total(total_insts),
+      fused_muops   = total(total_fused_muops),
+      unfused_muops = total(total_unfused_muops),
+      clean_IPC     = total(total_clean_IPC),
+      clean_fMPC    = total(total_clean_fMPC),
+      clean_uMPC    = total(total_clean_uMPC),
+      IPC           = total(total_IPC),
+      fMPC          = total(total_fMPC),
+      uMPC          = total(total_uMPC),
+      event_sets    = event_sets,
    )

+  def memory_size(self, bench: Benchmark_Spec) -> int:
+    memory_size = pseudoalloc.MaximizeDepsPseudoAllocator.memory_arena_size(bench.instructions) * bench.unroll_factor
+
+    PAGE_SIZE = self._PAGE_SIZE
+
+    ## round up to a multiple of page size
+    memory_size = memory_size + PAGE_SIZE - memory_size % PAGE_SIZE
+
+    ## add a padding page
+    memory_size += PAGE_SIZE
+
+    return memory_size
+
  def _die(self, *msg):
    self.info.clear()
    print('error:', *msg, file=sys.stderr)
@@ -1346,6 +1450,10 @@ class _ASM_Builder:
  def branch_if_not_zero(self, reg: ir.Register, dst: ir.Label):
    self.insts(self._irb.emit_branch_if_not_zero(reg, dst))

+  def get_argument_register(self, idx: int) -> ir.Register:
+    reg = self._irb.get_argument_register(idx)
+    return reg
+
  def allocate_argument(self, idx: int) -> ir.Register:
    reg = self._irb.get_argument_register(idx)
    self._alloc.take(reg)
@@ -1632,18 +1740,33 @@ class Perf_Counter_Spec:
  def make_throughput_counters(clss, extra_events: ty.List[str] = ()) -> 'Perf_Counter_Spec':
    papi = clss._the_papi()

-    return Perf_Counter_Spec(papi, {}, extra_events, clss.throughput_counters())
+    events = [
+      *clss.throughput_counters(),
+      *extra_events,
+    ]
+
+    return Perf_Counter_Spec(papi, {}, events, [clss.cycle_counter()])

  @classmethod
  def make_latency_counters(clss, extra_events: ty.List[str] = ()) -> 'Perf_Counter_Spec':
    papi = clss._the_papi()

-    return Perf_Counter_Spec(papi, {}, extra_events, clss.throughput_counters())
+    events = [
+      *clss.throughput_counters(),
+      *extra_events,
+    ]
+
+    return Perf_Counter_Spec(papi, {}, events, [clss.cycle_counter()])

  @classmethod
  def make_port_counters(clss, extra_events: ty.List[str] = ()) -> 'Perf_Counter_Spec':
    papi = clss._the_papi()

+    events = [
+      *clss.throughput_counters(),
+      *extra_events,
+    ]
+
    # FIXME: recent Intel CPUs only
    ports = {
      port: evt
@@ -1652,7 +1775,7 @@ class Perf_Counter_Spec:
      if papi.can_count_event(evt)
    }

-    return Perf_Counter_Spec(papi, ports, extra_events, clss.throughput_counters())
+    return Perf_Counter_Spec(papi, ports, events, [clss.cycle_counter()])

  ##### access registered PAPI event set

@@ -1685,21 +1808,35 @@ class Perf_Counter_Spec:
    return "INSTRUCTIONS_RETIRED"

  @classmethod
-  def uop_counter(clss) -> ty.Optional[str]:
+  def unfused_uop_counter(clss) -> ty.Optional[str]:
+    """ Count muops in the unfused domain (after fission of fused micro ops) """
+
    # FIXME: recent Intel CPUs only
    # return "UOPS_EXECUTED"
-    return "UOPS_RETIRED"
+    return "UOPS_RETIRED:ALL"
+    # return "UOPS_RETIRED:RETIRE_SLOTS"
+    # return "UOPS_ISSUED"
+
+  @classmethod
+  def fused_uop_counter(clss) -> ty.Optional[str]:
+    """ Count muops in the fused domain (fused micro ops in ROB, etc.) """
+
+    # FIXME: recent Intel CPUs only
+    return "UOPS_RETIRED:RETIRE_SLOTS"
+    # return "UOPS_ISSUED"

  @classmethod
  def throughput_counters(clss) -> ty.Sequence[str]:
    return tuple([
-      clss.cycle_counter(),
      clss.instruction_counter(),
-      clss.uop_counter(),
+      clss.fused_uop_counter(),
+      clss.unfused_uop_counter(),
    ])

  @classmethod
  def uop_counters_per_port(clss, papi: pypapi.Papi) -> ty.Sequence[ty.Tuple[int, str]]:
+    """ Count unfused muops executed per execution port """
+
    if papi.can_count_event("UOPS_EXECUTED_PORT:PORT_0"):
      return tuple(enumerate([
        "UOPS_EXECUTED_PORT:PORT_0",
@@ -1770,7 +1907,6 @@ class _Benchmark_Lib:
      ctypes.c_int,
      ctypes.c_ssize_t,
      ctypes.POINTER(ctypes.c_longlong),
-      ctypes.POINTER(ctypes.c_char),
    ]
    fn.restype = ctypes.c_int


--- a/src/pipedream/benchmark/show_stats.py
+++ b/src/pipedream/benchmark/show_stats.py
@@ -186,33 +186,39 @@ class Pretty_Printer:
      'unroll-factor',     '%10d' % benchmark.unroll_factor,
      'kernel-iterations', '%10d' % benchmark.kernel_iterations,
    )
+    # self.write_parameter_two_columns(
+    #   'IPC (max)', self.round_result(run.clean_IPC.max),
+    #   'MPC (max)', self.round_result(run.clean_MPC.max),
+    # )
+    # self.write_parameter_two_columns(
+    #   'IPC (p90)', self.round_result(run.clean_IPC.p90),
+    #   'MPC (p90)', self.round_result(run.clean_MPC.p90),
+    # )
+
+    i  = run.instructions.max  - benchmark.loop_overhead.instructions
+    fm = run.fused_muops.max   - benchmark.loop_overhead.muops
+    um = run.unfused_muops.max - benchmark.loop_overhead.muops
+
    self.write_parameter_two_columns(
-      'IPC (max)', self.round_result(run.clean_IPC.max),
-      'MPC (max)', self.round_result(run.clean_MPC.max),
-    )
-    self.write_parameter_two_columns(
-      'IPC (p90)', self.round_result(run.clean_IPC.p90),
-      'MPC (p90)', self.round_result(run.clean_MPC.p90),
+      'fused muops',   '%10s' % round(fm / i),
+      'unfused muops', '%10s' % round(um / i),
    )

-    if len(benchmark.instructions) == 1:
-      i = run.instructions.max - benchmark.loop_overhead.instructions
-      m = run.muops.max        - benchmark.loop_overhead.muops
-
-      self.write_result('num-muops', fractions.Fraction(m) / fractions.Fraction(i))
-
-    self.write_statistic('clean-IPC',    run.clean_IPC)
-    self.write_statistic('clean-MPC',    run.clean_MPC)
-    self.write_statistic('IPC',          run.IPC)
-    self.write_statistic('MPC',          run.MPC)
-    self.write_statistic('cycles',       run.cycles)
-    self.write_statistic('instructions', run.instructions)
-    self.write_statistic('muops',        run.muops)
+    self.write_statistic('clean-IPC',     run.clean_IPC)
+    self.write_statistic('clean-fMPC',    run.clean_fMPC)
+    self.write_statistic('clean-uMPC',    run.clean_uMPC)
+    self.write_statistic('IPC',           run.IPC)
+    self.write_statistic('fMPC',          run.fMPC)
+    self.write_statistic('uMPC',          run.uMPC)
+    self.write_statistic('cycles',        run.cycles)
+    self.write_statistic('instructions',  run.instructions)
+    self.write_statistic('fused muops',   run.fused_muops)
+    self.write_statistic('unfused muops', run.unfused_muops)

    for m in run.event_sets:
      for port, stat in m.port_muops.items():
        # ignore ports that take <.1% of the instructions in the bench
-        if stat.mean < (run.muops.mean / 1000):
+        if stat.mean < (run.unfused_muops.mean / 1000):
          continue

        self.write_statistic('port-' + str(port), stat)
@@ -222,44 +228,6 @@ class Pretty_Printer:

    print()

-  def pretty_print_benchmark_run_summary(self, run: Benchmark_Run_Summary):
-    print()
-    self.header1(run.name, ':')
-    self.write_parameter_one_column('runtime',   run.runtime)
-
-    self.write_parameter_two_columns(
-      'IPC (max)', self.round_result(run.ipc.max),
-      'MPC (max)', self.round_result(run.mpc.max),
-    )
-    self.write_parameter_two_columns(
-      'IPC (p90)', self.round_result(run.ipc.p90),
-      'MPC (p90)', self.round_result(run.mpc.p90),
-    )
-
-    if len(run.kernel) == 1:
-      i = run.instructions.max
-      m = run.muops.max
-
-      if math.isnan(i) or math.isnan(m):
-        self.write_result('num-muops', math.nan)
-      else:
-        self.write_result('num-muops', fractions.Fraction(m) / fractions.Fraction(i))
-
-    self.write_statistic('IPC',          run.ipc)
-    self.write_statistic('MPC',          run.mpc)
-    self.write_statistic('cycles',       run.cycles)
-    self.write_statistic('instructions', run.instructions)
-    self.write_statistic('muops',        run.muops)
-
-    for port, stat in run.port_muops.items():
-      # ignore ports that take <.1% of the instructions in the bench
-      if stat.mean < (run.muops.mean / 1000):
-        continue
-
-      self.write_statistic('port-' + str(port), stat)
-
-    print()
-

 def main(argv):
  # arch  = ir.Architecture.for_name('x86')
@@ -327,7 +295,7 @@ def main(argv):
      with open(input_file) as fd:
        try:
          for lineno, line in enumerate(fd, 1):
-            run = Benchmark_Run_Summary.from_json(line)
+            run = Benchmark_Run.from_json(line)

            if args.short:
              # if len(run.kernel) > 1:
@@ -339,21 +307,28 @@ def main(argv):
              #   continue

              try:
-                num_muops = round(run.muops.mean / run.instructions.mean)
+                num_fused_muops = round(run.fused_muops.mean / run.instructions.mean)
+              except ZeroDivisionError:
+                num_fused_muops = math.nan
+
+              try:
+                num_unfused_muops = round(run.unfused_muops.mean / run.instructions.mean)
              except ZeroDivisionError:
-                num_muops = math.nan
+                num_unfused_muops = math.nan

              print(
                run.name.ljust(100),
-                f'IPC={round(run.ipc.mean,1)}~{round(run.ipc.stddev,1)}',
-                f'MPC={round(run.mpc.mean,1)}~{round(run.mpc.stddev,1)}',
+                f'IPC={round(run.clean_IPC.mean,1)}~{round(run.clean_IPC.stddev,1)}',
+                f'fMPC={round(run.clean_fMPC.mean,1)}~{round(run.clean_fMPC.stddev,1)}',
+                f'uMPC={round(run.clean_uMPC.mean,1)}~{round(run.clean_uMPC.stddev,1)}',
                '-',
-                '%3s MUOPS' % num_muops,
+                '%3s FUSED MUOPS' % num_fused_muops,
+                '%3s UNFUSED MUOPS' % num_unfused_muops,
                '-',
                ' '.join(f'P{p}' for p in sorted(ports)),
              )
            else:
-              p.pretty_print_benchmark_run_summary(run)
+              p.pretty_print_benchmark_run(run)
        except json.JSONDecodeError as e:
          print(Pretty_Printer.C.red('JSON syntax error:'), input_file + f':{lineno}:', e)


--- a/src/pipedream/benchmark/types.py
+++ b/src/pipedream/benchmark/types.py
@@ -70,7 +70,7 @@ class Benchmark_Spec(yaml.YAML_Serializable):
    self.kernel_iterations = kernel_iterations
    self.arch              = arch
    self.instructions      = instructions
-    self.loop_overhead = loop_overhead
+    self.loop_overhead     = loop_overhead

  @classmethod
  def from_instructions(clss, *,
@@ -273,30 +273,41 @@ class Event_Set_Run(yaml.YAML_Struct):
    Run one event set over a benchmark
  """

-  cycles       = yaml.Slot(Statistics)
-  instructions = yaml.Slot(Statistics)
-  muops        = yaml.Slot(Statistics)
+  cycles         = yaml.Slot(Statistics)
+  instructions   = yaml.Slot(ty.Optional[Statistics])
+  fused_muops    = yaml.Slot(ty.Optional[Statistics])
+  unfused_muops  = yaml.Slot(ty.Optional[Statistics])

  ## IPC not counting instructions for the kernel loop
-  clean_IPC    = yaml.Slot(Statistics)
-  clean_MPC    = yaml.Slot(Statistics)
+  clean_IPC    = yaml.Slot(ty.Optional[Statistics])
+  clean_fMPC   = yaml.Slot(ty.Optional[Statistics])
+  clean_uMPC   = yaml.Slot(ty.Optional[Statistics])

-  IPC          = yaml.Slot(Statistics)
-  MPC          = yaml.Slot(Statistics)
+  IPC          = yaml.Slot(ty.Optional[Statistics])
+  fMPC         = yaml.Slot(ty.Optional[Statistics])
+  uMPC         = yaml.Slot(ty.Optional[Statistics])

  port_muops   = yaml.Slot(ty.Dict[int, Statistics], default={})
  misc         = yaml.Slot(ty.Dict[str, Statistics], default={})

-  def __init__(self, *, cycles, instructions, muops, clean_IPC, clean_MPC, IPC, MPC, port_muops = None, misc = None):
-    self.cycles       = cycles
-    self.instructions = instructions
-    self.muops        = muops
+  def __init__(self, *,
+               cycles,
+               instructions = None, IPC = None, clean_IPC = None,
+               fused_muops = None, clean_fMPC = None, fMPC = None,
+               unfused_muops = None, clean_uMPC = None, uMPC = None,
+               port_muops = None, misc = None):
+    self.cycles        = cycles
+    self.instructions  = instructions
+    self.fused_muops   = fused_muops
+    self.unfused_muops = unfused_muops

    self.clean_IPC    = clean_IPC
-    self.clean_MPC    = clean_MPC
+    self.clean_fMPC   = clean_fMPC
+    self.clean_uMPC   = clean_uMPC

    self.IPC          = IPC
-    self.MPC          = MPC
+    self.fMPC         = fMPC
+    self.uMPC         = uMPC

    self.port_muops   = port_muops or {}
    self.misc         = misc       or {}
@@ -311,178 +322,250 @@ class Benchmark_Run(yaml.YAML_Struct):
  timestamp = yaml.Slot(datetime.datetime)
  runtime   = yaml.Slot(datetime.timedelta)

-  cycles       = yaml.Slot(Statistics)
-  instructions = yaml.Slot(Statistics)
-  muops        = yaml.Slot(Statistics)
+  cycles        = yaml.Slot(Statistics)
+  instructions  = yaml.Slot(Statistics)
+  fused_muops   = yaml.Slot(Statistics)
+  unfused_muops = yaml.Slot(Statistics)

  ## IPC not counting instructions for the kernel loop
  clean_IPC    = yaml.Slot(Statistics)
-  clean_MPC    = yaml.Slot(Statistics)
+  ## fused muops -//-
+  clean_fMPC   = yaml.Slot(Statistics)
+  ## unfused muops -//-
+  clean_uMPC   = yaml.Slot(Statistics)

+  ## instructions per cycle
  IPC          = yaml.Slot(Statistics)
-  MPC          = yaml.Slot(Statistics)
+  ## fused muops -//-
+  fMPC         = yaml.Slot(Statistics)
+  ## unfused muops -//-
+  uMPC         = yaml.Slot(Statistics)

  event_sets   = yaml.Slot(ty.List[Event_Set_Run])

  def __init__(self, *, benchmark, timestamp, runtime = datetime.timedelta(seconds = 0),
-               cycles, instructions, muops, clean_IPC, clean_MPC, IPC, MPC,
+               cycles,
+               instructions, IPC, clean_IPC,
+               fused_muops, clean_fMPC, fMPC,
+               unfused_muops, clean_uMPC, uMPC,
               event_sets):
    self.benchmark    = benchmark
    self.timestamp    = timestamp
    self.runtime      = runtime
    self.cycles       = cycles
-    self.instructions = instructions
-    self.muops        = muops

+    self.instructions = instructions
    self.clean_IPC    = clean_IPC
-    self.clean_MPC    = clean_MPC
-
    self.IPC          = IPC
-    self.MPC          = MPC
+
+    self.fused_muops  = fused_muops
+    self.clean_fMPC   = clean_fMPC
+    self.fMPC         = fMPC
+
+    self.unfused_muops = unfused_muops
+    self.clean_uMPC    = clean_uMPC
+    self.uMPC          = uMPC

    self.event_sets   = event_sets

-  # @property
-  # def port_usage(self, port: int):
-  #   """
-  #     muops:port / total_muops
-  #   """
+  ## XXX

-  #   return self.ports[port] / self.total_muops
+  @property
+  def name(self):
+    return self.benchmark.name

+  @property
+  def kernel(self):
+    return tuple(sorted(i.name for i in self.benchmark.instructions))

-@dataclasses.dataclass(frozen=True)
-class Benchmark_Run_Summary:
-  """
-    Smaller version of benchmark run containing less information.
-    Intended for serialization from/to JSON for faster reading.
-  """
+  @property
+  def ipc(self):
+    return self.clean_IPC

-  kernel: ty.Tuple[str, ...]
+  @property
+  def fmpc(self):
+    return self.clean_fMPC

-  ipc: Statistics
-  mpc: Statistics
-  instructions: Statistics
-  muops: Statistics
-  cycles: Statistics
+  @property
+  def umpc(self):
+    return self.clean_uMPC

-  port_muops: ty.Dict[int, Statistics] = dataclasses.field(default_factory=dict)
-  misc: ty.Dict[str, Statistics]       = dataclasses.field(default_factory=dict)
+  @property
+  def port_muops(self):
+    return self.ports_usage()

-  runtime: float = math.nan
+  ## XXX

  @property
-  def name(self) -> str:
-    return Benchmark_Spec.name_from_instruction_names(self.kernel)
+  def num_fused_muops(self) -> int:
+    return round(self.clean_fMPC.mean / self.clean_IPC.mean)

  @property
-  def num_muops(self) -> int:
-    return round(self.mpc.mean / self.ipc.mean)
+  def num_unfused_muops(self) -> int:
+    return round(self.clean_uMPC.mean / self.clean_IPC.mean)
+
+  def ports_usage(self, min_usage: float = 0.05) -> ty.Dict[int, Statistics]:
+    """
+      Collect the per-port muop statistics from all event sets of this run.
+
+      A port is left out if its mean muop count divided by the mean number of
+      unfused muops of the run is below :min_usage:.
+      Returns a dict mapping port number to the Statistics of that port.
+    """
+
+    assert 0 < min_usage <= 1, min_usage
+
+    out = {}
+
+    for m in self.event_sets:
+      for port, stat in m.port_muops.items():
+        ## a port that was already collected must not show up in another event set
+        assert port not in out, port
+
+        if (stat.mean / self.unfused_muops.mean) < min_usage:
+          continue
+
+        out[port] = stat
+
+    return out

  def ports_used(self, min_usage: float = 0.05) -> ty.FrozenSet[int]:
    """
      Return set of ports where at least :min_usage: percent of all muops where executed.
    """

-    assert 0 < min_usage <= 1, min_usage
+    return frozenset(self.ports_usage(min_usage).keys())

-    return frozenset(p for p, stat in self.port_muops.items() if (stat.mean / self.muops.mean) >= min_usage)
+  def to_json(self) -> str:
+    """
+      Serialize this run to a JSON string.
+
+      Walks over all YAML slots of the object and converts each value based on
+      the declared slot type. Raises TypeError for a slot type that has no
+      conversion below.
+    """
+    out = {}
+
+    for slot in self._yaml_slots_:
+      assert slot.yaml_name not in out
+
+      val = getattr(self, slot.py_name)
+
+      ## dispatch on the exact declared type of the slot (identity comparison)
+      if slot.type is Statistics:
+        val = val.to_jsonish()
+      elif slot.type is Benchmark_Spec:
+        val = self._Benchmark_Spec_to_json(val)
+      elif slot.type is datetime.datetime:
+        val = self._datetime_to_json(val)
+      elif slot.type is datetime.timedelta:
+        ## stored as a plain number of seconds
+        val = val.total_seconds()
+      elif slot.type is ty.List[Event_Set_Run]:
+        val = [self._Event_Set_Run_to_json(e) for e in val]
+      else:
+        raise TypeError(slot.type)
+
+      out[slot.yaml_name] = val
+
+    return json.dumps(out)

  @staticmethod
-  def from_benchmark_run(run: Benchmark_Run) -> 'Benchmark_Run_Summary':
-    port_muops = {}
-    misc       = {}
+  def from_json(txt) -> 'Benchmark_Run':
+    data = json.loads(txt)
+    assert type(data) is dict

-    for m in run.event_sets:
-      for port, stat in m.port_muops.items():
-        assert port not in port_muops, port
+    out = {}
+
+    for slot in Benchmark_Run._yaml_slots_:
+      val = data[slot.yaml_name]
+
+      if slot.type is Statistics:
+        val = Statistics.from_jsonish(val)
+      elif slot.type is Benchmark_Spec:
+        val = Benchmark_Run._Benchmark_Spec_from_json(val)
+      elif slot.type is datetime.datetime:
+        val = Benchmark_Run._datetime_from_json(val)
+      elif slot.type is datetime.timedelta:
+        val = datetime.timedelta(seconds=val)
+      elif slot.type is ty.List[Event_Set_Run]:
+        val = [Benchmark_Run._Event_Set_Run_from_json(e) for e in val]
+      else:
+        raise TypeError(slot.type)

-        port_muops[port] = stat
+      out[slot.py_name] = val

-      for name, stat in m.misc.items():
-        assert name not in misc, name
+    return Benchmark_Run(**out)

-        misc[name] = stat
+  @staticmethod
+  def _Benchmark_Spec_to_json(spec) -> dict:
+    """
+      Turn a benchmark spec into a dict for JSON serialization.
+
+      The kind and the architecture are stored by name, the instructions as a
+      list of their names, and the loop overhead as a nested dict with the
+      keys 'instructions' and 'muops'.
+    """
+    out = {
+      'name':               spec.name,
+      'kind':               spec.kind.name,
+      'unroll_factor':      spec.unroll_factor,
+      'kernel_iterations':  spec.kernel_iterations,
+      'arch':               spec.arch.name,
+      'instructions':       [i.name for i in spec.instructions],
+      'loop_overhead':      {'instructions': spec.loop_overhead.instructions,
+                             'muops': spec.loop_overhead.muops}
+    }
+    return out

-    return Benchmark_Run_Summary(
-      kernel = tuple(sorted(i.name for i in run.benchmark.instructions)),
+  @staticmethod
+  def _Benchmark_Spec_from_json(obj: dict) -> Benchmark_Spec:
+    arch            = ir.Architecture.for_name(obj['arch'])
+    instruction_set = arch.instruction_set()
+    instructions    = [instruction_set[i] for i in obj['instructions']]

-      runtime      = run.runtime.total_seconds(),
-      ipc          = run.clean_IPC,
-      mpc          = run.clean_MPC,
-      instructions = run.instructions,
-      muops        = run.muops,
-      cycles       = run.cycles,
-      port_muops   = port_muops,
-      misc         = misc,
+    return Benchmark_Spec(
+      name              = obj['name'],
+      kind              = Benchmark_Kind[obj['kind']],
+      unroll_factor     = obj['unroll_factor'],
+      kernel_iterations = obj['kernel_iterations'],
+      arch              = arch,
+      instructions      = instructions,
+      loop_overhead     = Loop_Overhead(obj['loop_overhead']['instructions'],
+                                        obj['loop_overhead']['muops'],)
    )

  @staticmethod
-  def from_json(txt: str) -> 'Benchmark_Run_Summary':
-    data = json.loads(txt)
+  def _Event_Set_Run_to_json(event_set_run) -> dict:
+    d = {}

-    assert type(data) is dict
+    for slot in Event_Set_Run._yaml_slots_:
+      val = getattr(event_set_run, slot.py_name)

-    runtime = data.get('runtime', math.nan)
-
-    port_muops = {}
-    port_json = data.get('ports')
-    if port_json:
-      for port in range(8):
-        port_muops[port] = Statistics.from_json_summary(f'port{port}-muops', port_json)
-
-    misc = {}
-    misc_json = data.get('misc')
-    if misc_json:
-      for name, stat in misc_json.items():
-        misc[port] = Statistics.from_json_summary('', stat)
-
-    return Benchmark_Run_Summary(
-      kernel       = tuple(data['kernel']),
-      runtime      = runtime,
-      ipc          = Statistics.from_json_summary('ipc', data),
-      mpc          = Statistics.from_json_summary('mpc', data),
-      instructions = Statistics.from_json_summary('instructions', data),
-      muops        = Statistics.from_json_summary('muops', data),
-      cycles       = Statistics.from_json_summary('cycles', data),
-      port_muops   = port_muops,
-      misc         = misc,
-    )
+      if slot.type is Statistics:
+        val = val.to_jsonish()
+      elif slot.type is ty.Optional[Statistics]:
+        if val is not None:
+          val = val.to_jsonish()
+      elif slot.type is ty.Dict[int, Statistics]:
+        val = {k: s.to_jsonish() for k, s in val.items()}
+      elif slot.type is ty.Dict[str, Statistics]:
+        val = {k: s.to_jsonish() for k, s in val.items()}

-  def to_json(self) -> str:
-    out = {'kernel': self.kernel}
-
-    if not math.isnan(self.runtime):
-      out['runtime'] = self.runtime
-
-    self.ipc.to_json_summary('ipc', out)
-    self.mpc.to_json_summary('mpc', out)
-    self.instructions.to_json_summary('instructions', out)
-    self.muops.to_json_summary('muops', out)
-    self.cycles.to_json_summary('cycles', out)
-
-    if self.port_muops:
-      out['ports'] = ports = {}
-      for port, stat in self.port_muops.items():
-        stat.to_json_summary(f'port{port}-muops', ports)
-
-    if self.misc:
-      out['misc'] = misc = {}
-      for name, stat in self.misc.items():
-        assert type(name) is str, name
-        data = {}
-        stat.to_json_summary('', data)
-        misc[name] = data
+      d[slot.yaml_name] = val

-    return json.dumps(out)
+    return d
+
+  @staticmethod
+  def _Event_Set_Run_from_json(d: dict) -> Event_Set_Run:
+    out = {}
+
+    for slot in Event_Set_Run._yaml_slots_:
+      val = d.get(slot.yaml_name)
+
+      if slot.type is Statistics:
+        val = Statistics.from_jsonish(val)
+      elif slot.type is ty.Optional[Statistics]:
+        if val is not None:
+          val = Statistics.from_jsonish(val)
+      elif slot.type is ty.Dict[int, Statistics]:
+        val = {int(k): Statistics.from_jsonish(s) for k, s in val.items()}
+      elif slot.type is ty.Dict[str, Statistics]:
+        val = {k: Statistics.from_jsonish(s) for k, s in val.items()}
+      else:
+        raise TypeError(slot.type)

-  def __hash__(self):
-    return object.__hash__(self)
+      out[slot.py_name] = val

+    return Event_Set_Run(**out)

-class Benchmark_Run_Summary_Aggregator:
+  @staticmethod
+  def _datetime_to_json(date) -> str:
+    """ Format :date: as 'YYYY-MM-DD HH:MM:SS.ffffff' (the format has no timezone field). """
+    return date.strftime('%Y-%m-%d %H:%M:%S.%f')
+
+  @staticmethod
+  def _datetime_from_json(date_str) -> datetime.datetime:
+    """ Parse a 'YYYY-MM-DD HH:MM:SS.ffffff' string back into a datetime. """
+    return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S.%f')
+
+
+class Benchmark_Run_Aggregator:
  """
-    Helper for filtering & aggregating Benchmark_Run_Summary objects.
+    Helper for filtering & aggregating Benchmark_Run objects.
  """

  DEFAULT_MIN_STDDEV  = 0.0
@@ -501,17 +584,18 @@ class Benchmark_Run_Summary_Aggregator:

    self._measurements = collections.defaultdict(list)

-  def add_measurement(self, run: Benchmark_Run_Summary) -> bool:
-    assert type(run) is Benchmark_Run_Summary, type(run)
+  def add_measurement(self, run: Benchmark_Run) -> bool:
+    assert type(run) is Benchmark_Run, type(run)

    ## user provided filters
-    if run.ipc.num_samples < self.min_samples or run.mpc.num_samples < self.min_samples:
+    if run.ipc.num_samples < self.min_samples or run.fmpc.num_samples < self.min_samples or \
+       run.umpc.num_samples < self.min_samples:
      return False

-    if run.ipc.stddev < self.min_stddev or run.mpc.stddev < self.min_stddev:
+    if run.ipc.stddev < self.min_stddev or run.fmpc.stddev < self.min_stddev or run.umpc.stddev < self.min_stddev:
      return False

-    if run.ipc.stddev > self.max_stddev or run.mpc.stddev > self.max_stddev:
+    if run.ipc.stddev > self.max_stddev or run.fmpc.stddev > self.max_stddev or run.umpc.stddev > self.max_stddev:
      return False

    if self.predicate and self.predicate(run.kernel):
@@ -521,8 +605,8 @@ class Benchmark_Run_Summary_Aggregator:
    heapq.heappush(self._measurements[key], self.Item(run))
    return True

-  def force_add(self, run: Benchmark_Run_Summary) -> bool:
-    assert type(run) is Benchmark_Run_Summary, type(run)
+  def force_add(self, run: Benchmark_Run) -> bool:
+    assert type(run) is Benchmark_Run, type(run)

    key = self._normalize_key(run.kernel)
    heapq.heappush(self._measurements[key], self.Item(run))
@@ -536,11 +620,14 @@ class Benchmark_Run_Summary_Aggregator:
    key = self._normalize_key(insts)
    self._measurements.pop(key, None)

-  def __getitem__(self, key: ty.Sequence[str]) -> Benchmark_Run_Summary:
+  def __getitem__(self, key: ty.Sequence[str]) -> Benchmark_Run:
    key = self._normalize_key(key)
-    return self._measurements[key][0].run
+    try:
+      return self._measurements[key][0].run
+    except IndexError:
+      raise KeyError(key)

-  def get(self, key: ty.Sequence[str]) -> Benchmark_Run_Summary:
+  def get(self, key: ty.Sequence[str]) -> Benchmark_Run:
    key = self._normalize_key(key)
    heap = self._measurements[key]
    try:
@@ -557,12 +644,12 @@ class Benchmark_Run_Summary_Aggregator:
    key = self._normalize_key(key)
    return key in self._measurements

-  def __iter__(self) -> ty.Iterable[Benchmark_Run_Summary]:
+  def __iter__(self) -> ty.Iterable[Benchmark_Run]:
    for heap in self._measurements.values():
      if heap:
        yield heap[0].run

-  def all_measurements(self) -> ty.Iterable[Benchmark_Run_Summary]:
+  def all_measurements(self) -> ty.Iterable[Benchmark_Run]:
    for heap in self._measurements.values():
      for item in heap:
        yield item.run
@@ -589,9 +676,9 @@ class Benchmark_Run_Summary_Aggregator:

  @dataclasses.dataclass(frozen=True)
  class Item:
-    run: Benchmark_Run_Summary
+    run: Benchmark_Run

    def __lt__(self, that):
      # reverse comparison
-      return self.run.mpc.mean >= that.run.mpc.mean
+      return self.run.umpc.mean >= that.run.umpc.mean

--- a/src/pipedream/ilp/__init__.py
+++ b/src/pipedream/ilp/__init__.py
--- a/src/pipedream/ilp/types.py
+++ b/src/pipedream/ilp/types.py

 import collections
+import dataclasses
 import enum
 import math
 import itertools
+import functools
 import typing as ty


 class Kernel:
-  def __init__(self, ipc: float, mpc: float, insts: ty.Tuple['Inst', ...]):
+  def __init__(self, ipc: float, fmpc: float, umpc: float, insts: ty.Tuple['Inst', ...]):
    assert all(type(i) is Inst for i in insts), insts

    self._ipc       = float(ipc)
-    self._mpc       = float(mpc)
+    self._umpc      = float(umpc)
+    self._fmpc      = float(fmpc)
    self._insts     = insts

    counts = collections.Counter(insts)

-    self._gcd = min(math.gcd(a, b) for a, b in itertools.product(counts.values(), repeat=2))
+    if not insts:
+      self._gcd = 1
+    else:
+      self._gcd = functools.reduce(math.gcd, counts.values(), counts[insts[0]])

  @property
  def ipc(self) -> float:
    return self._ipc

  @property
-  def mpc(self) -> float:
-    return self._mpc
+  def fmpc(self) -> float:
+    return self._fmpc
+
+  @property
+  def umpc(self) -> float:
+    return self._umpc

  def count(self, inst) -> int:
    cnt = self._insts.count(inst) / self._gcd
@@ -35,7 +45,7 @@ class Kernel:
    return iter(self._insts)

  def __len__(self):
-    return len(self._insts)
+    return len(self._insts) // self._gcd

  def __getitem__(self, idx):
    return self._insts[idx]
@@ -47,7 +57,11 @@ class Kernel:
    return self._insts.items() < that._insts.items()

  def __repr__(self) -> str:
-    return f'Kernel(ipc={self.ipc}, mpc={self.mpc}, insts=[' + ', '.join(repr(i) for i in self._insts) + '])'
+    return ''.join([
+      f'Kernel(ipc={self.ipc}, fmpc={self.fmpc}, ',
+      f'umpc={self.umpc}, ',
+      'insts=[' + ', '.join(repr(i) for i in self._insts) + '])'
+    ])

  def __str__(self) -> str:
    seq = []
@@ -67,39 +81,26 @@ class Kernel:
    return 'K{' + ' '.join(x(i) for i in seq) + '}'


+@dataclasses.dataclass(frozen=True, order=True)
 class Inst:
-  def __init__(self, name, num_muops):
-    self._name      = name
-    self._num_muops = num_muops
-
-  @property
-  def name(self) -> str:
-    return self._name
-
-  @property
-  def num_muops(self) -> int:
-    return self._num_muops
-
-  def __eq__(self, that):
-    if type(self) is not type(that):
-      return False
-
-    return self.name == that.name
-
-  def __hash__(self):
-    return hash(self.name)
-
-  def __lt__(self, that):
-    if type(self) is not type(that):
-      return NotImplemented
+  name: str
+  num_fused_muops: int
+  num_unfused_muops: int

-    return self.name < that.name
+  def __post_init__(self):
+    assert self.num_fused_muops <= self.num_unfused_muops

  def __str__(self) -> str:
    return f'I{{{self.name}}}'

  def __repr__(self) -> str:
-    return f'{type(self).__name__}(name={self.name!r}, num_muops={self.num_muops!r})'
+    return ''.join([
+      type(self).__name__, '(',
+      f'name={self.name!r}, ',
+      f'num_fused_muops={self.num_fused_muops!r}, ',
+      f'num_unfused_muops={self.num_unfused_muops!r}',
+      ')'
+    ])


 class Muop:
@@ -142,9 +143,13 @@ class Port_Type(enum.Enum):
  SLOWDOWN = 1
  SPEEDUP  = 2

+  @property
+  def is_real(self) -> bool:
+    return self is Port_Type.REAL
+
  @property
  def is_virtual(self) -> bool:
-    return bool(self.value)
+    return not self.is_real


 class Port(ty.NamedTuple):
@@ -152,6 +157,10 @@ class Port(ty.NamedTuple):
  type: Port_Type = Port_Type.REAL
  max_throughput: float = 1.0

+  @property
+  def is_real(self) -> bool:
+    return self.type.is_real
+
  @property
  def is_virtual(self) -> bool:
    return self.type.is_virtual

--- a/src/pipedream/utils/json.py
+++ b/src/pipedream/utils/json.py
+
+import json
+import re
+
+__all__ = [
+  'load',
+  'remove_comments',
+]
+
+RE_COMMENT        = re.compile(r'\s*(#|//).*$')
+RE_COMMENT_LINE   = re.compile(r'^\s*(#|//).*$')
+RE_INLINE_COMMENT = re.compile(r'(:?(?:\s)*([A-Za-z\d\.{}]*)|((?<=\").*\"),?)(?:\s)*(((#|(//)).*)|)$')
+
+
+def remove_comments(txt: str) -> str:
+  """
+    Strip '#' and '//' comments from JSON text, line by line.
+
+    A line that consists only of a comment is replaced by an empty line, so the
+    number of lines stays the same (presumably so that line numbers in parser
+    errors still match the input -- TODO confirm).
+    For other lines containing a comment marker the part matched by
+    RE_INLINE_COMMENT is replaced by its first capture group.
+
+    The result is joined with '\n'; a trailing newline of the input is dropped.
+  """
+  lines = txt.splitlines()
+
+  for lineno, line in enumerate(lines):
+    ## cheap pre-check: only touch lines that contain a comment marker at all
+    if re.search(RE_COMMENT, line):
+      if re.match(RE_COMMENT_LINE, line):
+        lines[lineno] = ""
+      elif re.search(RE_INLINE_COMMENT, line):
+        lines[lineno] = re.sub(RE_INLINE_COMMENT, r'\1', line)
+
+  return '\n'.join(lines)
+
+
+def load(fp, *, allow_comments: bool = True, **kwargs) -> object:
+  """
+    Load JSON from file.
+
+    If :allow_comments: is set the whole file is read into memory and passed
+    through remove_comments() before parsing, otherwise :fp: is handed to
+    json.load directly.
+    All other keyword arguments are forwarded to the JSON parser.
+  """
+
+  if allow_comments:
+    txt = fp.read()
+    txt = remove_comments(txt)
+
+    return json.loads(txt, **kwargs)
+  else:
+    return json.load(fp, **kwargs)
--- a/src/pipedream/utils/statistics.py
+++ b/src/pipedream/utils/statistics.py
@@ -37,8 +37,8 @@ class Statistics(yaml.YAML_Struct):
  MAD         = yaml.Slot(float, math.nan)
  min         = yaml.Slot(float, math.nan)
  max         = yaml.Slot(float, math.nan)
-  percentiles = yaml.Slot(ty.Dict[int, float])
-  histogram   = yaml.Slot(ty.List[int])
+  percentiles = yaml.Slot(ty.Dict[int, float], dict)
+  histogram   = yaml.Slot(ty.List[int], list)

  def __init__(self, *,
               mean: float, stddev: float, variance: float = math.nan,
@@ -341,52 +341,27 @@ class Statistics(yaml.YAML_Struct):

    return hist

-  def to_json_summary(self, prefix: str, dst: dict):
-    """
-      Extract information for JSON serialization in a Benchmark_Run_Summary.
-
-      Add fields from :self: into dict :dst:.
-      If :prefix: is non-empty keys are prefixed with :prefix: + '-'.
-    """
+  def to_jsonish(self) -> dict:
+    d = {}
+    for slot in self._yaml_slots_:
+      assert slot.yaml_name not in d

-    if prefix:
-      prefix += '-'
-
-    dst[prefix + 'num-samples'] = self.num_samples
-    dst[prefix + 'mean']        = self.mean
-    dst[prefix + 'stddev']      = self.stddev
-    dst[prefix + 'variance']    = self.variance
-    dst[prefix + 'min']         = self.min
-    dst[prefix + 'max']         = self.max
-    dst[prefix + 'p10']         = self.p10
-    dst[prefix + 'p25']         = self.p25
-    dst[prefix + 'p50']         = self.p50
-    dst[prefix + 'p75']         = self.p75
-    dst[prefix + 'p90']         = self.p90
+      d[slot.yaml_name] = getattr(self, slot.py_name)
+    return d

  @staticmethod
-  def from_json_summary(prefix: str, src: dict) -> 'Statistics':
-    """
-      Create Statistics object from JSON serialized data.
+  def from_jsonish(src: dict) -> 'Statistics':
+    out = {}

-      Reads keys prefixed with :prefix: + '-' (if prefix is non-empty).
-    """
+    for slot in Statistics._yaml_slots_:
+      try:
+        val = src[slot.yaml_name]

-    if prefix:
-      prefix += '-'
+        if slot.type is ty.Dict[int, float]:
+          val = {int(k): v for k, v in val.items()}
+      except KeyError:
+        val = slot.default

-    return Statistics(
-      num_samples = src.get(prefix + 'num-samples', math.nan),
-      mean        = src.get(prefix + 'mean', math.nan),
-      stddev      = src.get(prefix + 'stddev', math.nan),
-      variance    = src.get(prefix + 'variance', math.nan),
-      min         = src.get(prefix + 'min', math.nan),
-      max         = src.get(prefix + 'max', math.nan),
-      percentiles = {
-        10: src.get(prefix + 'p10', math.nan),
-        25: src.get(prefix + 'p25', math.nan),
-        50: src.get(prefix + 'p50', math.nan),
-        75: src.get(prefix + 'p75', math.nan),
-        90: src.get(prefix + 'p90', math.nan),
-      },
-    )
+      out[slot.py_name] = val
+
+    return Statistics(**out)
--- a/src/pipedream/utils/yaml.py
+++ b/src/pipedream/utils/yaml.py
@@ -72,6 +72,16 @@ class YAML_Serializer(abc.ABC, ty.Generic[T]):
          clss.for_type(args[1]),
        )

+      if origin is ty.Union and len(want.__args__) == 2:
+        args = want.__args__
+
+        assert args[0] is not type(None)
+        assert args[1] is type(None)
+
+        return Optional_Serializer(
+          clss.for_type(args[0])
+        )
+
    # base types
    if want is str:
      return Str_Serializer()
@@ -166,6 +176,28 @@ class Dict_Serializer(ty.Generic[K, V], YAML_Serializer[ty.Dict[K, V]]):
    return construct_dict(node, self.key, self.val)


+class Optional_Serializer(ty.Generic[E], YAML_Serializer[ty.Optional[E]]):
+  """
+    Serializer for a typing.Optional[E] value (i.e. either None or an E).
+
+    None is handled here, every other value is delegated to the wrapped
+    serializer for E.
+  """
+
+  def __init__(self, value: YAML_Serializer[E]):
+    ## serializer used for all values that are not None
+    self.value = value
+
+  def to_yaml(self, obj):
+    """ Represent None via Representer.represent_none, anything else via the wrapped serializer. """
+    if obj is None:
+      return Representer().represent_none(None)
+    else:
+      return self.value.to_yaml(obj)
+
+  def from_yaml(self, node):
+    """ Return None for a scalar node tagged as null, otherwise defer to the wrapped serializer. """
+    ## YAML 1.1 knows several spellings of null ('~', 'null', 'Null', 'NULL' and
+    ## the empty scalar). They all resolve to the same tag, so only the tag is
+    ## checked and not the text of the node.
+    if type(node) is yaml.ScalarNode and node.tag == 'tag:yaml.org,2002:null':
+      return None
+    else:
+      return self.value.from_yaml(node)
+
 class Enum_Serializer(ty.Generic[E], YAML_Serializer[E]):
  """
    Serializer for a enum.Enum enumeration class.
@@ -266,7 +298,7 @@ class YAML_Struct_Serializer(YAML_Serializer):
    def yaml_items():
      for slot in self.struct._yaml_slots_:
        key = represent_str(slot.yaml_name)
-        val = slot.type.to_yaml(slot.__get__(obj))
+        val = slot.serializer.to_yaml(slot.__get__(obj))

        assert val is not None, repr(slot.type)

@@ -291,7 +323,7 @@ class YAML_Struct_Serializer(YAML_Serializer):
      if slot is None:
        raise yaml.constructor.ConstructorError('invalid field ' + repr(k) + ' in ' + self.struct.__name__)

-      v = slot.type.from_yaml(v)
+      v = slot.serializer.from_yaml(v)

      kwargs[slot.py_name] = v

@@ -305,9 +337,19 @@ class Slot:

  NO_DEFAULT = object()

-  def __init__(self, type, default = NO_DEFAULT):
-    self._type    = YAML_Serializer.for_type(type)
-    self._default = default
+  def __init__(self, type_, default = NO_DEFAULT):
+    self._type       = type_
+    self._serializer = YAML_Serializer.for_type(type_)
+
+    if default is self.NO_DEFAULT:
+      self._default     = self._fail_no_default
+      self._has_default = False
+    else:
+      if type(default) is type:
+        self._default = default
+      else:
+        self._default = lambda: default
+      self._has_default = True

  def __set_name__(self, owner, name):
    self._py_name   = name
@@ -322,12 +364,13 @@ class Slot:
  def __set__(self, instance, value):
    instance.__dict__[self._py_name] = value

+  @property
  def has_default(self) -> bool:
    """
      Check if this descriptor has a default value.
    """

-    return self.default is not self.NO_DEFAULT
+    return self._has_default

  def set_default(self, instance):
    """
@@ -351,9 +394,16 @@ class Slot:
  def type(self):
    return self._type

+  @property
+  def serializer(self):
+    return self._serializer
+
  @property
  def default(self):
-    return self._default
+    return self._default()
+
+  def _fail_no_default(self):
+    raise ValueError(f'Slot {self.yaml_name!r} has no default')


 def load(serializer: YAML_Serializer, stream: ty.IO[str]) -> YAML_Serializable:

--- a/tools/benchmark-yaml-to-jsonl
+++ b/tools/benchmark-yaml-to-jsonl
@@ -5,7 +5,7 @@
  into a condensed JSON format that is MUCH faster to parse.

  Prints one JSON dict on a single line per benchmark run.
-  Use pipedream.benchmark.types.Benchmark_Run_Summary to read/write this JSON format.
+  Use pipedream.benchmark.types.Benchmark_Run to read/write this JSON format.
 """

 import argparse
@@ -18,7 +18,7 @@ except ImportError:
  sys.path.append(str(pathlib.Path(__file__).parent.parent / 'src'))

 import pipedream.utils.yaml as yaml
-from pipedream.benchmark.types import Benchmark_Run, Benchmark_Run_Summary
+from pipedream.benchmark.types import Benchmark_Run


 def main():
@@ -38,9 +38,7 @@ def main():
    with open(F) as fd:
      try:
        for run in yaml.load_all(Benchmark_Run.yaml_serializer(), fd):
-          summary = Benchmark_Run_Summary.from_benchmark_run(run)
-
-          print(summary.to_json(), file=out)
+          print(run.to_json(), file=out)
      except yaml.YAMLError as e:
        print('error: malformed file', repr(F) + ':', e, file=sys.stderr)


--- a/tools/filter-benchmark-jsonl
+++ b/tools/filter-benchmark-jsonl
@@ -3,7 +3,7 @@
 """
  Aggregate & filter a stream on benchmark measurements.

-  Reads & writes Benchmark_Run_Summary records in JSONl format.
+  Reads & writes Benchmark_Run records in JSONl format.
 """

 import argparse
@@ -18,7 +18,7 @@ except ImportError:

 import pipedream.benchmark.common as common
 import pipedream.asm.ir as ir
-from pipedream.benchmark.types import Benchmark_Run_Summary, Benchmark_Run_Summary_Aggregator
+from pipedream.benchmark.types import Benchmark_Run, Benchmark_Run_Aggregator


 def main():
@@ -31,21 +31,21 @@ def main():

  parser.add_argument(
    '--min-stddev',
-    type=float, default=Benchmark_Run_Summary_Aggregator.DEFAULT_MIN_STDDEV,
+    type=float, default=Benchmark_Run_Aggregator.DEFAULT_MIN_STDDEV,
    help="""
      Benchmark runs with an IPC/MPC stddev lower than this are ignored.
    """,
  )
  parser.add_argument(
    '--max-stddev',
-    type=float, default=Benchmark_Run_Summary_Aggregator.DEFAULT_MAX_STDDEV,
+    type=float, default=Benchmark_Run_Aggregator.DEFAULT_MAX_STDDEV,
    help="""
      Benchmark runs with an IPC/MPC stddev higher than this are ignored.
    """,
  )
  parser.add_argument(
    '--min-samples',
-    type=float, default=Benchmark_Run_Summary_Aggregator.DEFAULT_MIN_SAMPLES,
+    type=float, default=Benchmark_Run_Aggregator.DEFAULT_MIN_SAMPLES,
    help="""
      Benchmark runs with an IPC/MPC sample count lower than this are ignored.
    """,
@@ -75,7 +75,7 @@ def main():
  else:
    predicate = lambda kernel: False

-  measurements = Benchmark_Run_Summary_Aggregator(
+  measurements = Benchmark_Run_Aggregator(
    min_stddev=args.min_stddev,
    max_stddev=args.max_stddev,
    min_samples=args.min_samples,
@@ -85,7 +85,7 @@ def main():
  for file in args.FILE:
    with open(file) as fd:
      for line in fd:
-        run = Benchmark_Run_Summary.from_json(line)
+        run = Benchmark_Run.from_json(line)

        measurements.add_measurement(run)


--- a/tools/find-eqivalence-classes
+++ b/tools/find-eqivalence-classes
--- a/tools/solve-ilp
+++ b/tools/solve-ilp