Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 71e25551 authored by GRUBER Fabian's avatar GRUBER Fabian
Browse files

Wrote scripts to solve port-mapping ILP problem.

cherry-pick of several intermediate commits.
  00685293 from Thu Jul 4 11:00:48 2019 +0200.
  9386c13a from Thu May 16 08:09:25 2019 +0200
  3813e316 from Fri May 10 16:56:23 2019 +0200
  50ee0275 from Sun May 5 10:19:47 2019 +0200
  0b54af76 from Sat May 4 20:04:48 2019 +0200
  d07f97d1 from Sat May 4 15:30:37 2019 +0200
  e924c740 from Sat May 4 15:21:27 2019 +0200
  e3138dad from Sat May 4 14:00:44 2019 +0200
  a58a53c1 from Fri May 3 16:42:26 2019 +0200

ILP model: save latest version.

Many a long night was spent hacking on this.
Deadlines were tight.
Midnight oil was burnt.
So I didn't commit intermediate stages.
parent c2bc8b3c
Branches
No related tags found
No related merge requests found
...@@ -33,6 +33,7 @@ from pipedream.utils import chunks, nub ...@@ -33,6 +33,7 @@ from pipedream.utils import chunks, nub
import pipedream.asm.ir as ir import pipedream.asm.ir as ir
import pipedream.benchmark.common as common import pipedream.benchmark.common as common
import pipedream.utils.yaml as yaml import pipedream.utils.yaml as yaml
import pipedream.utils.json as json
BLACKLISTED_INSTRUCTIONS = frozenset([ BLACKLISTED_INSTRUCTIONS = frozenset([
# 'OR_GPR16i16_IMMi16', # 'OR_GPR16i16_IMMi16',
...@@ -77,7 +78,7 @@ def main(): ...@@ -77,7 +78,7 @@ def main():
# inputs # inputs
subp.add_argument('--measurements', subp.add_argument('--measurements',
dest='measurements_file', dest='measurements_input',
required=True, required=True,
type=pathlib.Path, type=pathlib.Path,
help='File to read benchmark results from',) help='File to read benchmark results from',)
...@@ -93,7 +94,11 @@ def main(): ...@@ -93,7 +94,11 @@ def main():
# outputs # outputs
subp.add_argument('-eo', '--eq-class-output', type=pathlib.Path, default=pathlib.Path('/dev/stdout')) subp.add_argument('-eo', '--eq-class-output', type=pathlib.Path, default=pathlib.Path('/dev/stdout'))
subp.add_argument('-mo', '--measurement-output', type=pathlib.Path, default=None) subp.add_argument('-mo', '--measurements-output',
dest='measurements_output',
required=False,
type=pathlib.Path,
help='File to write benchmark results to',)
subp.add_argument('--yaml-log', default=None, type=pathlib.Path) subp.add_argument('--yaml-log', default=None, type=pathlib.Path)
subp.add_argument('--json-log', default=None, type=pathlib.Path) subp.add_argument('--json-log', default=None, type=pathlib.Path)
...@@ -147,17 +152,17 @@ def main(): ...@@ -147,17 +152,17 @@ def main():
# inputs # inputs
subp.add_argument('--measurements', subp.add_argument('--measurements',
dest='measurements_file', dest='measurements_input',
required=True, required=True,
type=pathlib.Path, type=pathlib.Path,
help='File to read benchmark results from and write them back to',) help='File to read benchmark results from',)
subp.add_argument('--eq-classes', subp.add_argument('--eq-classes',
dest='eq_classes_file', dest='eq_classes_file',
required=True, required=True,
type=pathlib.Path, type=pathlib.Path,
help='File to read equivalence classes from',) help='File to read equivalence classes from',)
subp.add_argument('--tag', subp.add_argument('--tag',
default=None, required=True,
type=str, type=str,
help='Tag selector to select complex instructions',) help='Tag selector to select complex instructions',)
subp.add_argument('--min-muops', subp.add_argument('--min-muops',
...@@ -171,6 +176,13 @@ def main(): ...@@ -171,6 +176,13 @@ def main():
with a IPC/MPC stddev higher than this are ignored. with a IPC/MPC stddev higher than this are ignored.
""",) """,)
## outputs
subp.add_argument('-mo', '--measurements-output',
dest='measurements_output',
required=False,
type=pathlib.Path,
help='File to write benchmark results to',)
############################################################################## ##############################################################################
args = parser.parse_args() args = parser.parse_args()
...@@ -180,11 +192,15 @@ def main(): ...@@ -180,11 +192,15 @@ def main():
if command is None: if command is None:
parser.error('must supply a command') parser.error('must supply a command')
if args.measurements_output is None:
args.measurements_output = args.measurements_input
command(**vars(args)) command(**vars(args))
def generate_simple_ilp_input(*, def generate_simple_ilp_input(*,
measurements_file: pathlib.Path, measurements_input: pathlib.Path,
measurements_output: pathlib.Path,
eq_classes_file: pathlib.Path, eq_classes_file: pathlib.Path,
tag: ty.Optional[str], tag: ty.Optional[str],
min_muops: int, min_muops: int,
...@@ -193,21 +209,23 @@ def generate_simple_ilp_input(*, ...@@ -193,21 +209,23 @@ def generate_simple_ilp_input(*,
arch = ir.Architecture.for_name('x86') arch = ir.Architecture.for_name('x86')
instruction_set = arch.instruction_set() instruction_set = arch.instruction_set()
measurements = Benchmark_Run_Summary_Aggregator( measurements = Benchmark_Run_Aggregator(
max_stddev=max_stddev, max_stddev=max_stddev,
# *0.9 to account for dropping the bottom & top 5 percent. # *0.9 to account for dropping the bottom & top 5 percent.
min_samples=MIN_NUM_SAMPLES * 0.9, min_samples=MIN_NUM_SAMPLES * 0.9,
) )
# touch measurements file
open(measurements_output, 'a').close()
try: try:
read_measurements_from_files(measurements, [measurements_file]) read_measurements_from_files(measurements, [measurements_input])
eq_classes = read_equivalence_classes_from_file(instruction_set, eq_classes_file) eq_classes = read_equivalence_classes_from_file(instruction_set, eq_classes_file)
except FileNotFoundError as e: except FileNotFoundError as e:
print('error:', e, file=sys.stderr) print('error:', e, file=sys.stderr)
exit(1) exit(1)
complex_insts = common.glob_instruction_tags(arch, [tag]) complex_insts = list(nub(common.glob_instruction_tags(arch, [tag])))
# representatives = [random.choice(eq) for eq in eq_classes]
representatives = list(nub(eq[0] for eq in eq_classes)) representatives = list(nub(eq[0] for eq in eq_classes))
if not complex_insts: if not complex_insts:
...@@ -234,108 +252,120 @@ def generate_simple_ilp_input(*, ...@@ -234,108 +252,120 @@ def generate_simple_ilp_input(*,
ordering = Ordering.ALPHABETIC, ordering = Ordering.ALPHABETIC,
) )
EQ.make_measurements(kernels, measurements, measure_ports, output=measurements_file) kernels = [k for k in kernels if tuple(i.name for i in k) not in measurements]
log('MEASURE: muI') CHUNK_SIZE = 2000
make_measurements(nub((i,) for i in complex_insts + representatives), measure_ports=True)
assert complex_insts[0].name in measurements for chunk in chunks(kernels, CHUNK_SIZE):
try:
EQ.make_measurements(chunk, measurements, measure_ports=measure_ports, output=measurements_output)
finally:
os.sched_yield()
if min_muops: log('MEASURE: complexI', f'({len(complex_insts)})')
for i in list(complex_insts): make_measurements([(i,) for i in complex_insts], measure_ports=True)
m = measurements[i.name]
num_muops = round(m.muops.mean / m.instructions.mean) complex_insts = [i for i in complex_insts if measurements[i.name].num_unfused_muops > 1]
if num_muops < min_muops: if not complex_insts:
complex_insts.remove(i) log('no real complexI found')
return
kernels = list(combination_kernels(complex_insts, representatives)) log('MEASURE: muI ', f'({len(representatives)})')
log('MEASURE: muI x complexI', f'({len(kernels)})') make_measurements([(i,) for i in representatives], measure_ports=True)
for chunk in chunks(kernels, 1000):
make_measurements(chunk, measure_ports=False)
del kernels def combinations_of_instructions_using_only_ports(ports: ty.FrozenSet['Port']) -> ty.Iterable[ty.Tuple[ir.Instruction,
...]]:
port_sets = set(frozenset(ps) for ps in powerset(ports))
port_sets.remove(frozenset())
## higher order kernels assert frozenset([6]) in port_sets
kernels = [] assert len([i for i in representatives if i.name == 'JMP_0'])
if False: # NEW WAY, TODO IMPLEMENT combinations = collections.defaultdict(list)
for complexI in complex_insts:
mCI = measurements[complexI.name]
if MEASURE_PORTS: # FIXME: we really want a set cover for every possible subset of ports ... but that is expensive
## TODO: read ports from args or partial model for N in range(1, 8):
ports = mCI.ports_used() for kernel in itertools.combinations(representatives, N):
else: kernel_ports = frozenset()
interferes_with = set()
for muI in representatives: for inst in kernel:
mMUI = measurements[muI.name] run = measurements[inst.name]
mCI_MUI = measurements[complexI.name, muI.name]
# no interference iff MPC(A) + MPC(B) == MPC(A B) kernel_ports |= run.ports_used()
if abs((mc.mpc.mean + mr.mpc.mean) - mcr.mpc.mean) <= 0.1:
continue
interferes_with.add(muI) # if kernel_ports in combinations:
# kernel = min(kernel, combinations[kernel_ports], key=len)
ports = set() # combinations[kernel_ports] = tuple(sorted(kernel, key=lambda i: i.name))
for muI in interferes_with: combinations[kernel_ports].append(tuple(sorted(kernel, key=lambda i: i.name)))
muI_ports = ... # TODO: read partial model
ports |= muI_ports # if not port_sets:
# break
for port_set in powerset(ports): if False:
kernel = minimal_combination_of_instructions_using_only_port_set(port_set) print('COVERED ' + str(ports) + ':')
for port_set in sorted(combinations, key=lambda ps: [len(ps), ps]):
for kernel in combinations[port_set]:
print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in kernel), *sorted(port_set))
print('UNCOVERED ' + str(ports) + ':')
for port_set in sorted(port_sets, key=lambda ps: [len(ps), ps]):
if port_set not in combinations:
print(' ', *sorted(port_set))
print()
exit()
kernels.append(kernel) # print('MAX', max(map(len, combinations.values())))
kernels.append((complexI,) + kernel)
else:
interferes_with = collections.defaultdict(set)
for c in complex_insts: out = []
for r in representatives:
if c is r:
continue
mc = measurements[c.name]
mr = measurements[r.name]
if mc.ports_used() and mr.ports_used(): for port_set in sorted(combinations, key=lambda ps: [len(ps), ps]):
if not (mc.ports_used() & mr.ports_used()): kernels = combinations[port_set]
# does not use same ports. no interference kernels = sorted(kernels, key=len)[:3]
continue
else:
mcr = measurements[c.name, r.name]
# no interference iff MPC(A) + MPC(B) == MPC(A B) out += kernels
if abs((mc.mpc.mean + mr.mpc.mean) - mcr.mpc.mean) <= 0.1:
continue
interferes_with[c].add(r) return sorted(out, key=len)
for c, interferes in interferes_with.items(): assert complex_insts[0].name in measurements
mc = measurements[c.name]
num_muops = round(mc.muops.mean / mc.instructions.mean) if min_muops:
for i in list(complex_insts):
m = measurements[i.name]
for combi in combination_kernels([c], interferes, interferes): num_muops = round(m.unfused_muops.mean / m.instructions.mean)
kernels.append(combi)
log('MEASURE: muI x muI x complexI', f'({len(kernels)})') if num_muops < min_muops:
complex_insts.remove(i)
## higher order kernels
kernels = []
for chunk in chunks(kernels, 1000): for complexI in complex_insts:
make_measurements(chunk, measure_ports=False) for combi in combination_kernels([complexI], representatives):
kernels.append(combi)
log('MEASURE: muI x complexI', f'({len(kernels)})')
make_measurements(kernels, measure_ports=True)
kernels = []
for complexI in complex_insts:
for combi in combination_kernels([complexI], representatives, representatives):
kernels.append(combi)
log('MEASURE: muI x muI x complexI', f'({len(kernels)})')
make_measurements(kernels, measure_ports=True)
def find_equivalence_classes(*, def find_equivalence_classes(*,
measurements_file: ty.Optional[pathlib.Path], measurements_input: ty.Optional[pathlib.Path],
eq_classes_file: ty.Optional[pathlib.Path], eq_classes_file: ty.Optional[pathlib.Path],
tag: ty.Optional[str], tag: ty.Optional[str],
eq_class_output: ty.Optional[pathlib.Path], eq_class_output: ty.Optional[pathlib.Path],
measurement_output: ty.Optional[pathlib.Path], measurements_output: ty.Optional[pathlib.Path],
yaml_log: ty.Optional[pathlib.Path], yaml_log: ty.Optional[pathlib.Path],
json_log: ty.Optional[pathlib.Path], json_log: ty.Optional[pathlib.Path],
num_representatives: int, num_representatives: int,
...@@ -360,13 +390,13 @@ def find_equivalence_classes(*, ...@@ -360,13 +390,13 @@ def find_equivalence_classes(*,
common.set_process_name('pipedream-equivalence-classes') common.set_process_name('pipedream-equivalence-classes')
common.set_scheduler_params() common.set_scheduler_params()
measurements = Benchmark_Run_Summary_Aggregator( measurements = Benchmark_Run_Aggregator(
max_stddev=max_stddev, max_stddev=max_stddev,
# *0.9 to account for dropping the bottom & top 5 percent. # *0.9 to account for dropping the bottom & top 5 percent.
min_samples=MIN_NUM_SAMPLES * 0.9, min_samples=MIN_NUM_SAMPLES * 0.9,
) )
read_measurements_from_files(measurements, [measurements_file]) read_measurements_from_files(measurements, [measurements_input])
############################################################################## ##############################################################################
## build initial input equivalence classes ## build initial input equivalence classes
...@@ -407,6 +437,9 @@ def find_equivalence_classes(*, ...@@ -407,6 +437,9 @@ def find_equivalence_classes(*,
if inst.name in BLACKLISTED_INSTRUCTIONS: if inst.name in BLACKLISTED_INSTRUCTIONS:
return True return True
if inst.name == 'JMP_0':
return False
# forbid instructions with a read/write to a fixed register (forces data dependencies) # forbid instructions with a read/write to a fixed register (forces data dependencies)
for op in inst.operands: for op in inst.operands:
if not isinstance(op, ir.Register_Operand): if not isinstance(op, ir.Register_Operand):
...@@ -424,7 +457,7 @@ def find_equivalence_classes(*, ...@@ -424,7 +457,7 @@ def find_equivalence_classes(*,
EQ.make_measurements([(i,) for i in all_insts], EQ.make_measurements([(i,) for i in all_insts],
measurements, measurements,
output=measurement_output,) output=measurements_output,)
muI_grouped_by_prefix = collections.defaultdict(list) muI_grouped_by_prefix = collections.defaultdict(list)
...@@ -435,7 +468,7 @@ def find_equivalence_classes(*, ...@@ -435,7 +468,7 @@ def find_equivalence_classes(*,
run = measurements[i.name] run = measurements[i.name]
if run.num_muops != 1: if run.num_unfused_muops != 1:
continue continue
muI.append(i) muI.append(i)
...@@ -456,7 +489,7 @@ def find_equivalence_classes(*, ...@@ -456,7 +489,7 @@ def find_equivalence_classes(*,
measurements = measurements, measurements = measurements,
equivalence_classes = prefix_eq_classes, equivalence_classes = prefix_eq_classes,
eq_class_output = None, eq_class_output = None,
measurement_output = None, measurements_output = None,
yaml_log = yaml_log, yaml_log = yaml_log,
json_log = json_log, json_log = json_log,
) )
...@@ -470,7 +503,7 @@ def find_equivalence_classes(*, ...@@ -470,7 +503,7 @@ def find_equivalence_classes(*,
measurements = measurements, measurements = measurements,
equivalence_classes = equivalence_classes, equivalence_classes = equivalence_classes,
eq_class_output = eq_class_output, eq_class_output = eq_class_output,
measurement_output = measurement_output, measurements_output = measurements_output,
yaml_log = yaml_log, yaml_log = yaml_log,
json_log = json_log, json_log = json_log,
) )
...@@ -478,10 +511,10 @@ def find_equivalence_classes(*, ...@@ -478,10 +511,10 @@ def find_equivalence_classes(*,
def _find_equivalence_classes(*, def _find_equivalence_classes(*,
EQ: 'Equivalence_Class_Finder', EQ: 'Equivalence_Class_Finder',
measurements: Benchmark_Run_Summary_Aggregator, measurements: Benchmark_Run_Aggregator,
equivalence_classes: ty.List['Eq_Class'], equivalence_classes: ty.List['Eq_Class'],
eq_class_output: ty.Optional[pathlib.Path], eq_class_output: ty.Optional[pathlib.Path],
measurement_output: ty.Optional[pathlib.Path], measurements_output: ty.Optional[pathlib.Path],
yaml_log: ty.Optional[pathlib.Path], yaml_log: ty.Optional[pathlib.Path],
json_log: ty.Optional[pathlib.Path],) -> ty.List['Eq_Class']: json_log: ty.Optional[pathlib.Path],) -> ty.List['Eq_Class']:
...@@ -519,7 +552,7 @@ def _find_equivalence_classes(*, ...@@ -519,7 +552,7 @@ def _find_equivalence_classes(*,
############################################################################## ##############################################################################
## write output ## write output
if eq_class_output or measurement_output: if eq_class_output or measurements_output:
log('WRITE RESULTS') log('WRITE RESULTS')
if eq_class_output: if eq_class_output:
...@@ -534,6 +567,9 @@ def _find_equivalence_classes(*, ...@@ -534,6 +567,9 @@ def _find_equivalence_classes(*,
print(',', file=fd) print(',', file=fd)
first = False first = False
print(' ', '{', file=fd) print(' ', '{', file=fd)
print(' ', ' ', '"avg-ipc": ' + str(numpy.mean([measurements[i].ipc.mean for i in insts])) + ',', file=fd),
print(' ', ' ', '"avg-fused-mpc": ' + str(numpy.mean([measurements[i].fmpc.mean for i in insts])) + ',', file=fd),
print(' ', ' ', '"avg-unfused-mpc": ' + str(numpy.mean([measurements[i].umpc.mean for i in insts])) + ',', file=fd),
print(' ', ' ', '"insts": [' + ', '.join('"' + i + '"' for i in insts) + ']', file=fd) print(' ', ' ', '"insts": [' + ', '.join('"' + i + '"' for i in insts) + ']', file=fd)
print(' ', '}', end = '', file=fd) print(' ', '}', end = '', file=fd)
print(file=fd) print(file=fd)
...@@ -542,8 +578,8 @@ def _find_equivalence_classes(*, ...@@ -542,8 +578,8 @@ def _find_equivalence_classes(*,
except argparse.ArgumentTypeError as e: except argparse.ArgumentTypeError as e:
print(e, file=sys.stderr) print(e, file=sys.stderr)
if measurement_output is not None: if measurements_output is not None:
write_measurements(measurements, measurement_output) write_measurements(measurements, measurements_output)
return equivalence_classes return equivalence_classes
...@@ -595,6 +631,8 @@ def split_equivalence_classes(EQ, ...@@ -595,6 +631,8 @@ def split_equivalence_classes(EQ,
equivalence_classes: ty.List['Eq_Class']) -> ty.Tuple[ty.List['Eq_Class'], bool]: equivalence_classes: ty.List['Eq_Class']) -> ty.Tuple[ty.List['Eq_Class'], bool]:
assert type(equivalence_classes) is list assert type(equivalence_classes) is list
equivalence_classes = list(equivalence_classes)
changed: bool = False changed: bool = False
log(' SPLIT', len(equivalence_classes), 'CLASS(ES)') log(' SPLIT', len(equivalence_classes), 'CLASS(ES)')
...@@ -602,8 +640,8 @@ def split_equivalence_classes(EQ, ...@@ -602,8 +640,8 @@ def split_equivalence_classes(EQ,
for eq in list(equivalence_classes): for eq in list(equivalence_classes):
# log(' SPLIT', eq) # log(' SPLIT', eq)
# reps = EQ._select_representatives(eq) reps = EQ._select_representatives(eq)
reps = eq.random_sample(EQ.num_representatives) # reps = eq.random_sample(EQ.num_representatives)
EQ.make_measurements(((i,) for i in reps), measurements) EQ.make_measurements(((i,) for i in reps), measurements)
EQ.make_measurements(combination_kernels(reps, repeat=2), measurements, measure_ports=False) EQ.make_measurements(combination_kernels(reps, repeat=2), measurements, measure_ports=False)
...@@ -781,11 +819,14 @@ class Equivalence_Class_Finder: ...@@ -781,11 +819,14 @@ class Equivalence_Class_Finder:
n_random: int n_random: int
if self.random_representatives: if self.random_representatives:
n_alphabetic = 1 n_alphabetic = self.num_representatives // 4
n_random = max(0, self.num_representatives - 1) n_random = self.num_representatives - n_alphabetic
else: else:
n_alphabetic = max(0, self.num_representatives - 1) n_random = self.num_representatives // 4
n_random = 1 n_alphabetic = self.num_representatives - n_random
# n_random = 0
# n_alphabetic = self.num_representatives - n_random
assert n_alphabetic + n_random == self.num_representatives assert n_alphabetic + n_random == self.num_representatives
...@@ -798,14 +839,14 @@ class Equivalence_Class_Finder: ...@@ -798,14 +839,14 @@ class Equivalence_Class_Finder:
return reps return reps
@staticmethod @staticmethod
def eq_class_ipc_and_mpc(eq_clss: Eq_Class, measurements: Benchmark_Run_Summary_Aggregator): def eq_class_ipc_and_mpc(eq_clss: Eq_Class, measurements: Benchmark_Run_Aggregator):
insts = [(i.name,) for i in eq_clss] insts = [(i.name,) for i in eq_clss]
ipc = numpy.mean([measurements[i].ipc.mean for i in insts]) ipc = numpy.mean([measurements[i].ipc.mean for i in insts])
mpc = numpy.mean([measurements[i].mpc.mean for i in insts]) mpc = numpy.mean([measurements[i].umpc.mean for i in insts])
return ipc, mpc return ipc, mpc
@classmethod @classmethod
def log_eq_class(clss, eq_clss: Eq_Class, measurements: Benchmark_Run_Summary_Aggregator, indent: int = 2): def log_eq_class(clss, eq_clss: Eq_Class, measurements: Benchmark_Run_Aggregator, indent: int = 2):
insts = sorted(i.name for i in eq_clss) insts = sorted(i.name for i in eq_clss)
ipc, mpc = clss.eq_class_ipc_and_mpc(eq_clss, measurements) ipc, mpc = clss.eq_class_ipc_and_mpc(eq_clss, measurements)
log(' ' * indent, eq_clss, log(' ' * indent, eq_clss,
...@@ -815,7 +856,7 @@ class Equivalence_Class_Finder: ...@@ -815,7 +856,7 @@ class Equivalence_Class_Finder:
def make_measurements(self, def make_measurements(self,
kernels: ty.Iterable[ty.Tuple[ir.Instruction, ...]], kernels: ty.Iterable[ty.Tuple[ir.Instruction, ...]],
measurements: Benchmark_Run_Summary_Aggregator, measurements: Benchmark_Run_Aggregator,
*, *,
measure_ports: bool = None, measure_ports: bool = None,
force: bool = False, force: bool = False,
...@@ -846,7 +887,7 @@ class Equivalence_Class_Finder: ...@@ -846,7 +887,7 @@ class Equivalence_Class_Finder:
return False return False
extra_counters = [ extra_counters = [
# 'RESOURCE_STALLS', 'RESOURCE_STALLS',
] ]
if measure_ports: if measure_ports:
...@@ -885,29 +926,30 @@ class Equivalence_Class_Finder: ...@@ -885,29 +926,30 @@ class Equivalence_Class_Finder:
tmp_dir = str(pathlib.Path.cwd() / 'tmp'), tmp_dir = str(pathlib.Path.cwd() / 'tmp'),
debug = False,): debug = False,):
try: try:
run.drop_details()
if self.yaml_log is not None: if self.yaml_log is not None:
self.yaml_log.write('---\n') self.yaml_log.write('---\n')
yaml.dump(run, self.yaml_log) yaml.dump(run, self.yaml_log)
self.yaml_log.write('...\n') self.yaml_log.write('...\n')
if self.json_log is not None: if self.json_log is not None:
print(Benchmark_Run_Summary.from_benchmark_run(run).to_json(), file=self.json_log) print(run.to_json(), file=self.json_log)
name = run.benchmark.name name = run.benchmark.name
summary = Benchmark_Run_Summary.from_benchmark_run(run) err = test_equivalence(name, name, run, run)
err = test_equivalence(name, name, summary, summary)
if err: if err:
print(f'error: IPC({name}) != IPC({name})', ':', err, file=sys.stderr) print(f'error: IPC({name}) != IPC({name})', ':', err, file=sys.stderr)
exit(1) exit(1)
added = measurements.add_measurement(summary) added = measurements.add_measurement(run)
if NUM_SAMPLES >= MAX_NUM_SAMPLES or added: if NUM_SAMPLES >= MAX_NUM_SAMPLES or added:
key = tuple(run.benchmark.instructions) key = tuple(run.benchmark.instructions)
unknown.remove(key) unknown.remove(key)
if not added: if not added:
measurements.force_add(summary) measurements.force_add(run)
finally: finally:
if self.yaml_log is not None: if self.yaml_log is not None:
self.yaml_log.flush() self.yaml_log.flush()
...@@ -917,20 +959,17 @@ class Equivalence_Class_Finder: ...@@ -917,20 +959,17 @@ class Equivalence_Class_Finder:
NUM_SAMPLES += NUM_SAMPLES_STEP NUM_SAMPLES += NUM_SAMPLES_STEP
finally: finally:
if output: if output:
os.sched_yield()
write_measurements(measurements, output) write_measurements(measurements, output)
return True return True
def read_measurements_from_files(measurements: Benchmark_Run_Summary_Aggregator, def read_measurements_from_files(measurements: Benchmark_Run_Aggregator,
files: ty.Sequence[pathlib.Path]): files: ty.Sequence[pathlib.Path]):
for file in files: for file in files:
log('READING MEASUREMENTS FROM', shlex.quote(str(file))) log('READING MEASUREMENTS FROM', shlex.quote(str(file)))
with open(file) as fd: measurements.read_from_file(file)
for line in fd:
run = Benchmark_Run_Summary.from_json(line)
measurements.add_measurement(run)
log('FOUND', len(measurements), 'measurement(s)') log('FOUND', len(measurements), 'measurement(s)')
...@@ -938,7 +977,7 @@ def read_measurements_from_files(measurements: Benchmark_Run_Summary_Aggregator, ...@@ -938,7 +977,7 @@ def read_measurements_from_files(measurements: Benchmark_Run_Summary_Aggregator,
def read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file: str): def read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file: str):
try: try:
with argparse.FileType('r')(file) as fd: with argparse.FileType('r')(file) as fd:
json_eq_classes = json.load(fd) json_eq_classes = json.load(fd, allow_comments=True)
equivalence_classes = [] equivalence_classes = []
all_insts = [] all_insts = []
...@@ -964,7 +1003,7 @@ def read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file ...@@ -964,7 +1003,7 @@ def read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file
exit(1) exit(1)
def write_measurements(measurements: Benchmark_Run_Summary_Aggregator, file: pathlib.Path): def write_measurements(measurements: Benchmark_Run_Aggregator, file: pathlib.Path):
assert isinstance(file, pathlib.Path) assert isinstance(file, pathlib.Path)
log('WRITE MEASUREMENTS TO', shlex.quote(str(file))) log('WRITE MEASUREMENTS TO', shlex.quote(str(file)))
...@@ -972,21 +1011,17 @@ def write_measurements(measurements: Benchmark_Run_Summary_Aggregator, file: pat ...@@ -972,21 +1011,17 @@ def write_measurements(measurements: Benchmark_Run_Summary_Aggregator, file: pat
written = 0 written = 0
if str(file) in ['/dev/stdout', '/dev/stderr']: if str(file) in ['/dev/stdout', '/dev/stderr']:
with open(file, 'w') as fd: written += measurements.write_to_file(file, only_best=False)
for m in measurements.all_measurements():
print(m.to_json(), file=fd)
written += 1
else: else:
with tempfile.NamedTemporaryFile(mode='w', with tempfile.NamedTemporaryFile(mode='w',
prefix='eq-class-measurements.', prefix='eq-class-measurements.',
suffix='.jsonl', suffix=file.suffix,
delete=False,) as fd: delete=False,) as fd:
for m in measurements.all_measurements(): written += measurements.write_to_file(pathlib.Path(fd.name), only_best=False)
print(m.to_json(), file=fd) os.makedirs(file.parent, exist_ok=True)
written += 1 if file.exists():
os.makedirs(file.parent, exist_ok=True)
shutil.move(file, file.with_suffix(file.suffix + '.bkp')) shutil.move(file, file.with_suffix(file.suffix + '.bkp'))
shutil.move(fd.name, file) shutil.move(fd.name, file)
log('WROTE', written, 'measurement(s)') log('WROTE', written, 'measurement(s)')
...@@ -995,12 +1030,19 @@ def combination_kernels(*iterables: ty.Sequence[ir.Instruction], repeat: int = 1 ...@@ -995,12 +1030,19 @@ def combination_kernels(*iterables: ty.Sequence[ir.Instruction], repeat: int = 1
## AB is not necessarily the same as AABB (x86 is weird) ## AB is not necessarily the same as AABB (x86 is weird)
## so we use product instead of combinations ## so we use product instead of combinations
for combi in itertools.product(*iterables, repeat=repeat): for combi in itertools.product(*iterables, repeat=repeat):
for N in range(1, 6): for kernel in repetition_kernels(combi):
kernel = sum(((i,) * N for i in combi), ())
yield kernel yield kernel
def repetition_kernels(kernel: ty.Tuple[ir.Instruction, ...],
max_repeat: int = 5) -> ty.Iterable[ty.Tuple[ir.Instruction, ...]]:
assert max_repeat >= 1
for N in range(1, max_repeat + 1):
k = sum(((i,) * N for i in kernel), ())
yield k
class Percent: class Percent:
def __init__(self, numerator, denominator = None): def __init__(self, numerator, denominator = None):
if type(numerator) is Percent: if type(numerator) is Percent:
...@@ -1046,8 +1088,8 @@ Percent.FIVE = Percent(5, 100) ...@@ -1046,8 +1088,8 @@ Percent.FIVE = Percent(5, 100)
def test_equivalence(A: ir.Instruction, B: ir.Instruction, def test_equivalence(A: ir.Instruction, B: ir.Instruction,
runA: Benchmark_Run_Summary, runB: Benchmark_Run_Summary, runA: Benchmark_Run, runB: Benchmark_Run,
runAB: Benchmark_Run_Summary = None, *, margin: Percent = Percent.FIVE) -> ty.Optional[str]: runAB: Benchmark_Run = None, *, margin: Percent = Percent.FIVE) -> ty.Optional[str]:
""" """
Check if instruction A and B are equivalent. Check if instruction A and B are equivalent.
Returns an error message iff they are not equal, None otherwise. Returns an error message iff they are not equal, None otherwise.
...@@ -1069,17 +1111,25 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction, ...@@ -1069,17 +1111,25 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction,
dev = round(stat.stddev, 3) dev = round(stat.stddev, 3)
return f'{mean}±{dev}' return f'{mean}±{dev}'
assert runA.num_fused_muops == runB.num_fused_muops, f'num_fused_muops({A})={runA.num_fused_muops} ' \
f'num_fused_muops({B})={runB.num_fused_muops}'
assert runA.num_unfused_muops == runB.num_unfused_muops, f'num_unfused_muops({A})={runA.num_unfused_muops} ' \
f'num_unfused_muops({B})={runB.num_unfused_muops}'
assert IPCs_are_equivalent(runA, runB, margin), f'IPC({A})={pstat(runA.ipc)} IPC({B})={pstat(runB.ipc)}' assert IPCs_are_equivalent(runA, runB, margin), f'IPC({A})={pstat(runA.ipc)} IPC({B})={pstat(runB.ipc)}'
if runAB: if runAB:
assert IPCs_are_equivalent(runA, runAB, margin), f'IPC({A})={pstat(runA.ipc)} IPC({A} {B})={pstat(runAB.ipc)}' assert IPCs_are_equivalent(runA, runAB, margin), f'IPC({A})={pstat(runA.ipc)} IPC({A} {B})={pstat(runAB.ipc)}'
assert IPCs_are_equivalent(runB, runAB, margin), f'IPC({B})={pstat(runB.ipc)} IPC({A} {B})={pstat(runAB.ipc)}' assert IPCs_are_equivalent(runB, runAB, margin), f'IPC({B})={pstat(runB.ipc)} IPC({A} {B})={pstat(runAB.ipc)}'
assert MPCs_are_equivalent(runA, runB, margin), f'MPC({A})={pstat(runA.mpc)} MPC({B})={pstat(runB.mpc)}' assert MPCs_are_equivalent(runA, runB, margin), f'MPC({A})={pstat(runA.umpc)} MPC({B})={pstat(runB.umpc)}'
if runAB: if runAB:
assert MPCs_are_equivalent(runA, runAB, margin), f'MPC({A})={pstat(runA.mpc)} MPC({A} {B})={pstat(runAB.mpc)}' assert MPCs_are_equivalent(runA, runAB, margin), f'MPC({A})={pstat(runA.umpc)} MPC({A} {B})={pstat(runAB.umpc)}'
assert MPCs_are_equivalent(runB, runAB, margin), f'MPC({B})={pstat(runB.mpc)} MPC({A} {B})={pstat(runAB.mpc)}' assert MPCs_are_equivalent(runB, runAB, margin), f'MPC({B})={pstat(runB.umpc)} MPC({A} {B})={pstat(runAB.umpc)}'
if runA.port_muops and runB.port_muops: USE_PORTS = False
if USE_PORTS and runA.port_muops and runB.port_muops:
def ports_used(run) -> ty.Set[int]: def ports_used(run) -> ty.Set[int]:
"return set of ports used by a benchmark run (not by how much each port is used)" "return set of ports used by a benchmark run (not by how much each port is used)"
...@@ -1089,7 +1139,7 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction, ...@@ -1089,7 +1139,7 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction,
## a port that receives less than 5% of all muops is considered unused ## a port that receives less than 5% of all muops is considered unused
## (i.e. that usage is some measurement noise, comes from another hyperthread, ...) ## (i.e. that usage is some measurement noise, comes from another hyperthread, ...)
## TODO: investigate example ADC_GPR64i64_IMMi32 ## TODO: investigate example ADC_GPR64i64_IMMi32
if (stat.mean / run.muops.mean) >= 0.05: if (stat.mean / run.unfused_muops.mean) >= 0.05:
ports.add(port) ports.add(port)
return ports return ports
...@@ -1125,6 +1175,12 @@ def round_float(value: float) -> fractions.Fraction: ...@@ -1125,6 +1175,12 @@ def round_float(value: float) -> fractions.Fraction:
return rounded return rounded
def powerset(iterable: ty.Iterable['T']) -> ty.Iterable[ty.Tuple['T', ...]]:
"powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
s = list(iterable)
return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1))
def ttest_1samp_from_stats(mean, std, var, nobs, popmean): def ttest_1samp_from_stats(mean, std, var, nobs, popmean):
""" """
Calculate the T-test for the mean of ONE group of scores from descriptive statistics. Calculate the T-test for the mean of ONE group of scores from descriptive statistics.
...@@ -1191,29 +1247,30 @@ def tost_paired(y, x, low, upp, transform=None): ...@@ -1191,29 +1247,30 @@ def tost_paired(y, x, low, upp, transform=None):
P_VALUE = 0.05 P_VALUE = 0.05
def IPCs_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, margin: Percent = Percent.FIVE): def IPCs_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, margin: Percent = Percent.FIVE):
assert 0 <= margin <= 1 assert 0 <= margin <= 1
if round(a.ipc.p75, 2) == round(b.ipc.p75, 2): # if round(a.ipc.p75, 2) == round(b.ipc.p75, 2):
return True # return True
margin = min(a.ipc.mean, b.ipc.mean) * float(margin) margin = min(a.ipc.mean, b.ipc.mean) * float(margin)
return means_are_equivalent(a.ipc, b.ipc, margin) return means_are_equivalent(a.ipc, b.ipc, margin)
def MPCs_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, margin: Percent = Percent.FIVE): def MPCs_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, margin: Percent = Percent.FIVE):
assert 0 <= margin <= 1 assert 0 <= margin <= 1
if round(a.mpc.p75, 2) == round(b.mpc.p75, 2): # if round(a.umpc.p75, 2) == round(b.umpc.p75, 2) and round(a.fmpc.p75, 2) == round(b.fmpc.p75, 2):
return True # return True
margin = min(a.mpc.mean, b.mpc.mean) * float(margin) u_margin = min(a.umpc.mean, b.umpc.mean) * float(margin)
f_margin = min(a.fmpc.mean, b.fmpc.mean) * float(margin)
return means_are_equivalent(a.mpc, b.mpc, margin) return means_are_equivalent(a.umpc, b.umpc, u_margin) and means_are_equivalent(a.fmpc, b.fmpc, f_margin)
def port_muops_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, port: int): def port_muops_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, port: int):
## Muop per port counters aren't that precise and port usage fluctuates way more than ## Muop per port counters aren't that precise and port usage fluctuates way more than
## cycles or total number of muops, so we are only looking for a very broad sense of equality here. ## cycles or total number of muops, so we are only looking for a very broad sense of equality here.
## The average benchmark runs 5_000_000 muops, so this works out to a margin of 200_000 muops. ## The average benchmark runs 5_000_000 muops, so this works out to a margin of 200_000 muops.
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment