From 71e255517d50b15b76b4eaf2587bf5fc34c4450b Mon Sep 17 00:00:00 2001
From: Fabian Gruber <fabian.gruber@inria.fr>
Date: Wed, 31 Jul 2019 14:49:49 +0200
Subject: [PATCH] Wrote scripts to solve port-mapping ILP problem.

cherry-pick of several intermediate commits.
  00685293ca106d91e191f0be18 from Thu Jul 4 11:00:48 2019 +0200.
  9386c13adba5de1584eff6f18f from Thu May 16 08:09:25 2019 +0200
  3813e316c21427a3066f23509e from Fri May 10 16:56:23 2019 +0200
  50ee0275a794109eeda6592421 from Sun May 5 10:19:47 2019 +0200
  0b54af769395b07dc12af24ce3 from Sat May 4 20:04:48 2019 +0200
  d07f97d17f1a96a09ff5dbe502 from Sat May 4 15:30:37 2019 +0200
  e924c740bf634cf052b89f9125 from Sat May 4 15:21:27 2019 +0200
  e3138dad5270c22c15663fbb45 from Sat May 4 14:00:44 2019 +0200
  a58a53c132a23c8d2af1c69d72 from Fri May 3 16:42:26 2019 +0200

ILP model: save latest version.

Many a long night was spent hacking on this.
Deadlines were tight.
Midnight oil was burnt.
So I didn't commit intermediate stages.
---
 tools/find-eqivalence-classes | 343 +++++++-----
 tools/solve-ilp               | 965 +++++++++++++++++++++-------------
 2 files changed, 785 insertions(+), 523 deletions(-)

diff --git a/tools/find-eqivalence-classes b/tools/find-eqivalence-classes
index 4ec8be0..ce55ae7 100755
--- a/tools/find-eqivalence-classes
+++ b/tools/find-eqivalence-classes
@@ -33,6 +33,7 @@ from pipedream.utils import chunks, nub
 import pipedream.asm.ir as ir
 import pipedream.benchmark.common as common
 import pipedream.utils.yaml as yaml
+import pipedream.utils.json as json
 
 BLACKLISTED_INSTRUCTIONS = frozenset([
   # 'OR_GPR16i16_IMMi16',
@@ -77,7 +78,7 @@ def main():
 
   # inputs
   subp.add_argument('--measurements',
-                    dest='measurements_file',
+                    dest='measurements_input',
                     required=True,
                     type=pathlib.Path,
                     help='File to read benchmark results from',)
@@ -93,7 +94,11 @@ def main():
 
   # outputs
   subp.add_argument('-eo', '--eq-class-output', type=pathlib.Path, default=pathlib.Path('/dev/stdout'))
-  subp.add_argument('-mo', '--measurement-output', type=pathlib.Path, default=None)
+  subp.add_argument('-mo', '--measurements-output',
+                    dest='measurements_output',
+                    required=False,
+                    type=pathlib.Path,
+                    help='File to write benchmark results to',)
   subp.add_argument('--yaml-log', default=None, type=pathlib.Path)
   subp.add_argument('--json-log', default=None, type=pathlib.Path)
 
@@ -147,17 +152,17 @@ def main():
 
   # inputs
   subp.add_argument('--measurements',
-                    dest='measurements_file',
+                    dest='measurements_input',
                     required=True,
                     type=pathlib.Path,
-                    help='File to read benchmark results from and write them back to',)
+                    help='File to read benchmark results from',)
   subp.add_argument('--eq-classes',
                     dest='eq_classes_file',
                     required=True,
                     type=pathlib.Path,
                     help='File to read equivalence classes from',)
   subp.add_argument('--tag',
-                    default=None,
+                    required=True,
                     type=str,
                     help='Tag selector to select complex instructions',)
   subp.add_argument('--min-muops',
@@ -171,6 +176,13 @@ def main():
                       with a IPC/MPC stddev higher than this are ignored.
                     """,)
 
+  ## outputs
+  subp.add_argument('-mo', '--measurements-output',
+                    dest='measurements_output',
+                    required=False,
+                    type=pathlib.Path,
+                    help='File to write benchmark results to',)
+
   ##############################################################################
 
   args = parser.parse_args()
@@ -180,11 +192,15 @@ def main():
   if command is None:
     parser.error('must supply a command')
 
+  if args.measurements_output is None:
+    args.measurements_output = args.measurements_input
+
   command(**vars(args))
 
 
 def generate_simple_ilp_input(*,
-                              measurements_file: pathlib.Path,
+                              measurements_input: pathlib.Path,
+                              measurements_output: pathlib.Path,
                               eq_classes_file: pathlib.Path,
                               tag: ty.Optional[str],
                               min_muops: int,
@@ -193,21 +209,23 @@ def generate_simple_ilp_input(*,
   arch            = ir.Architecture.for_name('x86')
   instruction_set = arch.instruction_set()
 
-  measurements = Benchmark_Run_Summary_Aggregator(
+  measurements = Benchmark_Run_Aggregator(
     max_stddev=max_stddev,
     # *0.9 to account for dropping the bottom & top 5 percent.
     min_samples=MIN_NUM_SAMPLES * 0.9,
   )
 
+  # touch measurements file
+  open(measurements_output, 'a').close()
+
   try:
-    read_measurements_from_files(measurements, [measurements_file])
+    read_measurements_from_files(measurements, [measurements_input])
     eq_classes = read_equivalence_classes_from_file(instruction_set, eq_classes_file)
   except FileNotFoundError as e:
     print('error:', e, file=sys.stderr)
     exit(1)
 
-  complex_insts   = common.glob_instruction_tags(arch, [tag])
-  # representatives = [random.choice(eq) for eq in eq_classes]
+  complex_insts   = list(nub(common.glob_instruction_tags(arch, [tag])))
   representatives = list(nub(eq[0] for eq in eq_classes))
 
   if not complex_insts:
@@ -234,108 +252,120 @@ def generate_simple_ilp_input(*,
       ordering               = Ordering.ALPHABETIC,
     )
 
-    EQ.make_measurements(kernels, measurements, measure_ports, output=measurements_file)
+    kernels = [k for k in kernels if tuple(i.name for i in k) not in measurements]
 
-  log('MEASURE: muI')
-  make_measurements(nub((i,) for i in complex_insts + representatives), measure_ports=True)
+    CHUNK_SIZE = 2000
 
-  assert complex_insts[0].name in measurements
+    for chunk in chunks(kernels, CHUNK_SIZE):
+      try:
+        EQ.make_measurements(chunk, measurements, measure_ports=measure_ports, output=measurements_output)
+      finally:
+        os.sched_yield()
 
-  if min_muops:
-    for i in list(complex_insts):
-      m = measurements[i.name]
+  log('MEASURE: complexI', f'({len(complex_insts)})')
+  make_measurements([(i,) for i in complex_insts], measure_ports=True)
 
-      num_muops = round(m.muops.mean / m.instructions.mean)
+  complex_insts = [i for i in complex_insts if measurements[i.name].num_unfused_muops > 1]
 
-      if num_muops < min_muops:
-        complex_insts.remove(i)
+  if not complex_insts:
+    log('no real complexI found')
+    return
 
-  kernels = list(combination_kernels(complex_insts, representatives))
-  log('MEASURE: muI x complexI', f'({len(kernels)})')
-  for chunk in chunks(kernels, 1000):
-    make_measurements(chunk, measure_ports=False)
+  log('MEASURE: muI ', f'({len(representatives)})')
+  make_measurements([(i,) for i in representatives], measure_ports=True)
 
-  del kernels
+  def combinations_of_instructions_using_only_ports(ports: ty.FrozenSet['Port']) -> ty.Iterable[ty.Tuple[ir.Instruction,
+                                                                                                         ...]]:
+    port_sets = set(frozenset(ps) for ps in powerset(ports))
+    port_sets.remove(frozenset())
 
-  ## higher order kernels
+    assert frozenset([6]) in port_sets
 
-  kernels = []
+    assert len([i for i in representatives if i.name == 'JMP_0'])
 
-  if False: # NEW WAY, TODO IMPLEMENT
-    for complexI in complex_insts:
-      mCI = measurements[complexI.name]
+    combinations = collections.defaultdict(list)
 
-      if MEASURE_PORTS:
-        ## TODO: read ports from args or partial model
-        ports = mCI.ports_used()
-      else:
-        interferes_with = set()
+    # FIXME: we really want a set cover for every possible subset of ports ... but that is expensive
+    for N in range(1, 8):
+      for kernel in itertools.combinations(representatives, N):
+        kernel_ports = frozenset()
 
-        for muI in representatives:
-          mMUI    = measurements[muI.name]
-          mCI_MUI = measurements[complexI.name, muI.name]
+        for inst in kernel:
+          run = measurements[inst.name]
 
-          # no interference iff MPC(A) + MPC(B) == MPC(A B)
-          if abs((mc.mpc.mean + mr.mpc.mean) - mcr.mpc.mean) <= 0.1:
-            continue
+          kernel_ports |= run.ports_used()
 
-          interferes_with.add(muI)
+        # if kernel_ports in combinations:
+        #   kernel = min(kernel, combinations[kernel_ports], key=len)
 
-        ports = set()
+        # combinations[kernel_ports] = tuple(sorted(kernel, key=lambda i: i.name))
 
-        for muI in interferes_with:
-          muI_ports = ...  # TODO: read partial model
+        combinations[kernel_ports].append(tuple(sorted(kernel, key=lambda i: i.name)))
 
-          ports |= muI_ports
+        # if not port_sets:
+        #   break
 
-      for port_set in powerset(ports):
-        kernel = minimal_combination_of_instructions_using_only_port_set(port_set)
+    if False:
+      print('COVERED ' + str(ports) + ':')
+      for port_set in sorted(combinations, key=lambda ps: [len(ps), ps]):
+        for kernel in combinations[port_set]:
+          print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in kernel), *sorted(port_set))
+      print('UNCOVERED ' + str(ports) + ':')
+      for port_set in sorted(port_sets, key=lambda ps: [len(ps), ps]):
+        if port_set not in combinations:
+          print(' ', *sorted(port_set))
+      print()
+      exit()
 
-        kernels.append(kernel)
-        kernels.append((complexI,) + kernel)
-  else:
-    interferes_with = collections.defaultdict(set)
+    # print('MAX', max(map(len, combinations.values())))
 
-    for c in complex_insts:
-      for r in representatives:
-        if c is r:
-          continue
-        mc  = measurements[c.name]
-        mr  = measurements[r.name]
+    out = []
 
-        if mc.ports_used() and mr.ports_used():
-          if not (mc.ports_used() & mr.ports_used()):
-            # does not use same ports. no interference
-            continue
-        else:
-          mcr = measurements[c.name, r.name]
+    for port_set in sorted(combinations, key=lambda ps: [len(ps), ps]):
+      kernels = combinations[port_set]
+      kernels = sorted(kernels, key=len)[:3]
 
-          # no interference iff MPC(A) + MPC(B) == MPC(A B)
-          if abs((mc.mpc.mean + mr.mpc.mean) - mcr.mpc.mean) <= 0.1:
-            continue
+      out += kernels
 
-        interferes_with[c].add(r)
+    return sorted(out, key=len)
 
-    for c, interferes in interferes_with.items():
-      mc = measurements[c.name]
+  assert complex_insts[0].name in measurements
 
-      num_muops = round(mc.muops.mean / mc.instructions.mean)
+  if min_muops:
+    for i in list(complex_insts):
+      m = measurements[i.name]
 
-      for combi in combination_kernels([c], interferes, interferes):
-        kernels.append(combi)
+      num_muops = round(m.unfused_muops.mean / m.instructions.mean)
 
-  log('MEASURE: muI x muI x complexI', f'({len(kernels)})')
+      if num_muops < min_muops:
+        complex_insts.remove(i)
+
+  ## higher order kernels
+
+  kernels = []
 
-  for chunk in chunks(kernels, 1000):
-    make_measurements(chunk, measure_ports=False)
+  for complexI in complex_insts:
+    for combi in combination_kernels([complexI], representatives):
+      kernels.append(combi)
+
+  log('MEASURE: muI x complexI', f'({len(kernels)})')
+  make_measurements(kernels, measure_ports=True)
+
+  kernels = []
+  for complexI in complex_insts:
+    for combi in combination_kernels([complexI], representatives, representatives):
+      kernels.append(combi)
+
+  log('MEASURE: muI x muI x complexI', f'({len(kernels)})')
+  make_measurements(kernels, measure_ports=True)
 
 
 def find_equivalence_classes(*,
-                             measurements_file: ty.Optional[pathlib.Path],
+                             measurements_input: ty.Optional[pathlib.Path],
                              eq_classes_file: ty.Optional[pathlib.Path],
                              tag: ty.Optional[str],
                              eq_class_output: ty.Optional[pathlib.Path],
-                             measurement_output: ty.Optional[pathlib.Path],
+                             measurements_output: ty.Optional[pathlib.Path],
                              yaml_log: ty.Optional[pathlib.Path],
                              json_log: ty.Optional[pathlib.Path],
                              num_representatives: int,
@@ -360,13 +390,13 @@ def find_equivalence_classes(*,
   common.set_process_name('pipedream-equivalence-classes')
   common.set_scheduler_params()
 
-  measurements = Benchmark_Run_Summary_Aggregator(
+  measurements = Benchmark_Run_Aggregator(
     max_stddev=max_stddev,
     # *0.9 to account for dropping the bottom & top 5 percent.
     min_samples=MIN_NUM_SAMPLES * 0.9,
   )
 
-  read_measurements_from_files(measurements, [measurements_file])
+  read_measurements_from_files(measurements, [measurements_input])
 
   ##############################################################################
   ## build initial input equivalence classes
@@ -407,6 +437,9 @@ def find_equivalence_classes(*,
         if inst.name in BLACKLISTED_INSTRUCTIONS:
           return True
 
+        if inst.name == 'JMP_0':
+          return False
+
         # forbid instructions with a read/write to a fixed register (forces data dependencies)
         for op in inst.operands:
           if not isinstance(op, ir.Register_Operand):
@@ -424,7 +457,7 @@ def find_equivalence_classes(*,
 
       EQ.make_measurements([(i,) for i in all_insts],
                            measurements,
-                           output=measurement_output,)
+                           output=measurements_output,)
 
       muI_grouped_by_prefix = collections.defaultdict(list)
 
@@ -435,7 +468,7 @@ def find_equivalence_classes(*,
 
         run = measurements[i.name]
 
-        if run.num_muops != 1:
+        if run.num_unfused_muops != 1:
           continue
 
         muI.append(i)
@@ -456,7 +489,7 @@ def find_equivalence_classes(*,
           measurements        = measurements,
           equivalence_classes = prefix_eq_classes,
           eq_class_output     = None,
-          measurement_output  = None,
+          measurements_output = None,
           yaml_log            = yaml_log,
           json_log            = json_log,
         )
@@ -470,7 +503,7 @@ def find_equivalence_classes(*,
     measurements        = measurements,
     equivalence_classes = equivalence_classes,
     eq_class_output     = eq_class_output,
-    measurement_output  = measurement_output,
+    measurements_output = measurements_output,
     yaml_log            = yaml_log,
     json_log            = json_log,
   )
@@ -478,10 +511,10 @@ def find_equivalence_classes(*,
 
 def _find_equivalence_classes(*,
                               EQ: 'Equivalence_Class_Finder',
-                              measurements: Benchmark_Run_Summary_Aggregator,
+                              measurements: Benchmark_Run_Aggregator,
                               equivalence_classes: ty.List['Eq_Class'],
                               eq_class_output: ty.Optional[pathlib.Path],
-                              measurement_output: ty.Optional[pathlib.Path],
+                              measurements_output: ty.Optional[pathlib.Path],
                               yaml_log: ty.Optional[pathlib.Path],
                               json_log: ty.Optional[pathlib.Path],) -> ty.List['Eq_Class']:
 
@@ -519,7 +552,7 @@ def _find_equivalence_classes(*,
     ##############################################################################
     ## write output
 
-    if eq_class_output or measurement_output:
+    if eq_class_output or measurements_output:
       log('WRITE RESULTS')
 
     if eq_class_output:
@@ -534,6 +567,9 @@ def _find_equivalence_classes(*,
               print(',', file=fd)
             first = False
             print(' ', '{', file=fd)
+            print(' ', ' ', '"avg-ipc": ' + str(numpy.mean([measurements[i].ipc.mean for i in insts])) + ',', file=fd),
+            print(' ', ' ', '"avg-fused-mpc": ' + str(numpy.mean([measurements[i].fmpc.mean for i in insts])) + ',', file=fd),
+            print(' ', ' ', '"avg-unfused-mpc": ' + str(numpy.mean([measurements[i].umpc.mean for i in insts])) + ',', file=fd),
             print(' ', ' ', '"insts": [' + ', '.join('"' + i + '"' for i in insts) + ']', file=fd)
             print(' ', '}', end = '', file=fd)
           print(file=fd)
@@ -542,8 +578,8 @@ def _find_equivalence_classes(*,
       except argparse.ArgumentTypeError as e:
         print(e, file=sys.stderr)
 
-    if measurement_output is not None:
-      write_measurements(measurements, measurement_output)
+    if measurements_output is not None:
+      write_measurements(measurements, measurements_output)
 
   return equivalence_classes
 
@@ -595,6 +631,8 @@ def split_equivalence_classes(EQ,
                               equivalence_classes: ty.List['Eq_Class']) -> ty.Tuple[ty.List['Eq_Class'], bool]:
   assert type(equivalence_classes) is list
 
+  equivalence_classes = list(equivalence_classes)
+
   changed: bool = False
 
   log('  SPLIT', len(equivalence_classes), 'CLASS(ES)')
@@ -602,8 +640,8 @@ def split_equivalence_classes(EQ,
   for eq in list(equivalence_classes):
     # log('    SPLIT', eq)
 
-    # reps = EQ._select_representatives(eq)
-    reps = eq.random_sample(EQ.num_representatives)
+    reps = EQ._select_representatives(eq)
+    # reps = eq.random_sample(EQ.num_representatives)
 
     EQ.make_measurements(((i,) for i in reps), measurements)
     EQ.make_measurements(combination_kernels(reps, repeat=2), measurements, measure_ports=False)
@@ -781,11 +819,14 @@ class Equivalence_Class_Finder:
     n_random: int
 
     if self.random_representatives:
-      n_alphabetic = 1
-      n_random     = max(0, self.num_representatives - 1)
+      n_alphabetic = self.num_representatives // 4
+      n_random     = self.num_representatives - n_alphabetic
     else:
-      n_alphabetic = max(0, self.num_representatives - 1)
-      n_random     = 1
+      n_random     = self.num_representatives // 4
+      n_alphabetic = self.num_representatives - n_random
+
+    # n_random     = 0
+    # n_alphabetic = self.num_representatives - n_random
 
     assert n_alphabetic + n_random == self.num_representatives
 
@@ -798,14 +839,14 @@ class Equivalence_Class_Finder:
     return reps
 
   @staticmethod
-  def eq_class_ipc_and_mpc(eq_clss: Eq_Class, measurements: Benchmark_Run_Summary_Aggregator):
+  def eq_class_ipc_and_mpc(eq_clss: Eq_Class, measurements: Benchmark_Run_Aggregator):
     insts = [(i.name,) for i in eq_clss]
     ipc   = numpy.mean([measurements[i].ipc.mean for i in insts])
-    mpc   = numpy.mean([measurements[i].mpc.mean for i in insts])
+    mpc   = numpy.mean([measurements[i].umpc.mean for i in insts])
     return ipc, mpc
 
   @classmethod
-  def log_eq_class(clss, eq_clss: Eq_Class, measurements: Benchmark_Run_Summary_Aggregator, indent: int = 2):
+  def log_eq_class(clss, eq_clss: Eq_Class, measurements: Benchmark_Run_Aggregator, indent: int = 2):
     insts    = sorted(i.name for i in eq_clss)
     ipc, mpc = clss.eq_class_ipc_and_mpc(eq_clss, measurements)
     log(' ' * indent, eq_clss,
@@ -815,7 +856,7 @@ class Equivalence_Class_Finder:
 
   def make_measurements(self,
                         kernels: ty.Iterable[ty.Tuple[ir.Instruction, ...]],
-                        measurements: Benchmark_Run_Summary_Aggregator,
+                        measurements: Benchmark_Run_Aggregator,
                         *,
                         measure_ports: bool = None,
                         force: bool = False,
@@ -846,7 +887,7 @@ class Equivalence_Class_Finder:
       return False
 
     extra_counters = [
-      # 'RESOURCE_STALLS',
+      'RESOURCE_STALLS',
     ]
 
     if measure_ports:
@@ -885,29 +926,30 @@ class Equivalence_Class_Finder:
                                          tmp_dir               = str(pathlib.Path.cwd() / 'tmp'),
                                          debug                 = False,):
           try:
+            run.drop_details()
+
             if self.yaml_log is not None:
               self.yaml_log.write('---\n')
               yaml.dump(run, self.yaml_log)
               self.yaml_log.write('...\n')
 
             if self.json_log is not None:
-              print(Benchmark_Run_Summary.from_benchmark_run(run).to_json(), file=self.json_log)
+              print(run.to_json(), file=self.json_log)
 
-            name    = run.benchmark.name
-            summary = Benchmark_Run_Summary.from_benchmark_run(run)
-            err     = test_equivalence(name, name, summary, summary)
+            name = run.benchmark.name
+            err  = test_equivalence(name, name, run, run)
 
             if err:
               print(f'error: IPC({name}) != IPC({name})', ':', err, file=sys.stderr)
               exit(1)
 
-            added = measurements.add_measurement(summary)
+            added = measurements.add_measurement(run)
 
             if NUM_SAMPLES >= MAX_NUM_SAMPLES or added:
               key = tuple(run.benchmark.instructions)
               unknown.remove(key)
               if not added:
-                measurements.force_add(summary)
+                measurements.force_add(run)
           finally:
             if self.yaml_log is not None:
               self.yaml_log.flush()
@@ -917,20 +959,17 @@ class Equivalence_Class_Finder:
         NUM_SAMPLES += NUM_SAMPLES_STEP
     finally:
       if output:
+        os.sched_yield()
         write_measurements(measurements, output)
     return True
 
 
-def read_measurements_from_files(measurements: Benchmark_Run_Summary_Aggregator,
+def read_measurements_from_files(measurements: Benchmark_Run_Aggregator,
                                  files: ty.Sequence[pathlib.Path]):
   for file in files:
     log('READING MEASUREMENTS FROM', shlex.quote(str(file)))
 
-    with open(file) as fd:
-      for line in fd:
-        run = Benchmark_Run_Summary.from_json(line)
-
-        measurements.add_measurement(run)
+    measurements.read_from_file(file)
 
   log('FOUND', len(measurements), 'measurement(s)')
 
@@ -938,7 +977,7 @@ def read_measurements_from_files(measurements: Benchmark_Run_Summary_Aggregator,
 def read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file: str):
   try:
     with argparse.FileType('r')(file) as fd:
-      json_eq_classes = json.load(fd)
+      json_eq_classes = json.load(fd, allow_comments=True)
 
       equivalence_classes = []
       all_insts = []
@@ -964,7 +1003,7 @@ def read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file
     exit(1)
 
 
-def write_measurements(measurements: Benchmark_Run_Summary_Aggregator, file: pathlib.Path):
+def write_measurements(measurements: Benchmark_Run_Aggregator, file: pathlib.Path):
   assert isinstance(file, pathlib.Path)
 
   log('WRITE MEASUREMENTS TO', shlex.quote(str(file)))
@@ -972,21 +1011,17 @@ def write_measurements(measurements: Benchmark_Run_Summary_Aggregator, file: pat
   written = 0
 
   if str(file) in ['/dev/stdout', '/dev/stderr']:
-    with open(file, 'w') as fd:
-      for m in measurements.all_measurements():
-        print(m.to_json(), file=fd)
-        written += 1
+    written += measurements.write_to_file(file, only_best=False)
   else:
     with tempfile.NamedTemporaryFile(mode='w',
                                      prefix='eq-class-measurements.',
-                                     suffix='.jsonl',
+                                     suffix=file.suffix,
                                      delete=False,) as fd:
-      for m in measurements.all_measurements():
-        print(m.to_json(), file=fd)
-        written += 1
-      os.makedirs(file.parent, exist_ok=True)
+      written += measurements.write_to_file(pathlib.Path(fd.name), only_best=False)
+    os.makedirs(file.parent, exist_ok=True)
+    if file.exists():
       shutil.move(file, file.with_suffix(file.suffix + '.bkp'))
-      shutil.move(fd.name, file)
+    shutil.move(fd.name, file)
 
   log('WROTE', written, 'measurement(s)')
 
@@ -995,12 +1030,19 @@ def combination_kernels(*iterables: ty.Sequence[ir.Instruction], repeat: int = 1
   ## AB is not necessarily the same as AABB (x86 is weird)
   ## so we use product instead of combinations
   for combi in itertools.product(*iterables, repeat=repeat):
-    for N in range(1, 6):
-      kernel = sum(((i,) * N for i in combi), ())
-
+    for kernel in repetition_kernels(combi):
       yield kernel
 
 
+def repetition_kernels(kernel: ty.Tuple[ir.Instruction, ...],
+                       max_repeat: int = 5) -> ty.Iterable[ty.Tuple[ir.Instruction, ...]]:
+  assert max_repeat >= 1
+
+  for N in range(1, max_repeat + 1):
+    k = sum(((i,) * N for i in kernel), ())
+    yield k
+
+
 class Percent:
   def __init__(self, numerator, denominator = None):
     if type(numerator) is Percent:
@@ -1046,8 +1088,8 @@ Percent.FIVE = Percent(5, 100)
 
 
 def test_equivalence(A: ir.Instruction, B: ir.Instruction,
-                     runA: Benchmark_Run_Summary, runB: Benchmark_Run_Summary,
-                     runAB: Benchmark_Run_Summary = None, *, margin: Percent = Percent.FIVE) -> ty.Optional[str]:
+                     runA: Benchmark_Run, runB: Benchmark_Run,
+                     runAB: Benchmark_Run = None, *, margin: Percent = Percent.FIVE) -> ty.Optional[str]:
   """
     Check if instruction A and B are equivalent.
     Returns an error message iff they are not equal, None otherwise.
@@ -1069,17 +1111,25 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction,
       dev  = round(stat.stddev, 3)
       return f'{mean}±{dev}'
 
+    assert runA.num_fused_muops == runB.num_fused_muops, f'num_fused_muops({A})={runA.num_fused_muops} ' \
+                                                         f'num_fused_muops({B})={runB.num_fused_muops}'
+
+    assert runA.num_unfused_muops == runB.num_unfused_muops, f'num_unfused_muops({A})={runA.num_unfused_muops} ' \
+                                                             f'num_unfused_muops({B})={runB.num_unfused_muops}'
+
     assert IPCs_are_equivalent(runA, runB, margin),  f'IPC({A})={pstat(runA.ipc)} IPC({B})={pstat(runB.ipc)}'
     if runAB:
       assert IPCs_are_equivalent(runA, runAB, margin), f'IPC({A})={pstat(runA.ipc)} IPC({A} {B})={pstat(runAB.ipc)}'
       assert IPCs_are_equivalent(runB, runAB, margin), f'IPC({B})={pstat(runB.ipc)} IPC({A} {B})={pstat(runAB.ipc)}'
 
-    assert MPCs_are_equivalent(runA, runB, margin),  f'MPC({A})={pstat(runA.mpc)} MPC({B})={pstat(runB.mpc)}'
+    assert MPCs_are_equivalent(runA, runB, margin),  f'MPC({A})={pstat(runA.umpc)} MPC({B})={pstat(runB.umpc)}'
     if runAB:
-      assert MPCs_are_equivalent(runA, runAB, margin), f'MPC({A})={pstat(runA.mpc)} MPC({A} {B})={pstat(runAB.mpc)}'
-      assert MPCs_are_equivalent(runB, runAB, margin), f'MPC({B})={pstat(runB.mpc)} MPC({A} {B})={pstat(runAB.mpc)}'
+      assert MPCs_are_equivalent(runA, runAB, margin), f'MPC({A})={pstat(runA.umpc)} MPC({A} {B})={pstat(runAB.umpc)}'
+      assert MPCs_are_equivalent(runB, runAB, margin), f'MPC({B})={pstat(runB.umpc)} MPC({A} {B})={pstat(runAB.umpc)}'
 
-    if runA.port_muops and runB.port_muops:
+    USE_PORTS = False
+
+    if USE_PORTS and runA.port_muops and runB.port_muops:
       def ports_used(run) -> ty.Set[int]:
         "return set of ports used by a benchmark run (not by how much each port is used)"
 
@@ -1089,7 +1139,7 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction,
           ## a port that receives less than 5% of all muops is considered unused
           ## (i.e. that usage is some measurement noise, comes from another hyperthread, ...)
           ## TODO: investigate example ADC_GPR64i64_IMMi32
-          if (stat.mean / run.muops.mean) >= 0.05:
+          if (stat.mean / run.unfused_muops.mean) >= 0.05:
             ports.add(port)
 
         return ports
@@ -1125,6 +1175,12 @@ def round_float(value: float) -> fractions.Fraction:
   return rounded
 
 
+def powerset(iterable: ty.Iterable['T']) -> ty.Iterable[ty.Tuple['T', ...]]:
+    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
+    s = list(iterable)
+    return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1))
+
+
 def ttest_1samp_from_stats(mean, std, var, nobs, popmean):
   """
     Calculate the T-test for the mean of ONE group of scores from descriptive statistics.
@@ -1191,29 +1247,30 @@ def tost_paired(y, x, low, upp, transform=None):
 P_VALUE = 0.05
 
 
-def IPCs_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, margin: Percent = Percent.FIVE):
+def IPCs_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, margin: Percent = Percent.FIVE):
   assert 0 <= margin <= 1
 
-  if round(a.ipc.p75, 2) == round(b.ipc.p75, 2):
-    return True
+  # if round(a.ipc.p75, 2) == round(b.ipc.p75, 2):
+  #   return True
 
   margin = min(a.ipc.mean, b.ipc.mean) * float(margin)
 
   return means_are_equivalent(a.ipc, b.ipc, margin)
 
 
-def MPCs_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, margin: Percent = Percent.FIVE):
+def MPCs_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, margin: Percent = Percent.FIVE):
   assert 0 <= margin <= 1
 
-  if round(a.mpc.p75, 2) == round(b.mpc.p75, 2):
-    return True
+  # if round(a.umpc.p75, 2) == round(b.umpc.p75, 2) and round(a.fmpc.p75, 2) == round(b.fmpc.p75, 2):
+  #   return True
 
-  margin = min(a.mpc.mean, b.mpc.mean) * float(margin)
+  u_margin = min(a.umpc.mean, b.umpc.mean) * float(margin)
+  f_margin = min(a.fmpc.mean, b.fmpc.mean) * float(margin)
 
-  return means_are_equivalent(a.mpc, b.mpc, margin)
+  return means_are_equivalent(a.umpc, b.umpc, u_margin) and means_are_equivalent(a.fmpc, b.fmpc, f_margin)
 
 
-def port_muops_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, port: int):
+def port_muops_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, port: int):
   ## Muop per port counters aren't that precise and port usage fluctuates way more than
   ## cycles or total number of muops, so we are only looking for a very broad sense of equality here.
   ## The average benchmark runs 5_000_000 muops, so this works out to a margin of 200_000 muops.
diff --git a/tools/solve-ilp b/tools/solve-ilp
index e0c83f4..509ce8f 100644
--- a/tools/solve-ilp
+++ b/tools/solve-ilp
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 
 import argparse
 import collections
@@ -5,45 +6,28 @@ import dataclasses
 import enum
 import functools
 import itertools
-import json
+import operator
 import pathlib
 import random
 import sys
+import typing as ty
 
-sys.path.append(str(pathlib.Path(__file__).parent / 'src'))
+try:
+  import pipedream
+except ImportError:
+  sys.path.append(str(pathlib.Path(__file__).parent.parent / 'src'))
 
 from pipedream.ilp import *
-from pipedream.benchmark.types import Benchmark_Spec, Benchmark_Run_Summary, Benchmark_Run_Summary_Aggregator
+from pipedream.benchmark.types import Benchmark_Spec, Benchmark_Run, Benchmark_Run_Aggregator
+import pipedream.utils.json as json
+import pipedream.utils.terminal as terminal
 
 MEASURE_PORTS = True
 
 
-def main_solve_for_muI_mapping(num_ports: int = 8):
-  parser = argparse.ArgumentParser()
-
-  parser.add_argument('FILE')
-  parser.add_argument('EQUIVALENCE_CLASSES')
-
-  args = parser.parse_args()
-
-  measurements = Benchmark_Run_Summary_Aggregator(max_stddev=math.inf, min_samples=0)
-
-  for F in [args.FILE]:
-    with open(F) as fd:
-      try:
-        for line in fd:
-          run = Benchmark_Run_Summary.from_json(line)
-
-          measurements.add_measurement(run)
-      except json.JSONDecodeError as e:
-        print('error: malformed file', repr(F) + ':', e, file=sys.stderr)
-
-  with open(args.EQUIVALENCE_CLASSES) as fd:
-    eq_json    = json.load(fd)
-    eq_classes = [eq['insts'] for eq in eq_json]
-
-    # muIs = [random.choice(eq) for eq in eq_classes]
-    muIs = [eq[0] for eq in eq_classes]
+def main_solve_for_muI_mapping(measurements: Benchmark_Run_Aggregator,
+                               representatives: ty.List[str], num_ports: int = 8):
+  muIs = representatives
 
   muI_runs  = [measurements[muI] for muI in muIs]
   pair_runs = [measurements[a, b] for a, b in itertools.combinations(muIs, 2)]
@@ -51,23 +35,27 @@ def main_solve_for_muI_mapping(num_ports: int = 8):
   KERNELS                 = []
   INSTS                   = []
   REAL_MUOPS              = []  # filled in below
-  SLOWDOWN_MUOPS          = []  # filled in below
   REAL_PORTS              = [Port(f'P{i}') for i in range(num_ports)]
-  SLOWDOWN_PORTS          = []  # filled in below
+  REAL_PORTS              = [Port(f'P{i}') for i in [0, 1, 2, 3, 5, 6]]
+  THROTTLES               = []  # filled in below
   kernels_with_bottleneck = set()
 
   NAME_2_INST = {}
 
   delta_IM = {}
   delta_MP = {}
+  delta_IT = {}
 
   print('SELECTED MUOP INSTRUCTIONS', len(muI_runs))
 
+  for M in muI_runs:
+    print(M.name)
+
   for run in list(muI_runs):
     assert len(run.kernel) == 1
 
-    num_muops = round(run.muops.mean / run.instructions.mean)
-    mpc       = run.mpc.mean
+    num_muops = round(run.unfused_muops.mean / run.instructions.mean)
+    mpc       = run.umpc.mean
 
     assert mpc > 0.1, run.name
 
@@ -75,8 +63,8 @@ def main_solve_for_muI_mapping(num_ports: int = 8):
 
     slowdown = (mpc < 0.95)
 
-    n = ' '.join(run.kernel)
-    i = Inst(name=n, num_muops=num_muops)
+    n = run.name
+    i = Inst(name=n, num_fused_muops=run.num_fused_muops, num_unfused_muops=run.num_unfused_muops)
     m = Muop(name=n)
 
     INSTS          += [i]
@@ -86,455 +74,583 @@ def main_solve_for_muI_mapping(num_ports: int = 8):
     delta_IM[i, m] = True
 
     if slowdown:
-      slowdown_m = Muop(name='slowdown_' + n)
-      slowdown_p = Port(name=f'slowdown {round(mpc, 2)}', max_throughput=mpc + 0.1, type=Port_Type.SLOWDOWN)
+      throttle = Port(name=f'throttle {round(mpc, 2)}', max_throughput=mpc + 0.1, type=Port_Type.SLOWDOWN)
 
-      SLOWDOWN_MUOPS.append(slowdown_m)
-      SLOWDOWN_PORTS.append(slowdown_p)
+      THROTTLES.append(throttle)
 
-      delta_IM[i, slowdown_m] = True
-      delta_MP[slowdown_m, slowdown_p] = True
+      delta_IT[i, throttle] = True
 
     print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(75), ':',
-          f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', ':',
+          f'IPC({run.ipc.mean:5.3}) MPC({run.umpc.mean:5.3})', ':',
           '(SLOWDOWN)' if slowdown else '')
 
-    MUOPS = REAL_MUOPS + SLOWDOWN_MUOPS
-    PORTS = REAL_PORTS + SLOWDOWN_PORTS + SPEEDUP_PORTS
+    MUOPS     = REAL_MUOPS
+    PORTS     = REAL_PORTS
+    THROTTLES = THROTTLES
     INSTS.sort(key=lambda i: i.name)
     MUOPS.sort(key=lambda m: m.name)
     PORTS.sort(key=lambda p: p.name)
+    THROTTLES.sort(key=lambda p: p.name)
 
-    for m in SLOWDOWN_MUOPS:
-      for p in PORTS:
-        delta_MP.setdefault((m, p), False)
+  for run in sorted(muI_runs, key=lambda r: r.name) + sorted(pair_runs, key=lambda r: r.name):
+    # assert all(i in NAME_2_INST for i in run.kernel), run.kernel
 
-    for m in REAL_MUOPS:
-      for p in SLOWDOWN_PORTS:
-        delta_MP[m, p] = False
-
-    for run in sorted(muI_runs) + sorted(pair_runs):
-      assert all(i in NAME_2_INST for i in run.kernel), run.kernel
-
-      if run in pair_runs:
-        print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(75), ':',
-              f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})',
-              f': {" ".join(str(p) for p in sorted(ports_used(run)))}')
+    if run in pair_runs:
+      print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(75), ':',
+            f'IPC({run.ipc.mean:5.3}) MPC({run.umpc.mean:5.3})',
+            f': {" ".join(str(p) for p in sorted(ports_used(run)))}')
 
-      ipc   = run.ipc.p90
-      mpc   = run.mpc.p90
+    ipc   = run.ipc.p90
+    umpc  = run.umpc.p90
+    fmpc  = run.fmpc.p90
+    try:
       insts = [NAME_2_INST[i] for i in run.kernel]
+    except KeyError:
+      print(NAME_2_INST)
+      raise
 
-      K = Kernel(mpc=mpc, ipc=ipc, insts=insts)
-      KERNELS += [K]
-
-      if len(run.kernel) == 1 and run.kernel[0] in complexIs:
-        KERNELS += [Kernel(mpc=mpc, ipc=ipc, insts=insts)]
+    K = Kernel(insts=insts, ipc=ipc, fmpc=fmpc, umpc=umpc)
+    KERNELS += [K]
 
-      # FIXME: ??? how to do this without ports ???
-      has_bottleneck = run.mpc < 0.95
+    # FIXME: ??? how to do this without ports ???
+    # has_bottleneck = run.umpc.mean >= 0.95
+    # assert has_bottleneck
+    has_bottleneck = True
 
-      if has_bottleneck:
-        kernels_with_bottleneck.add(K)
+    if has_bottleneck:
+      kernels_with_bottleneck.add(K)
 
-    KERNELS = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(K))))
-    INSTS   = Domain(Inst, INSTS)
-    MUOPS   = Domain(Muop, MUOPS)
-    PORTS   = Domain(Port, PORTS)
+  KERNELS = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(k))))
+  INSTS   = Domain(Inst, INSTS)
+  MUOPS   = Domain(Muop, MUOPS)
+  PORTS   = Domain(Port, PORTS)
 
+  if verbose:
     print('SOLVE WITH', len(KERNELS), 'KERNELS')
 
-    # print(*[I.name for I in INSTS])
-    # print(*[repr(str(K)) for K in KERNELS])
+  # print(*[I.name for I in INSTS])
+  # print(*[repr(str(K)) for K in KERNELS])
 
-    try:
-      outputs = solve_for_delta(
-        KERNELS, INSTS, MUOPS, PORTS,
-        const_delta_im          = delta_IM,
-        const_delta_mp          = delta_MP,
-        kernels_with_bottleneck = kernels_with_bottleneck,
-        saturation_margin       = 0.05,
-        max_error               = 0.06,
-        min_throughput          = 0.05,
-        print_iis               = True,
-        # print_iis               = False,
-      )
-
-      muI_outputs = [o for o in outputs if len(o.kernel) == 1]
-
-      print()
-      print('FOUND DECOMPOSITIONS:')
-      for o in muI_outputs:
-        print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':',
-              o.cpu.merged_muop_str(o.used_muops()))
-      print()
-
-      ilp_outputs += muI_outputs
-      all_outputs += outputs
-    except ILP_Error as e:
-      print(e, file=sys.stderr)
+  try:
+    outputs = solve_for_delta(
+      KERNELS, INSTS, MUOPS, PORTS,
+      const_delta_im          = delta_IM,
+      const_delta_mp          = delta_MP,
+      kernels_with_bottleneck = kernels_with_bottleneck,
+      saturation_margin       = 0.975,
+      max_error               = 0.01,
+      min_throughput          = 0.05,
+      # print_iis               = True,
+      print_iis               = False,
+    )
+
+    muI_outputs = [o for o in outputs if len(o.kernel) == 1]
+
+    # print()
+    # print('FOUND DECOMPOSITIONS:')
+    # for o in muI_outputs:
+    #   print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':',
+    #         o.cpu.merged_muop_str(o.used_muops()))
+    # print()
+
+    ilp_outputs = list(muI_outputs)
+    all_outputs = list(outputs)
+  except ILP_Error as e:
+    # print(e, file=sys.stderr)
+    raise
   if not ilp_outputs:
     exit()
 
   show_outputs(ilp_outputs, all_outputs)
 
 
-def main_solve_for_complex_instructions():
-  parser = argparse.ArgumentParser()
+def solve_for_complex_instruction(complexI, muIs, measurements,
+                                  hardcoded_mapping: ty.Optional[ty.Collection[ty.Iterable[int]]] = None, *,
+                                  verbose: bool):
+  assert complexI not in muIs
 
-  parser.add_argument('FILE')
-  parser.add_argument('EQUIVALENCE_CLASSES')
+  complexI_run = measurements[complexI]
+  muI_runs      = [measurements[muI] for muI in muIs]
+  combined_runs = []
 
-  args = parser.parse_args()
+  muI_set  = frozenset(muIs)
+  inst_set = frozenset([complexI]) | muI_set
 
-  measurements = Benchmark_Run_Summary_Aggregator(max_stddev=math.inf, min_samples=0)
+  for run in measurements:
+    run_insts = frozenset(run.kernel)
 
-  for F in [args.FILE]:
-    with open(F) as fd:
-      try:
-        for line in fd:
-          run = Benchmark_Run_Summary.from_json(line)
+    if len(run.kernel) > 1 and complexI in run.kernel and (run_insts & inst_set == run_insts):
+      combined_runs.append(run)
 
-          key = collections.Counter(run.kernel)
-          if len(key) == 1:
-            gcd = next(iter(key.values()))
-          else:
-            gcd = functools.reduce(math.gcd, key.values())
+  combined_runs.sort(key=lambda r: [len(r.kernel), r.kernel])
 
-          for i, cnt in key.items():
-            assert cnt % gcd == 0, [cnt, gcd]
-            key[i] = cnt // gcd
+  ### ILP
 
-          kernel = tuple(sorted(sum([[i] * c for i, c in key.items()], [])))
-          if len(key) == 1:
-            run = dataclasses.replace(run, kernel=kernel)
+  KERNELS                 = []
+  INSTS                   = []
+  REAL_MUOPS              = []  # filled in below
+  COMPLEX_MUOPS           = []  # filled in below
+  REAL_PORTS              = [Port(f'P{i}') for i in range(8)]
+  # REAL_PORTS              = [Port(f'P{i}') for i in [0, 1, 2, 3, 5, 6]]
+  # REAL_PORTS              = [Port(f'P{i}') for i in [0, 1, 5, 6]]
+  THROTTLES               = []  # filled in below
+  kernels_with_bottleneck = set()
 
-          if kernel in measurements:
-            old_run = measurements[kernel]
+  def id2port(port_id: int):
+    ps = [p for p in REAL_PORTS if p.name == f'P{port_id}']
+    assert len(ps) == 1, [port_id, ps]
+    return ps[0]
 
-            if run.mpc.p90 > old_run.mpc.p90:
-              assert abs(run.mpc.p90 - run.mpc.mean) < 0.1, run.kernel
+  NAME_2_INST = {}
 
-              measurements.remove_measurement(kernel)
-              measurements.force_add(run)
-          else:
-            measurements.add_measurement(run)
-      except json.JSONDecodeError as e:
-        print('error: malformed file', repr(F) + ':', e, file=sys.stderr)
+  delta_IM = collections.defaultdict(bool)
+  delta_MP = {}
+  delta_IP = {}
+  delta_IT = {}
+  mu_KP    = {}
 
-  with open(args.EQUIVALENCE_CLASSES) as fd:
-    eq_json    = json.load(fd)
-    eq_classes = [eq['insts'] for eq in eq_json]
+  def ports_used(run: Benchmark_Run) -> ty.FrozenSet[Port]:
+    if run.port_muops:
+      return frozenset(id2port(p) for p in run.ports_used())
+    else:
+      ports = frozenset()
 
-    # muIs = [random.choice(eq) for eq in eq_classes]
-    muIs = [eq[0] for eq in eq_classes]
+      for i in run.kernel:
+        run = measurements[i]
 
-    for muI in list(muIs):
-      run = measurements[muI]
+        ports |= ports_used(run)
 
-      ## exclude movups/movaps
-      if run.mpc.mean >= len(run.ports_used()):
-        muIs.remove(muI)
-        continue
+      return ports
 
-      ## exclude div/sqrt/...
-      if run.mpc.mean <= 0.5:
-        muIs.remove(muI)
-        continue
+  if verbose:
+    print('COMPLEX INSTRUCTION:', complexI)
+    for run in [complexI_run]:
+      print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':',
+            f'IPC({run.ipc.mean:5.3}) fMPC({run.fmpc.mean:5.3}) uMPC({run.umpc.mean:5.3})', ':',
+            f'{" ".join(str(p) for p in sorted(ports_used(run)))}',)
+    print('SELECTED MUOP INSTRUCTIONS', len(muI_runs))
+    for run in sorted(muI_runs, key=lambda r: r.kernel):
+      print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':',
+            f'IPC({run.ipc.mean:5.3}) fMPC({run.fmpc.mean:5.3}) uMPC({run.umpc.mean:5.3})', ':',
+            f'{" ".join(str(p) for p in sorted(ports_used(run)))}',)
+    print('SELECTED COMBINATIONS:', len(combined_runs))
+    for run in sorted(combined_runs, key=lambda r: str(r.kernel)):
+      ports = frozenset()
+      for i in run.kernel:
+        if i in complexI:
+          continue
+        ports |= ports_used(measurements[i])
+      print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(110), ':',
+            f'IPC({run.ipc.mean:5.3}) fMPC({run.fmpc.mean:5.3}) uMPC({run.umpc.mean:5.3})',
+            f': {" ".join(str(p) for p in sorted(ports))}')
 
-    # muIs = random.sample(muIs, 15)
+  for run in [complexI_run]:
+    assert len(run.kernel) == 1
 
-  muIs = [
-    'ADD_GPR32i32_GPR32i32',
-    'ADCX_GPR32i32_GPR32i32',
-    'ADC_GPR32i32_IMMi32',
-    'ADDPD_VR128f64x2_VR128f64x2',
-    'ADOX_GPR32i32_GPR32i32',
-    'AESDECLAST_VR128i32x4_VR128i32x4',
-    'ANDN_GPR32i32_GPR32i32_GPR32i32',
-    'ANDNPD_VR128u64x2_VR128u64x2',
-    'BLENDVPD_VR128f64x2_VR128f64x2',
-    'BSF_GPR32i32_GPR32i32',
-    'BTC_GPR32i32_GPR32i32',
-    'CMOVNZ_GPR32i32_GPR32i32',
-    'INSERTPS_VR128f32x4_VR128f32x4_IMMu8',
-    'LDDQU_VR128f64x2_MEM64i32x4',
-    'MOV_GPR64i64_IMMi64',
-    'XOR_GPR32i32_IMMi32',
-    # 'MOVAPD_VR128f64x2_VR128f64x2',
-  ]
+    assert run.num_unfused_muops > 1, [run.kernel, run.num_unfused_muops]
 
-  complexIs = [
-    # 'ADC_EAXi32_IMMi32',
-    # 'ADD_GPR64i64_MEM64i64',
-    'BEXTR_GPR64i64_GPR64i64_GPR64i64',
-    # 'CVTTPD2DQ_VR128i32x4_VR128f64x2',
-    # 'PACKSSWB_VR128i16x8_MEM64i16x8_SSE2',
-    # 'PCMPGTB_VR128i8x16_MEM64i8x16_PENTIUMMMX',
-  ]
+    n = ' '.join(run.kernel)
+    i = Inst(name=n, num_fused_muops=run.num_fused_muops, num_unfused_muops=run.num_unfused_muops)
 
-  # print('!(' + '|'.join(I for I in muIs + complexIs) + ')')
-  # exit()
+    INSTS += [i]
+    NAME_2_INST[n] = i
 
-  all_outputs = []
-  ilp_outputs = []
+    muops = [Muop(f'Mx{i}') for i in range(run.num_unfused_muops)]
+    COMPLEX_MUOPS += muops
 
-  muI_runs      = [measurements[muI] for muI in muIs]
-  combined_runs = []
+    for m in muops:
+      delta_IM[i, m] = True
 
-  for complexI in complexIs:
-    complexI_run = measurements[complexI]
+    run_ports = ports_used(run)
+    assert run_ports
 
-    muI_set  = frozenset(muIs)
-    inst_set = frozenset([complexI]) | muI_set
+    for p in REAL_PORTS:
+      delta_IP[i, p] = (p in run_ports)
 
-    data = collections.defaultdict(set)
-    for run in measurements:
-      run_insts = frozenset(run.kernel)
+    def set_hardcoded_mapping(mapping):
+      assert len(mapping) == run.num_unfused_muops
 
-      if len(run.kernel) > 1 and complexI in run.kernel and (run_insts & inst_set == run_insts):
-        # print(*sorted(run_insts))
-        data[run_insts].add(run)
+      for muop, ports in enumerate(mapping):
+        assert len(ports) == len(set(ports))
 
-    for _, runs in data.items():
-      runs = sorted(runs, key=lambda r: len(r.kernel))
+        ports = [REAL_PORTS[p] for p in ports]
 
-      for run in runs[:1] + runs[-2:-1]:
-        if run not in combined_runs:
-          combined_runs.append(run)
+        for port in REAL_PORTS:
+          delta_MP[muops[muop], port] = bool(port in ports)
 
-    combined_runs.sort(key=lambda r: [len(r.kernel), r.kernel])
+    if hardcoded_mapping:
+      set_hardcoded_mapping(hardcoded_mapping)
+    elif 0:
+      if n == 'BEXTR_GPR64i64_GPR64i64_GPR64i64':
+        set_hardcoded_mapping([(0, 6), (1, 5)])
+        set_hardcoded_mapping([(0, 1, 6), (1, 5, 6)])
+        set_hardcoded_mapping([(0, 1, 5, 6), (0, 1, 5, 6)])
 
-    ### ILP
+      if n == 'LEA_GPR16i16_ADDR64i64':
+        set_hardcoded_mapping([(0, 1, 5, 6), (1, 5)])
 
-    KERNELS                 = []
-    INSTS                   = []
-    REAL_MUOPS              = []  # filled in below
-    SLOWDOWN_MUOPS          = []  # filled in below
-    COMPLEX_MUOPS           = [Muop(f'Mx{i}') for i in range(complexI_run.num_muops)]
-    REAL_PORTS              = [Port(f'P{i}') for i in range(8)]
-    # REAL_PORTS              = [Port('P0'), Port('P1'), Port('P5'), Port('P6')]
-    SLOWDOWN_PORTS          = []  # filled in below
-    SPEEDUP_PORTS           = [Port(f'P{chr(ord("a") + i)}', type=Port_Type.SPEEDUP) for i in range(0)]
-    kernels_with_bottleneck = set()
+      if n == 'IMUL_GPR16i16_GPR16i16_IMMi8':
+        set_hardcoded_mapping([(0, 1, 5, 6), (1,)])
 
-    def id2port(port_id: int):
-      ps = [p for p in REAL_PORTS if p.name == f'P{port_id}']
-      assert len(ps) == 1, [port_id, ps]
-      return ps[0]
+  for run in list(muI_runs):
+    assert len(run.kernel) == 1
 
-    NAME_2_INST = {}
+    mpc = run.umpc.mean
 
-    delta_IM = {}
-    delta_MP = {}
-    delta_IP = {}
-    mu_KP    = {}
+    assert mpc > 0.1, run.name
 
-    # MPC_THROTTLE_MUOP = Muop(name='MPC', is_virtual=True)
-    # MPC_THROTTLE_PORT = Port(name='MPC', max_throughput=4, type=Port_Type.SLOWDOWN)
-    # SLOWDOWN_MUOPS += [MPC_THROTTLE_MUOP]
-    # SLOWDOWN_PORTS += [MPC_THROTTLE_PORT]
-    # delta_MP[MPC_THROTTLE_MUOP, MPC_THROTTLE_PORT] = True
+    n = ' '.join(run.kernel)
+    i = Inst(name=n, num_fused_muops=run.num_fused_muops, num_unfused_muops=run.num_unfused_muops)
 
-    def ports_used(run: Benchmark_Run_Summary) -> ty.FrozenSet[Port]:
-      return frozenset(id2port(p) for p in run.ports_used())
+    INSTS += [i]
+    NAME_2_INST[n] = i
 
-    print('COMPLEX INSTRUCTION:', complexI)
-    for run in [complexI_run]:
-      print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':',
-            f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', ':',
-            f'{" ".join(str(p) for p in sorted(ports_used(run)))}',)
-    print('SELECTED MUOP INSTRUCTIONS', len(muI_runs))
-    for run in sorted(muI_runs, key=lambda r: r.kernel):
-      print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':',
-            f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', ':',
-            f'{" ".join(str(p) for p in sorted(ports_used(run)))}',)
-    print('SELECTED COMBINATIONS:', len(combined_runs))
-    for run in sorted(combined_runs, key=lambda r: r.kernel):
-      print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(110), ':',
-            f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})',
-            f': {" ".join(str(p) for p in sorted(ports_used(run)))}')
+    run_ports = ports_used(run)
+    assert run_ports
 
-    for run in [complexI_run]:
-      assert len(run.kernel) == 1
+    if n == 'MOV_MEM64i64_GPR64i64':
+      assert run.num_unfused_muops == 2, run.kernel
 
-      num_muops = round(run.muops.mean / run.instructions.mean)
+      muops = [Muop(name=f'n/{i}') for i in range(run.num_unfused_muops)]
 
-      assert num_muops > 1, [run.kernel, num_muops]
+      REAL_MUOPS += muops
 
-      n = ' '.join(run.kernel)
-      i = Inst(name=n, num_muops=num_muops)
+      for m in muops:
+        delta_IM[i, m] = True
 
-      INSTS += [i]
-      NAME_2_INST[n] = i
+      delta_MP[muops[0], REAL_PORTS[4]] = True
+      delta_MP[muops[1], REAL_PORTS[2]] = True
+      delta_MP[muops[1], REAL_PORTS[3]] = True
+      delta_MP[muops[1], REAL_PORTS[7]] = True
+    else:
+      assert run.num_unfused_muops == 1, run.kernel
 
-      for m in COMPLEX_MUOPS:
-        delta_IM[i, m] = True
+      m = Muop(name=n)
 
-      # assert run.kernel[0] == 'BEXTR_GPR64i64_GPR64i64_GPR64i64', run.kernel
+      REAL_MUOPS += [m]
 
-      # delta_MP[COMPLEX_MUOPS[0], id2port(0)] = True
-      # delta_MP[COMPLEX_MUOPS[0], id2port(6)] = True
+      delta_IM[i, m] = True
 
-      # delta_MP[COMPLEX_MUOPS[1], id2port(1)] = True
-      # delta_MP[COMPLEX_MUOPS[1], id2port(5)] = True
+      for p in REAL_PORTS:
+        delta_MP[m, p] = (p in run_ports)
 
-      # for M in COMPLEX_MUOPS:
-      #   for P in REAL_PORTS:
-      #     delta_MP.setdefault((M, P), False)
+    for p in REAL_PORTS:
+      delta_IP[i, p] = (p in run_ports)
 
-      # for p in SPEEDUP_PORTS:
-      #   delta_MP[COMPLEX_MUOPS[0], p] = False
-      #   delta_MP[COMPLEX_MUOPS[1], p] = False
+    slowdown = (mpc < 0.95)
 
-      # if MEASURE_PORTS:
-      #   run_ports = ports_used(run)
+    if slowdown:
+      throttle = Port(name=f'slowdown {round(mpc, 2)}', max_throughput=mpc + 0.0, type=Port_Type.SLOWDOWN)
 
-      #   for p in REAL_PORTS:
-      #     delta_IP[i, p] = (p in run_ports)
+      THROTTLES.append(throttle)
 
-      # MPC throttle
-      # delta_IM[i, MPC_THROTTLE_MUOP] = True
+      delta_IT[i, throttle] = True
 
-    for run in list(muI_runs):
-      assert len(run.kernel) == 1
+  MUOPS     = REAL_MUOPS + COMPLEX_MUOPS
+  PORTS     = REAL_PORTS
+  THROTTLES = THROTTLES
+  INSTS.sort(key=lambda i: i.name)
+  MUOPS.sort(key=lambda m: m.name)
+  PORTS.sort(key=lambda p: p.name)
+  THROTTLES.sort(key=lambda p: p.name)
 
-      num_muops = round(run.muops.mean / run.instructions.mean)
-      mpc       = run.mpc.mean
+  for i in INSTS:
+    for m in MUOPS:
+      delta_IM.setdefault((i, m), False)
 
-      assert mpc > 0.1, run.name
+  for i in INSTS:
+    for t in THROTTLES:
+      delta_IT.setdefault((i, t), False)
 
-      assert num_muops == 1, run.kernel
+  for m in REAL_MUOPS:
+    for p in PORTS:
+      delta_MP.setdefault((m, p), False)
 
-      slowdown = (mpc < 0.95)
+  for run in sorted(set([complexI_run] + combined_runs), key=lambda r: r.name):
+    assert all(i in NAME_2_INST for i in run.kernel), run.kernel
 
-      n = ' '.join(run.kernel)
-      i = Inst(name=n, num_muops=num_muops)
-      m = Muop(name=n)
+    ipc    = run.ipc.mean
+    fmpc   = run.fmpc.mean
+    umpc   = run.umpc.mean
+    insts = [NAME_2_INST[i] for i in run.kernel]
 
-      INSTS      += [i]
-      REAL_MUOPS += [m]
-      NAME_2_INST[n] = i
+    K = Kernel(insts=insts, ipc=ipc, fmpc=fmpc, umpc=umpc)
+    KERNELS += [K]
 
-      delta_IM[i, m] = True
+    ## NOTE: there are benchmarks that don't saturate any port but have an uMPC > 1
+    ##       So far all I've seen contained at least three different instructions,
+    ##       one of which must be BEXTR and one must be BSF.
+    # if run.umpc.mean > 0.95:
+    #   kernels_with_bottleneck.add(K)
 
-      run_ports = ports_used(run)
+    for port, stat in run.port_muops.items():
+      usage = stat.p90 / run.cycles.mean
 
-      for p in REAL_PORTS:
-        delta_MP[m, p] = (p in run_ports)
+      if usage > 0.9:
+        kernels_with_bottleneck.add(K)
+        mu_KP[K, REAL_PORTS[port]] = 0.9
 
-      if slowdown:
-        slowdown_m = Muop(name='slowdown_' + n, is_virtual=True)
-        slowdown_p = Port(name=f'slowdown {round(mpc, 2)}', max_throughput=mpc + 0.1, type=Port_Type.SLOWDOWN)
+      if usage < 0.1:
+        mu_KP[K, REAL_PORTS[port]] = 0.0
 
-        SLOWDOWN_MUOPS.append(slowdown_m)
-        SLOWDOWN_PORTS.append(slowdown_p)
+      mu_KP[K, REAL_PORTS[port]] = usage
+      # print(f'mu_KP[{K}, {REAL_PORTS[port]}] = {usage}')
 
-        delta_IM[i, slowdown_m] = True
-        delta_MP[slowdown_m, slowdown_p] = True
+  KERNELS   = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(k))))
+  INSTS     = Domain(Inst, INSTS)
+  MUOPS     = Domain(Muop, MUOPS)
+  PORTS     = Domain(Port, PORTS)
+  THROTTLES = Domain(Port, THROTTLES)
 
-      # MPC throttle
-      # delta_IM[i, MPC_THROTTLE_MUOP] = True
+  # for M in MUOPS:
+  #   for P in [REAL_PORTS[2], REAL_PORTS[3], REAL_PORTS[4], REAL_PORTS[7]]:
+  #     delta_MP[M, P] = False
 
-    MUOPS = REAL_MUOPS + COMPLEX_MUOPS + SLOWDOWN_MUOPS
-    PORTS = REAL_PORTS + SLOWDOWN_PORTS + SPEEDUP_PORTS
-    INSTS.sort(key=lambda i: i.name)
-    MUOPS.sort(key=lambda m: m.name)
-    PORTS.sort(key=lambda p: p.name)
+  if verbose:
+    print(complexI + ': SOLVE WITH', len(KERNELS), 'KERNELS')
 
-    for m in SLOWDOWN_MUOPS:
-      for p in PORTS:
-        delta_MP.setdefault((m, p), False)
+  try:
+    cpu = CPU_Model()
+
+    for I, M in INSTS * MUOPS:
+      if delta_IM[I, M]:
+        cpu.add_im_edge(I, M)
+
+    for M, P in MUOPS * PORTS:
+      if delta_MP[M, P]:
+        cpu.add_mp_edge(M, P)
+
+    for I, T in INSTS * THROTTLES:
+      if delta_IT[I, T]:
+        cpu.add_it_edge(I, T)
+
+    # outputs = solve_for_delta(
+    #   KERNELS, INSTS, MUOPS, PORTS, THROTTLES,
+    #   const_delta_im          = delta_IM,
+    #   const_delta_mp          = delta_MP,
+    #   const_delta_it          = delta_IT,
+    #   const_delta_ip          = delta_IP,
+    #   allow_errors            = False,
+    #   max_error               = 0.1,
+    outputs = solve_for_throughput(
+      cpu, list(KERNELS),
+      # const_mu_KP             = mu_KP,
+      kernels_with_bottleneck = kernels_with_bottleneck,
+      saturation_margin       = 0.975,
+      min_throughput          = 0.05,
+      print_iis               = False,
+      verbose                 = verbose,
+    )
+
+    # print(complexI + ':', hardcoded_mapping, 'max_error', max([o.error for o in outputs]))
+    return outputs
+  except ILP_Error as e:
+    # print(complexI + ':', hardcoded_mapping, e, file=sys.stderr)
+    # return []
+    raise e
+
+
+def main_solve_for_complex_instructions(measurements: Benchmark_Run_Aggregator,
+                                        representatives: ty.List[str]):
+  muIs = representatives
 
-    for m in REAL_MUOPS:
-      for p in SLOWDOWN_PORTS:
-        delta_MP[m, p] = False
+  complexIs = [
+    # 'ADC_EAXi32_IMMi32',
+    # 'ADD_GPR64i64_MEM64i64',
+    'ADD_MEM64i64_GPR64i64',
+    # 'BSWAP_GPR64i64',
+    # 'BEXTR_GPR64i64_GPR64i64_GPR64i64',
+    # 'IMUL_GPR16i16_GPR16i16_IMMi8',
+    # 'LEA_GPR16i16_ADDR64i64',
+    # 'CVTTPD2DQ_VR128i32x4_VR128f64x2',
+    # 'PACKSSWB_VR128i16x8_MEM64i16x8_SSE2',
+    # 'PCMPGTB_VR128i8x16_MEM64i8x16_PENTIUMMMX',
+    # 'MOV_MEM64i64_GPR64i64',
+  ]
 
-    for run in sorted([complexI_run] + combined_runs, key=lambda r: r.name):
-      assert all(i in NAME_2_INST for i in run.kernel), run.kernel
+  complexIs = [c for c in complexIs if c not in muIs]
 
-      ipc   = run.ipc.p90
-      mpc   = run.mpc.p90
-      insts = [NAME_2_INST[i] for i in run.kernel]
+  log('SELECTED', len(complexIs), 'COMPLEX INSTRUCTIONS')
+
+  # print('!(' + '|'.join(I for I in muIs + complexIs) + ')')
+  # exit()
 
-      K = Kernel(mpc=mpc, ipc=ipc, insts=insts)
-      KERNELS += [K]
+  all_outputs = []
 
-      if len(run.kernel) == 1 and run.kernel[0] in complexIs:
-        KERNELS += [Kernel(mpc=mpc, ipc=ipc, insts=insts)]
+  class Candidate_Decompositions(ty.NamedTuple):
+    error: float
+    decompositions: ty.List[str]
 
-      if len(run.kernel) > 1 and run.mpc.mean > 0.95:
-        kernels_with_bottleneck.add(K)
+  decompositions: ty.Dict[str, Candidate_Decompositions] = {}
 
-      mu = 0
+  results = []
 
-      for port, stat in run.port_muops.items():
-        # if stat.p90 / run.cycles.mean > 0.9:
-        #   kernels_with_bottleneck.add(K)
+  def summarize_outputs(complexI: str, decomposition: str, outputs):
+    if decomposition:
+      print((complexI + ':').ljust(100), decomposition)
+    print('ERRORS:        ', str(len([o for o in outputs if o.has_error])) + '/' + str(len(outputs)))
 
-        mu += (stat.mean / run.muops.mean) * run.mpc.mean
+    def show(name, key):
+      x = max(outputs, key=key, default='-')
+      print(name, key(x), f'(sum={sum([key(o) for o in outputs])} - {x.merged_muop_str()} - {x.kernel})')
 
-        if (stat.mean / run.muops.mean) < 0.05:
-          # mu_KP[K, id2port(port)] = 0
-          pass
-        else:
-          mu_KP[K, id2port(port)] = (stat.mean / run.muops.mean) * run.mpc.mean
+    show('IPC+MPC ERROR: ', key=lambda o: o.error)
+    show('MAX_IPC_ERROR: ', key=lambda o: o.ipc_error)
+    show('MAX_fMPC_ERROR:', key=lambda o: o.fmpc_error)
+    show('MAX_uMPC_ERROR:', key=lambda o: o.umpc_error)
+    print()
+    print()
 
-      if run.port_muops:
-        ## MOVAPD et al :/
-        # assert abs(mu - run.mpc.mean) < 0.05, f'{" ".join(run.kernel)} {mu:_.3f} {run.mpc.mean:_.3f}'
-        pass
+  for complexI in complexIs:
+    if 1:
+      run = measurements[complexI]
+
+      ports = run.ports_used()
+
+      mapping_sort_key = lambda m: [len(m), m]
+      sort_mapping     = lambda m: tuple(sorted(m, key=mapping_sort_key))
+
+      if 0:  # SLOOOOW
+        mappings = set()
+
+        for mapping in itertools.product(filter(None, powerset(ports)), repeat=run.num_unfused_muops):
+          mapping_ports = functools.reduce(operator.__or__, map(frozenset, mapping))
+
+          if mapping_ports == ports:
+            print('?', *mapping)
+            mappings.add(sort_mapping(mapping))
+      else:
+        port_sets = [
+          (0,),
+          (1,),
+          (4,),
+          (5,),
+          (6,),
+          (0, 1),
+          (0, 5),
+          (0, 6),
+          (1, 5),
+          (2, 3,),
+          (0, 1, 5),
+          (2, 3, 7),
+          (0, 1, 5, 6),
+        ]
+
+        mappings = itertools.product(port_sets, repeat=run.num_unfused_muops)
+        mappings = (m for m in mappings if functools.reduce(operator.__or__, map(frozenset, m)) == ports)
+        mappings = (sort_mapping(m) for m in mappings)
+        mappings = set(mappings)
+
+      mappings = sorted(mappings, key=sort_mapping)
+
+      for idx, mapping in enumerate(mappings, 1):
+        try:
+          outputs = solve_for_complex_instruction(complexI, muIs, measurements, mapping,
+                                                  verbose=False)
+
+          main_output = [o for o in outputs if len(o.kernel) == 1 and o.kernel[0].name == complexI]
+          assert len(main_output) == 1
+          main_output = main_output[0]
+
+          # new_err    = sum([o.error for o in outputs])
+          new_err    = max([o.umpc_error for o in outputs]), sum([o.umpc_error for o in outputs]), max([o.fmpc_error for o in outputs])
+          new_decomp = main_output.merged_muop_str()
+
+          if complexI in decompositions:
+            old_error, old_decomps = decompositions[complexI]
+
+            if new_err < old_error:
+              # print('  ', f'{decomp.decomposition} ({decomp.error}) < {old_decomp.decomposition} ({old_decomp.error})')
+
+              decompositions[complexI] = Candidate_Decompositions(new_err, [new_decomp])
+            elif new_err == old_error:
+              # print('  ', f'{decomp.decomposition} ({decomp.error}) == {old_decomp.decomposition} ({old_decomp.error})')
+              decompositions[complexI].decompositions.append(new_decomp)
+          else:
+            decompositions[complexI] = Candidate_Decompositions(new_err, [new_decomp])
+
+          result = [
+            # len(sum(mapping, ())),
+            sum([o.error for o in outputs]),
+            max([o.error for o in outputs]),
+            max([o.ipc_error for o in outputs]), max([o.fmpc_error for o in outputs]), max([o.umpc_error for o in outputs]),
+            sum([o.ipc_error for o in outputs]), sum([o.fmpc_error for o in outputs]), sum([o.umpc_error for o in outputs]),
+            len([o for o in outputs if o.has_error]),
+            mapping,
+            complexI,
+          ]
+          # print(mapping, *result[:-1])
+
+          results.append(result)
+          print(
+            f'{idx}/{len(mappings)}',
+            ' ', complexI,
+            # f'err={round(new_err, 7)}',
+            f'err={new_err}',
+            '-',
+            new_decomp,
+            '/',
+            ' | '.join(decompositions[complexI].decompositions)
+          )
+        except ILP_Error as e:
+          print('ILP_Error', e)
+          continue
+    else:
+      try:
+        outputs = solve_for_complex_instruction(complexI, muIs, measurements,
+                                                hardcoded_mapping=[(1, 5), (0, 1, 5, 6)], verbose=True)
+      except ILP_Error as e:
+        print('error:', e, file=sys.stderr)
+        continue
 
-    KERNELS = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(K))))
-    INSTS   = Domain(Inst, INSTS)
-    MUOPS   = Domain(Muop, MUOPS)
-    PORTS   = Domain(Port, PORTS)
+      main_output = [o for o in outputs if len(o.kernel) == 1 and o.kernel[0].name == complexI]
+      assert len(main_output) == 1
+      main_output = main_output[0]
 
-    print('SOLVE WITH', len(KERNELS), 'KERNELS')
+      summarize_outputs(complexI, main_output.merged_muop_str(), outputs)
 
-    try:
-      outputs = solve_for_delta(
-        KERNELS, INSTS, MUOPS, PORTS,
-        const_delta_im          = delta_IM,
-        const_delta_mp          = delta_MP,
-        # const_delta_ip          = delta_IP,
-        # const_mu_KP             = mu_KP,
-        kernels_with_bottleneck = kernels_with_bottleneck,
-        saturation_margin       = 0.05,
-        max_error               = 0.05,
-        min_throughput          = 0.05,
-        print_iis               = True,
-        # print_iis               = False,
-      )
-
-      # complex_outputs = [o for o in outputs if len(o.kernel) == 1 and o.kernel[0].num_muops != 1]
-      complex_outputs = [o for o in outputs if max(i.num_muops for i in o.kernel) > 1]
-
-      # print()
-      # print('FOUND DECOMPOSITIONS:')
-      # for o in complex_outputs:
-      #   print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':',
-      #         o.cpu.merged_muop_str(o.used_muops()))
-      # print()
-
-      ilp_outputs += complex_outputs
       all_outputs += outputs
-    except ILP_Error as e:
-      print(e, file=sys.stderr)
-  if not ilp_outputs:
-    exit()
 
-  for o in ilp_outputs:
-    print()
-    print('FOUND DECOMPOSITIONS:')
-    for o in complex_outputs:
-      print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':',
-            o.cpu.merged_muop_str(o.used_muops()))
-    print()
-  print('ERRORS:', str(len([o for o in all_outputs if o.has_error])) + '/' + str(len(outputs)))
+  # if results:
+  #   for result in sorted(results):
+  #     print(*result)
+  #   print()
+  #   return
+
+  ilp_outputs = [o for o in all_outputs if max(i.num_fused_muops for i in o.kernel) > 1]
+
+  print()
+  print('FOUND DECOMPOSITIONS:')
+  for i, o in decompositions.items():
+    print(' ', (i + ':').ljust(85), ' | '.join(o.decompositions), f'({o.error})')
+  print()
+  summarize_outputs(None, None, all_outputs)
 
   show_outputs(ilp_outputs, all_outputs)
 
+  die = 0
+
+  def assert_decompositions(inst, want_decomp):
+    nonlocal die, decompositions
+
+    if inst in decompositions:
+      have_decomps = decompositions[inst].decompositions
+
+      if [want_decomp] == have_decomps:
+        # YAY!
+        pass
+      elif want_decomp in have_decomps:
+        print('Non-unique decomposition:', inst.ljust(60), f'WANT: {want_decomp:15} HAVE: {" | ".join(have_decomps)}')
+        die = 1
+      else:
+        print('Invalid decomposition:', inst.ljust(60), f'WANT: {want_decomp:15} HAVE: {" | ".join(have_decomps)}')
+        die = 1
+
+  assert_decompositions('IMUL_GPR16i16_GPR16i16_IMMi8',     '1P1 + 1P0156')
+  assert_decompositions('BEXTR_GPR64i64_GPR64i64_GPR64i64', '1P06 + 1P15')
+  assert_decompositions('LEA_GPR16i16_ADDR64i64',           '1P15 + 1P0156')
+  exit(die)
+
 
 def show_outputs(ilp_outputs, all_outputs):
   ilp_outputs.sort(key=lambda ilp: tuple(ilp.kernel))
@@ -550,7 +666,7 @@ def show_outputs(ilp_outputs, all_outputs):
   import io
 
   C = R = math.ceil(math.sqrt(len(ilp_outputs)))
-  while C * (R - 1) >= len(ilp_outputs):
+  while R > 1 and C * (R - 1) >= len(ilp_outputs):
     R -= 1
 
   fig = plt.figure()
@@ -563,15 +679,24 @@ def show_outputs(ilp_outputs, all_outputs):
   # print()
 
   dot = pygraphviz.AGraph(direct=True)
-  for o in sorted(all_outputs, key=lambda o: [len(o.kernel), str(o.kernel)]):
-    if not o.has_error:
+
+  for o in sorted(all_outputs, key=lambda o: [o.error, len(o.kernel), str(o.kernel)]):
+    if not any(i.num_unfused_muops > 1 for i in o.kernel):
       continue
-    # if not any('BEXTR' in i.name for i in o.kernel):
+
+    # if len(o.kernel) != 2:
+    #   print('skip', o.kernel)
     #   continue
-    # if len(o.kernel) > 1 and not any('INSERT' in i.name or 'AES' in i.name for i in o.kernel):
+    # if not any('BEXTR' in i.name for i in o.kernel):
     #   continue
-    # if len(o.kernel) != 1:
+    # if not any('ROL' in i.name for i in o.kernel):
     #   continue
+
+    if not o.has_error:
+      # if len(o.kernel) != 1:
+      #   continue
+      pass
+      # continue
     o.add_to_dot(dot)
   dot.layout(prog='dot')
   dot.draw(f'test.dot')
@@ -640,6 +765,86 @@ def show_outputs(ilp_outputs, all_outputs):
     pass
 
 
+def powerset(iterable: ty.Iterable['T']) -> ty.Iterable[ty.Tuple['T', ...]]:
+    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
+    s = list(iterable)
+    assert s
+
+    it = itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1))
+    return it
+
+
+def log(*msg):
+  print(terminal.Info_Line.timestamp(), '-', *msg, file=sys.stderr)
+
+
 if __name__ == '__main__':
-  main_solve_for_complex_instructions()
-  # main_solve_for_muI_mapping()
+  parser = argparse.ArgumentParser()
+
+  parser.set_defaults(command=None)
+
+  subps = parser.add_subparsers()
+
+  ##############################################################################
+  subp = subps.add_parser('complexI-mapping')
+  subp.set_defaults(command=main_solve_for_complex_instructions)
+
+  subp.add_argument('MEASUREMENTS_FILE', type=pathlib.Path)
+  subp.add_argument('EQUIVALENCE_CLASSES', type=pathlib.Path)
+
+  ##############################################################################
+  subp = subps.add_parser('muI-mapping')
+  subp.set_defaults(command=main_solve_for_muI_mapping)
+
+  subp.add_argument('MEASUREMENTS_FILE', type=pathlib.Path)
+  subp.add_argument('EQUIVALENCE_CLASSES', type=pathlib.Path)
+
+  ##############################################################################
+
+  args = parser.parse_args()
+
+  if not args.command:
+    parser.error('No command specified')
+
+  measurements = Benchmark_Run_Aggregator(max_stddev=math.inf, min_samples=0)
+
+  log('READ MEASUREMENTS FROM', args.MEASUREMENTS_FILE)
+
+  try:
+    measurements.read_from_file(args.MEASUREMENTS_FILE)
+  except json.JSONDecodeError as e:
+    print('error: malformed file', repr(args.MEASUREMENTS_FILE) + ':', e, file=sys.stderr)
+
+  log('READ', len(measurements), 'MEASUREMENTS')
+  log('READ EQUIVALENCE CLASSES FROM', args.EQUIVALENCE_CLASSES)
+
+  with open(args.EQUIVALENCE_CLASSES) as fd:
+    eq_json    = json.load(fd)
+    eq_classes = [eq['insts'] for eq in eq_json]
+
+    # muIs = [random.choice(eq) for eq in eq_classes]
+    muIs = [eq[0] for eq in eq_classes]
+
+    for muI in list(muIs):
+      run = measurements.get(muI)
+      if not run:
+        muIs.remove(muI)
+        continue
+
+      ## exclude movups/movaps
+      if run.umpc.mean >= len(run.ports_used()):
+        muIs.remove(muI)
+        continue
+
+      ## exclude div/sqrt/...
+      if run.umpc.mean <= 0.5:
+        muIs.remove(muI)
+        continue
+
+    # muIs = random.sample(muIs, 15)
+
+  log('READ', len(muIs), 'SIMPLE INSTRUCTIONS/EQUIVALENCE CLASSES')
+
+  ##############################################################################
+
+  args.command(measurements, muIs)
-- 
GitLab