From 71e255517d50b15b76b4eaf2587bf5fc34c4450b Mon Sep 17 00:00:00 2001 From: Fabian Gruber <fabian.gruber@inria.fr> Date: Wed, 31 Jul 2019 14:49:49 +0200 Subject: [PATCH] Wrote scripts to solve port-mapping ILP problem. cherry-pick of several intermediate commits. 00685293ca106d91e191f0be18 from Thu Jul 4 11:00:48 2019 +0200. 9386c13adba5de1584eff6f18f from Thu May 16 08:09:25 2019 +0200 3813e316c21427a3066f23509e from Fri May 10 16:56:23 2019 +0200 50ee0275a794109eeda6592421 from Sun May 5 10:19:47 2019 +0200 0b54af769395b07dc12af24ce3 from Sat May 4 20:04:48 2019 +0200 d07f97d17f1a96a09ff5dbe502 from Sat May 4 15:30:37 2019 +0200 e924c740bf634cf052b89f9125 from Sat May 4 15:21:27 2019 +0200 e3138dad5270c22c15663fbb45 from Sat May 4 14:00:44 2019 +0200 a58a53c132a23c8d2af1c69d72 from Fri May 3 16:42:26 2019 +0200 ILP model: save latest version. Many a long night was spent hacking on this. Deadlines were tight. Midnight oil was burnt. So I didn't commit intermediate stages. --- tools/find-eqivalence-classes | 343 +++++++----- tools/solve-ilp | 965 +++++++++++++++++++++------------- 2 files changed, 785 insertions(+), 523 deletions(-) diff --git a/tools/find-eqivalence-classes b/tools/find-eqivalence-classes index 4ec8be0..ce55ae7 100755 --- a/tools/find-eqivalence-classes +++ b/tools/find-eqivalence-classes @@ -33,6 +33,7 @@ from pipedream.utils import chunks, nub import pipedream.asm.ir as ir import pipedream.benchmark.common as common import pipedream.utils.yaml as yaml +import pipedream.utils.json as json BLACKLISTED_INSTRUCTIONS = frozenset([ # 'OR_GPR16i16_IMMi16', @@ -77,7 +78,7 @@ def main(): # inputs subp.add_argument('--measurements', - dest='measurements_file', + dest='measurements_input', required=True, type=pathlib.Path, help='File to read benchmark results from',) @@ -93,7 +94,11 @@ def main(): # outputs subp.add_argument('-eo', '--eq-class-output', type=pathlib.Path, default=pathlib.Path('/dev/stdout')) - subp.add_argument('-mo', 
'--measurement-output', type=pathlib.Path, default=None) + subp.add_argument('-mo', '--measurements-output', + dest='measurements_output', + required=False, + type=pathlib.Path, + help='File to write benchmark results to',) subp.add_argument('--yaml-log', default=None, type=pathlib.Path) subp.add_argument('--json-log', default=None, type=pathlib.Path) @@ -147,17 +152,17 @@ def main(): # inputs subp.add_argument('--measurements', - dest='measurements_file', + dest='measurements_input', required=True, type=pathlib.Path, - help='File to read benchmark results from and write them back to',) + help='File to read benchmark results from',) subp.add_argument('--eq-classes', dest='eq_classes_file', required=True, type=pathlib.Path, help='File to read equivalence classes from',) subp.add_argument('--tag', - default=None, + required=True, type=str, help='Tag selector to select complex instructions',) subp.add_argument('--min-muops', @@ -171,6 +176,13 @@ def main(): with a IPC/MPC stddev higher than this are ignored. 
""",) + ## outputs + subp.add_argument('-mo', '--measurements-output', + dest='measurements_output', + required=False, + type=pathlib.Path, + help='File to write benchmark results to',) + ############################################################################## args = parser.parse_args() @@ -180,11 +192,15 @@ def main(): if command is None: parser.error('must supply a command') + if args.measurements_output is None: + args.measurements_output = args.measurements_input + command(**vars(args)) def generate_simple_ilp_input(*, - measurements_file: pathlib.Path, + measurements_input: pathlib.Path, + measurements_output: pathlib.Path, eq_classes_file: pathlib.Path, tag: ty.Optional[str], min_muops: int, @@ -193,21 +209,23 @@ def generate_simple_ilp_input(*, arch = ir.Architecture.for_name('x86') instruction_set = arch.instruction_set() - measurements = Benchmark_Run_Summary_Aggregator( + measurements = Benchmark_Run_Aggregator( max_stddev=max_stddev, # *0.9 to account for dropping the bottom & top 5 percent. 
min_samples=MIN_NUM_SAMPLES * 0.9, ) + # touch measurements file + open(measurements_output, 'a').close() + try: - read_measurements_from_files(measurements, [measurements_file]) + read_measurements_from_files(measurements, [measurements_input]) eq_classes = read_equivalence_classes_from_file(instruction_set, eq_classes_file) except FileNotFoundError as e: print('error:', e, file=sys.stderr) exit(1) - complex_insts = common.glob_instruction_tags(arch, [tag]) - # representatives = [random.choice(eq) for eq in eq_classes] + complex_insts = list(nub(common.glob_instruction_tags(arch, [tag]))) representatives = list(nub(eq[0] for eq in eq_classes)) if not complex_insts: @@ -234,108 +252,120 @@ def generate_simple_ilp_input(*, ordering = Ordering.ALPHABETIC, ) - EQ.make_measurements(kernels, measurements, measure_ports, output=measurements_file) + kernels = [k for k in kernels if tuple(i.name for i in k) not in measurements] - log('MEASURE: muI') - make_measurements(nub((i,) for i in complex_insts + representatives), measure_ports=True) + CHUNK_SIZE = 2000 - assert complex_insts[0].name in measurements + for chunk in chunks(kernels, CHUNK_SIZE): + try: + EQ.make_measurements(chunk, measurements, measure_ports=measure_ports, output=measurements_output) + finally: + os.sched_yield() - if min_muops: - for i in list(complex_insts): - m = measurements[i.name] + log('MEASURE: complexI', f'({len(complex_insts)})') + make_measurements([(i,) for i in complex_insts], measure_ports=True) - num_muops = round(m.muops.mean / m.instructions.mean) + complex_insts = [i for i in complex_insts if measurements[i.name].num_unfused_muops > 1] - if num_muops < min_muops: - complex_insts.remove(i) + if not complex_insts: + log('no real complexI found') + return - kernels = list(combination_kernels(complex_insts, representatives)) - log('MEASURE: muI x complexI', f'({len(kernels)})') - for chunk in chunks(kernels, 1000): - make_measurements(chunk, measure_ports=False) + log('MEASURE: muI ', 
f'({len(representatives)})') + make_measurements([(i,) for i in representatives], measure_ports=True) - del kernels + def combinations_of_instructions_using_only_ports(ports: ty.FrozenSet['Port']) -> ty.Iterable[ty.Tuple[ir.Instruction, + ...]]: + port_sets = set(frozenset(ps) for ps in powerset(ports)) + port_sets.remove(frozenset()) - ## higher order kernels + assert frozenset([6]) in port_sets - kernels = [] + assert len([i for i in representatives if i.name == 'JMP_0']) - if False: # NEW WAY, TODO IMPLEMENT - for complexI in complex_insts: - mCI = measurements[complexI.name] + combinations = collections.defaultdict(list) - if MEASURE_PORTS: - ## TODO: read ports from args or partial model - ports = mCI.ports_used() - else: - interferes_with = set() + # FIXME: we really want a set cover for every possible subset of ports ... but that is expensive + for N in range(1, 8): + for kernel in itertools.combinations(representatives, N): + kernel_ports = frozenset() - for muI in representatives: - mMUI = measurements[muI.name] - mCI_MUI = measurements[complexI.name, muI.name] + for inst in kernel: + run = measurements[inst.name] - # no interference iff MPC(A) + MPC(B) == MPC(A B) - if abs((mc.mpc.mean + mr.mpc.mean) - mcr.mpc.mean) <= 0.1: - continue + kernel_ports |= run.ports_used() - interferes_with.add(muI) + # if kernel_ports in combinations: + # kernel = min(kernel, combinations[kernel_ports], key=len) - ports = set() + # combinations[kernel_ports] = tuple(sorted(kernel, key=lambda i: i.name)) - for muI in interferes_with: - muI_ports = ... 
# TODO: read partial model + combinations[kernel_ports].append(tuple(sorted(kernel, key=lambda i: i.name))) - ports |= muI_ports + # if not port_sets: + # break - for port_set in powerset(ports): - kernel = minimal_combination_of_instructions_using_only_port_set(port_set) + if False: + print('COVERED ' + str(ports) + ':') + for port_set in sorted(combinations, key=lambda ps: [len(ps), ps]): + for kernel in combinations[port_set]: + print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in kernel), *sorted(port_set)) + print('UNCOVERED ' + str(ports) + ':') + for port_set in sorted(port_sets, key=lambda ps: [len(ps), ps]): + if port_set not in combinations: + print(' ', *sorted(port_set)) + print() + exit() - kernels.append(kernel) - kernels.append((complexI,) + kernel) - else: - interferes_with = collections.defaultdict(set) + # print('MAX', max(map(len, combinations.values()))) - for c in complex_insts: - for r in representatives: - if c is r: - continue - mc = measurements[c.name] - mr = measurements[r.name] + out = [] - if mc.ports_used() and mr.ports_used(): - if not (mc.ports_used() & mr.ports_used()): - # does not use same ports. 
no interference - continue - else: - mcr = measurements[c.name, r.name] + for port_set in sorted(combinations, key=lambda ps: [len(ps), ps]): + kernels = combinations[port_set] + kernels = sorted(kernels, key=len)[:3] - # no interference iff MPC(A) + MPC(B) == MPC(A B) - if abs((mc.mpc.mean + mr.mpc.mean) - mcr.mpc.mean) <= 0.1: - continue + out += kernels - interferes_with[c].add(r) + return sorted(out, key=len) - for c, interferes in interferes_with.items(): - mc = measurements[c.name] + assert complex_insts[0].name in measurements - num_muops = round(mc.muops.mean / mc.instructions.mean) + if min_muops: + for i in list(complex_insts): + m = measurements[i.name] - for combi in combination_kernels([c], interferes, interferes): - kernels.append(combi) + num_muops = round(m.unfused_muops.mean / m.instructions.mean) - log('MEASURE: muI x muI x complexI', f'({len(kernels)})') + if num_muops < min_muops: + complex_insts.remove(i) + + ## higher order kernels + + kernels = [] - for chunk in chunks(kernels, 1000): - make_measurements(chunk, measure_ports=False) + for complexI in complex_insts: + for combi in combination_kernels([complexI], representatives): + kernels.append(combi) + + log('MEASURE: muI x complexI', f'({len(kernels)})') + make_measurements(kernels, measure_ports=True) + + kernels = [] + for complexI in complex_insts: + for combi in combination_kernels([complexI], representatives, representatives): + kernels.append(combi) + + log('MEASURE: muI x muI x complexI', f'({len(kernels)})') + make_measurements(kernels, measure_ports=True) def find_equivalence_classes(*, - measurements_file: ty.Optional[pathlib.Path], + measurements_input: ty.Optional[pathlib.Path], eq_classes_file: ty.Optional[pathlib.Path], tag: ty.Optional[str], eq_class_output: ty.Optional[pathlib.Path], - measurement_output: ty.Optional[pathlib.Path], + measurements_output: ty.Optional[pathlib.Path], yaml_log: ty.Optional[pathlib.Path], json_log: ty.Optional[pathlib.Path], num_representatives: 
int, @@ -360,13 +390,13 @@ def find_equivalence_classes(*, common.set_process_name('pipedream-equivalence-classes') common.set_scheduler_params() - measurements = Benchmark_Run_Summary_Aggregator( + measurements = Benchmark_Run_Aggregator( max_stddev=max_stddev, # *0.9 to account for dropping the bottom & top 5 percent. min_samples=MIN_NUM_SAMPLES * 0.9, ) - read_measurements_from_files(measurements, [measurements_file]) + read_measurements_from_files(measurements, [measurements_input]) ############################################################################## ## build initial input equivalence classes @@ -407,6 +437,9 @@ def find_equivalence_classes(*, if inst.name in BLACKLISTED_INSTRUCTIONS: return True + if inst.name == 'JMP_0': + return False + # forbid instructions with a read/write to a fixed register (forces data dependencies) for op in inst.operands: if not isinstance(op, ir.Register_Operand): @@ -424,7 +457,7 @@ def find_equivalence_classes(*, EQ.make_measurements([(i,) for i in all_insts], measurements, - output=measurement_output,) + output=measurements_output,) muI_grouped_by_prefix = collections.defaultdict(list) @@ -435,7 +468,7 @@ def find_equivalence_classes(*, run = measurements[i.name] - if run.num_muops != 1: + if run.num_unfused_muops != 1: continue muI.append(i) @@ -456,7 +489,7 @@ def find_equivalence_classes(*, measurements = measurements, equivalence_classes = prefix_eq_classes, eq_class_output = None, - measurement_output = None, + measurements_output = None, yaml_log = yaml_log, json_log = json_log, ) @@ -470,7 +503,7 @@ def find_equivalence_classes(*, measurements = measurements, equivalence_classes = equivalence_classes, eq_class_output = eq_class_output, - measurement_output = measurement_output, + measurements_output = measurements_output, yaml_log = yaml_log, json_log = json_log, ) @@ -478,10 +511,10 @@ def find_equivalence_classes(*, def _find_equivalence_classes(*, EQ: 'Equivalence_Class_Finder', - measurements: 
Benchmark_Run_Summary_Aggregator, + measurements: Benchmark_Run_Aggregator, equivalence_classes: ty.List['Eq_Class'], eq_class_output: ty.Optional[pathlib.Path], - measurement_output: ty.Optional[pathlib.Path], + measurements_output: ty.Optional[pathlib.Path], yaml_log: ty.Optional[pathlib.Path], json_log: ty.Optional[pathlib.Path],) -> ty.List['Eq_Class']: @@ -519,7 +552,7 @@ def _find_equivalence_classes(*, ############################################################################## ## write output - if eq_class_output or measurement_output: + if eq_class_output or measurements_output: log('WRITE RESULTS') if eq_class_output: @@ -534,6 +567,9 @@ def _find_equivalence_classes(*, print(',', file=fd) first = False print(' ', '{', file=fd) + print(' ', ' ', '"avg-ipc": ' + str(numpy.mean([measurements[i].ipc.mean for i in insts])) + ',', file=fd), + print(' ', ' ', '"avg-fused-mpc": ' + str(numpy.mean([measurements[i].fmpc.mean for i in insts])) + ',', file=fd), + print(' ', ' ', '"avg-unfused-mpc": ' + str(numpy.mean([measurements[i].umpc.mean for i in insts])) + ',', file=fd), print(' ', ' ', '"insts": [' + ', '.join('"' + i + '"' for i in insts) + ']', file=fd) print(' ', '}', end = '', file=fd) print(file=fd) @@ -542,8 +578,8 @@ def _find_equivalence_classes(*, except argparse.ArgumentTypeError as e: print(e, file=sys.stderr) - if measurement_output is not None: - write_measurements(measurements, measurement_output) + if measurements_output is not None: + write_measurements(measurements, measurements_output) return equivalence_classes @@ -595,6 +631,8 @@ def split_equivalence_classes(EQ, equivalence_classes: ty.List['Eq_Class']) -> ty.Tuple[ty.List['Eq_Class'], bool]: assert type(equivalence_classes) is list + equivalence_classes = list(equivalence_classes) + changed: bool = False log(' SPLIT', len(equivalence_classes), 'CLASS(ES)') @@ -602,8 +640,8 @@ def split_equivalence_classes(EQ, for eq in list(equivalence_classes): # log(' SPLIT', eq) - # reps = 
EQ._select_representatives(eq) - reps = eq.random_sample(EQ.num_representatives) + reps = EQ._select_representatives(eq) + # reps = eq.random_sample(EQ.num_representatives) EQ.make_measurements(((i,) for i in reps), measurements) EQ.make_measurements(combination_kernels(reps, repeat=2), measurements, measure_ports=False) @@ -781,11 +819,14 @@ class Equivalence_Class_Finder: n_random: int if self.random_representatives: - n_alphabetic = 1 - n_random = max(0, self.num_representatives - 1) + n_alphabetic = self.num_representatives // 4 + n_random = self.num_representatives - n_alphabetic else: - n_alphabetic = max(0, self.num_representatives - 1) - n_random = 1 + n_random = self.num_representatives // 4 + n_alphabetic = self.num_representatives - n_random + + # n_random = 0 + # n_alphabetic = self.num_representatives - n_random assert n_alphabetic + n_random == self.num_representatives @@ -798,14 +839,14 @@ class Equivalence_Class_Finder: return reps @staticmethod - def eq_class_ipc_and_mpc(eq_clss: Eq_Class, measurements: Benchmark_Run_Summary_Aggregator): + def eq_class_ipc_and_mpc(eq_clss: Eq_Class, measurements: Benchmark_Run_Aggregator): insts = [(i.name,) for i in eq_clss] ipc = numpy.mean([measurements[i].ipc.mean for i in insts]) - mpc = numpy.mean([measurements[i].mpc.mean for i in insts]) + mpc = numpy.mean([measurements[i].umpc.mean for i in insts]) return ipc, mpc @classmethod - def log_eq_class(clss, eq_clss: Eq_Class, measurements: Benchmark_Run_Summary_Aggregator, indent: int = 2): + def log_eq_class(clss, eq_clss: Eq_Class, measurements: Benchmark_Run_Aggregator, indent: int = 2): insts = sorted(i.name for i in eq_clss) ipc, mpc = clss.eq_class_ipc_and_mpc(eq_clss, measurements) log(' ' * indent, eq_clss, @@ -815,7 +856,7 @@ class Equivalence_Class_Finder: def make_measurements(self, kernels: ty.Iterable[ty.Tuple[ir.Instruction, ...]], - measurements: Benchmark_Run_Summary_Aggregator, + measurements: Benchmark_Run_Aggregator, *, measure_ports: bool = 
None, force: bool = False, @@ -846,7 +887,7 @@ class Equivalence_Class_Finder: return False extra_counters = [ - # 'RESOURCE_STALLS', + 'RESOURCE_STALLS', ] if measure_ports: @@ -885,29 +926,30 @@ class Equivalence_Class_Finder: tmp_dir = str(pathlib.Path.cwd() / 'tmp'), debug = False,): try: + run.drop_details() + if self.yaml_log is not None: self.yaml_log.write('---\n') yaml.dump(run, self.yaml_log) self.yaml_log.write('...\n') if self.json_log is not None: - print(Benchmark_Run_Summary.from_benchmark_run(run).to_json(), file=self.json_log) + print(run.to_json(), file=self.json_log) - name = run.benchmark.name - summary = Benchmark_Run_Summary.from_benchmark_run(run) - err = test_equivalence(name, name, summary, summary) + name = run.benchmark.name + err = test_equivalence(name, name, run, run) if err: print(f'error: IPC({name}) != IPC({name})', ':', err, file=sys.stderr) exit(1) - added = measurements.add_measurement(summary) + added = measurements.add_measurement(run) if NUM_SAMPLES >= MAX_NUM_SAMPLES or added: key = tuple(run.benchmark.instructions) unknown.remove(key) if not added: - measurements.force_add(summary) + measurements.force_add(run) finally: if self.yaml_log is not None: self.yaml_log.flush() @@ -917,20 +959,17 @@ class Equivalence_Class_Finder: NUM_SAMPLES += NUM_SAMPLES_STEP finally: if output: + os.sched_yield() write_measurements(measurements, output) return True -def read_measurements_from_files(measurements: Benchmark_Run_Summary_Aggregator, +def read_measurements_from_files(measurements: Benchmark_Run_Aggregator, files: ty.Sequence[pathlib.Path]): for file in files: log('READING MEASUREMENTS FROM', shlex.quote(str(file))) - with open(file) as fd: - for line in fd: - run = Benchmark_Run_Summary.from_json(line) - - measurements.add_measurement(run) + measurements.read_from_file(file) log('FOUND', len(measurements), 'measurement(s)') @@ -938,7 +977,7 @@ def read_measurements_from_files(measurements: Benchmark_Run_Summary_Aggregator, def 
read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file: str): try: with argparse.FileType('r')(file) as fd: - json_eq_classes = json.load(fd) + json_eq_classes = json.load(fd, allow_comments=True) equivalence_classes = [] all_insts = [] @@ -964,7 +1003,7 @@ def read_equivalence_classes_from_file(instruction_set: ir.Instruction_Set, file exit(1) -def write_measurements(measurements: Benchmark_Run_Summary_Aggregator, file: pathlib.Path): +def write_measurements(measurements: Benchmark_Run_Aggregator, file: pathlib.Path): assert isinstance(file, pathlib.Path) log('WRITE MEASUREMENTS TO', shlex.quote(str(file))) @@ -972,21 +1011,17 @@ def write_measurements(measurements: Benchmark_Run_Summary_Aggregator, file: pat written = 0 if str(file) in ['/dev/stdout', '/dev/stderr']: - with open(file, 'w') as fd: - for m in measurements.all_measurements(): - print(m.to_json(), file=fd) - written += 1 + written += measurements.write_to_file(file, only_best=False) else: with tempfile.NamedTemporaryFile(mode='w', prefix='eq-class-measurements.', - suffix='.jsonl', + suffix=file.suffix, delete=False,) as fd: - for m in measurements.all_measurements(): - print(m.to_json(), file=fd) - written += 1 - os.makedirs(file.parent, exist_ok=True) + written += measurements.write_to_file(pathlib.Path(fd.name), only_best=False) + os.makedirs(file.parent, exist_ok=True) + if file.exists(): shutil.move(file, file.with_suffix(file.suffix + '.bkp')) - shutil.move(fd.name, file) + shutil.move(fd.name, file) log('WROTE', written, 'measurement(s)') @@ -995,12 +1030,19 @@ def combination_kernels(*iterables: ty.Sequence[ir.Instruction], repeat: int = 1 ## AB is not necessarily the same as AABB (x86 is weird) ## so we use product instead of combinations for combi in itertools.product(*iterables, repeat=repeat): - for N in range(1, 6): - kernel = sum(((i,) * N for i in combi), ()) - + for kernel in repetition_kernels(combi): yield kernel +def repetition_kernels(kernel: 
ty.Tuple[ir.Instruction, ...], + max_repeat: int = 5) -> ty.Iterable[ty.Tuple[ir.Instruction, ...]]: + assert max_repeat >= 1 + + for N in range(1, max_repeat + 1): + k = sum(((i,) * N for i in kernel), ()) + yield k + + class Percent: def __init__(self, numerator, denominator = None): if type(numerator) is Percent: @@ -1046,8 +1088,8 @@ Percent.FIVE = Percent(5, 100) def test_equivalence(A: ir.Instruction, B: ir.Instruction, - runA: Benchmark_Run_Summary, runB: Benchmark_Run_Summary, - runAB: Benchmark_Run_Summary = None, *, margin: Percent = Percent.FIVE) -> ty.Optional[str]: + runA: Benchmark_Run, runB: Benchmark_Run, + runAB: Benchmark_Run = None, *, margin: Percent = Percent.FIVE) -> ty.Optional[str]: """ Check if instruction A and B are equivalent. Returns an error message iff they are not equal, None otherwise. @@ -1069,17 +1111,25 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction, dev = round(stat.stddev, 3) return f'{mean}±{dev}' + assert runA.num_fused_muops == runB.num_fused_muops, f'num_fused_muops({A})={runA.num_fused_muops} ' \ + f'num_fused_muops({B})={runB.num_fused_muops}' + + assert runA.num_unfused_muops == runB.num_unfused_muops, f'num_unfused_muops({A})={runA.num_unfused_muops} ' \ + f'num_unfused_muops({B})={runB.num_unfused_muops}' + assert IPCs_are_equivalent(runA, runB, margin), f'IPC({A})={pstat(runA.ipc)} IPC({B})={pstat(runB.ipc)}' if runAB: assert IPCs_are_equivalent(runA, runAB, margin), f'IPC({A})={pstat(runA.ipc)} IPC({A} {B})={pstat(runAB.ipc)}' assert IPCs_are_equivalent(runB, runAB, margin), f'IPC({B})={pstat(runB.ipc)} IPC({A} {B})={pstat(runAB.ipc)}' - assert MPCs_are_equivalent(runA, runB, margin), f'MPC({A})={pstat(runA.mpc)} MPC({B})={pstat(runB.mpc)}' + assert MPCs_are_equivalent(runA, runB, margin), f'MPC({A})={pstat(runA.umpc)} MPC({B})={pstat(runB.umpc)}' if runAB: - assert MPCs_are_equivalent(runA, runAB, margin), f'MPC({A})={pstat(runA.mpc)} MPC({A} {B})={pstat(runAB.mpc)}' - assert MPCs_are_equivalent(runB, 
runAB, margin), f'MPC({B})={pstat(runB.mpc)} MPC({A} {B})={pstat(runAB.mpc)}' + assert MPCs_are_equivalent(runA, runAB, margin), f'MPC({A})={pstat(runA.umpc)} MPC({A} {B})={pstat(runAB.umpc)}' + assert MPCs_are_equivalent(runB, runAB, margin), f'MPC({B})={pstat(runB.umpc)} MPC({A} {B})={pstat(runAB.umpc)}' - if runA.port_muops and runB.port_muops: + USE_PORTS = False + + if USE_PORTS and runA.port_muops and runB.port_muops: def ports_used(run) -> ty.Set[int]: "return set of ports used by a benchmark run (not by how much each port is used)" @@ -1089,7 +1139,7 @@ def test_equivalence(A: ir.Instruction, B: ir.Instruction, ## a port that receives less than 5% of all muops is considered unused ## (i.e. that usage is some measurement noise, comes from another hyperthread, ...) ## TODO: investigate example ADC_GPR64i64_IMMi32 - if (stat.mean / run.muops.mean) >= 0.05: + if (stat.mean / run.unfused_muops.mean) >= 0.05: ports.add(port) return ports @@ -1125,6 +1175,12 @@ def round_float(value: float) -> fractions.Fraction: return rounded +def powerset(iterable: ty.Iterable['T']) -> ty.Iterable[ty.Tuple['T', ...]]: + "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)" + s = list(iterable) + return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1)) + + def ttest_1samp_from_stats(mean, std, var, nobs, popmean): """ Calculate the T-test for the mean of ONE group of scores from descriptive statistics. 
@@ -1191,29 +1247,30 @@ def tost_paired(y, x, low, upp, transform=None): P_VALUE = 0.05 -def IPCs_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, margin: Percent = Percent.FIVE): +def IPCs_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, margin: Percent = Percent.FIVE): assert 0 <= margin <= 1 - if round(a.ipc.p75, 2) == round(b.ipc.p75, 2): - return True + # if round(a.ipc.p75, 2) == round(b.ipc.p75, 2): + # return True margin = min(a.ipc.mean, b.ipc.mean) * float(margin) return means_are_equivalent(a.ipc, b.ipc, margin) -def MPCs_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, margin: Percent = Percent.FIVE): +def MPCs_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, margin: Percent = Percent.FIVE): assert 0 <= margin <= 1 - if round(a.mpc.p75, 2) == round(b.mpc.p75, 2): - return True + # if round(a.umpc.p75, 2) == round(b.umpc.p75, 2) and round(a.fmpc.p75, 2) == round(b.fmpc.p75, 2): + # return True - margin = min(a.mpc.mean, b.mpc.mean) * float(margin) + u_margin = min(a.umpc.mean, b.umpc.mean) * float(margin) + f_margin = min(a.fmpc.mean, b.fmpc.mean) * float(margin) - return means_are_equivalent(a.mpc, b.mpc, margin) + return means_are_equivalent(a.umpc, b.umpc, u_margin) and means_are_equivalent(a.fmpc, b.fmpc, f_margin) -def port_muops_are_equivalent(a: Benchmark_Run_Summary, b: Benchmark_Run_Summary, port: int): +def port_muops_are_equivalent(a: Benchmark_Run, b: Benchmark_Run, port: int): ## Muop per port counters aren't that precise and port usage fluctuates way more than ## cycles or total number of muops, so we are only looking for a very broad sense of equality here. ## The average benchmark runs 5_000_000 muops, so this works out to a margin of 200_000 muops. 
diff --git a/tools/solve-ilp b/tools/solve-ilp index e0c83f4..509ce8f 100644 --- a/tools/solve-ilp +++ b/tools/solve-ilp @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import collections @@ -5,45 +6,28 @@ import dataclasses import enum import functools import itertools -import json +import operator import pathlib import random import sys +import typing as ty -sys.path.append(str(pathlib.Path(__file__).parent / 'src')) +try: + import pipedream +except ImportError: + sys.path.append(str(pathlib.Path(__file__).parent.parent / 'src')) from pipedream.ilp import * -from pipedream.benchmark.types import Benchmark_Spec, Benchmark_Run_Summary, Benchmark_Run_Summary_Aggregator +from pipedream.benchmark.types import Benchmark_Spec, Benchmark_Run, Benchmark_Run_Aggregator +import pipedream.utils.json as json +import pipedream.utils.terminal as terminal MEASURE_PORTS = True -def main_solve_for_muI_mapping(num_ports: int = 8): - parser = argparse.ArgumentParser() - - parser.add_argument('FILE') - parser.add_argument('EQUIVALENCE_CLASSES') - - args = parser.parse_args() - - measurements = Benchmark_Run_Summary_Aggregator(max_stddev=math.inf, min_samples=0) - - for F in [args.FILE]: - with open(F) as fd: - try: - for line in fd: - run = Benchmark_Run_Summary.from_json(line) - - measurements.add_measurement(run) - except json.JSONDecodeError as e: - print('error: malformed file', repr(F) + ':', e, file=sys.stderr) - - with open(args.EQUIVALENCE_CLASSES) as fd: - eq_json = json.load(fd) - eq_classes = [eq['insts'] for eq in eq_json] - - # muIs = [random.choice(eq) for eq in eq_classes] - muIs = [eq[0] for eq in eq_classes] +def main_solve_for_muI_mapping(measurements: Benchmark_Run_Aggregator, + representatives: ty.List[str], num_ports: int = 8): + muIs = representatives muI_runs = [measurements[muI] for muI in muIs] pair_runs = [measurements[a, b] for a, b in itertools.combinations(muIs, 2)] @@ -51,23 +35,27 @@ def main_solve_for_muI_mapping(num_ports: int = 8): KERNELS = 
[] INSTS = [] REAL_MUOPS = [] # filled in below - SLOWDOWN_MUOPS = [] # filled in below REAL_PORTS = [Port(f'P{i}') for i in range(num_ports)] - SLOWDOWN_PORTS = [] # filled in below + REAL_PORTS = [Port(f'P{i}') for i in [0, 1, 2, 3, 5, 6]] + THROTTLES = [] # filled in below kernels_with_bottleneck = set() NAME_2_INST = {} delta_IM = {} delta_MP = {} + delta_IT = {} print('SELECTED MUOP INSTRUCTIONS', len(muI_runs)) + for M in muI_runs: + print(M.name) + for run in list(muI_runs): assert len(run.kernel) == 1 - num_muops = round(run.muops.mean / run.instructions.mean) - mpc = run.mpc.mean + num_muops = round(run.unfused_muops.mean / run.instructions.mean) + mpc = run.umpc.mean assert mpc > 0.1, run.name @@ -75,8 +63,8 @@ def main_solve_for_muI_mapping(num_ports: int = 8): slowdown = (mpc < 0.95) - n = ' '.join(run.kernel) - i = Inst(name=n, num_muops=num_muops) + n = run.name + i = Inst(name=n, num_fused_muops=run.num_fused_muops, num_unfused_muops=run.num_unfused_muops) m = Muop(name=n) INSTS += [i] @@ -86,455 +74,583 @@ def main_solve_for_muI_mapping(num_ports: int = 8): delta_IM[i, m] = True if slowdown: - slowdown_m = Muop(name='slowdown_' + n) - slowdown_p = Port(name=f'slowdown {round(mpc, 2)}', max_throughput=mpc + 0.1, type=Port_Type.SLOWDOWN) + throttle = Port(name=f'throttle {round(mpc, 2)}', max_throughput=mpc + 0.1, type=Port_Type.SLOWDOWN) - SLOWDOWN_MUOPS.append(slowdown_m) - SLOWDOWN_PORTS.append(slowdown_p) + THROTTLES.append(throttle) - delta_IM[i, slowdown_m] = True - delta_MP[slowdown_m, slowdown_p] = True + delta_IT[i, throttle] = True print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(75), ':', - f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', ':', + f'IPC({run.ipc.mean:5.3}) MPC({run.umpc.mean:5.3})', ':', '(SLOWDOWN)' if slowdown else '') - MUOPS = REAL_MUOPS + SLOWDOWN_MUOPS - PORTS = REAL_PORTS + SLOWDOWN_PORTS + SPEEDUP_PORTS + MUOPS = REAL_MUOPS + PORTS = REAL_PORTS + THROTTLES = THROTTLES INSTS.sort(key=lambda i: 
i.name) MUOPS.sort(key=lambda m: m.name) PORTS.sort(key=lambda p: p.name) + THROTTLES.sort(key=lambda p: p.name) - for m in SLOWDOWN_MUOPS: - for p in PORTS: - delta_MP.setdefault((m, p), False) + for run in sorted(muI_runs, key=lambda r: r.name) + sorted(pair_runs, key=lambda r: r.name): + # assert all(i in NAME_2_INST for i in run.kernel), run.kernel - for m in REAL_MUOPS: - for p in SLOWDOWN_PORTS: - delta_MP[m, p] = False - - for run in sorted(muI_runs) + sorted(pair_runs): - assert all(i in NAME_2_INST for i in run.kernel), run.kernel - - if run in pair_runs: - print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(75), ':', - f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', - f': {" ".join(str(p) for p in sorted(ports_used(run)))}') + if run in pair_runs: + print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(75), ':', + f'IPC({run.ipc.mean:5.3}) MPC({run.umpc.mean:5.3})', + f': {" ".join(str(p) for p in sorted(ports_used(run)))}') - ipc = run.ipc.p90 - mpc = run.mpc.p90 + ipc = run.ipc.p90 + umpc = run.umpc.p90 + fmpc = run.fmpc.p90 + try: insts = [NAME_2_INST[i] for i in run.kernel] + except KeyError: + print(NAME_2_INST) + raise - K = Kernel(mpc=mpc, ipc=ipc, insts=insts) - KERNELS += [K] - - if len(run.kernel) == 1 and run.kernel[0] in complexIs: - KERNELS += [Kernel(mpc=mpc, ipc=ipc, insts=insts)] + K = Kernel(insts=insts, ipc=ipc, fmpc=fmpc, umpc=umpc) + KERNELS += [K] - # FIXME: ??? how to do this without ports ??? - has_bottleneck = run.mpc < 0.95 + # FIXME: ??? how to do this without ports ??? 
+ # has_bottleneck = run.umpc.mean >= 0.95 + # assert has_bottleneck + has_bottleneck = True - if has_bottleneck: - kernels_with_bottleneck.add(K) + if has_bottleneck: + kernels_with_bottleneck.add(K) - KERNELS = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(K)))) - INSTS = Domain(Inst, INSTS) - MUOPS = Domain(Muop, MUOPS) - PORTS = Domain(Port, PORTS) + KERNELS = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(K)))) + INSTS = Domain(Inst, INSTS) + MUOPS = Domain(Muop, MUOPS) + PORTS = Domain(Port, PORTS) + if verbose: print('SOLVE WITH', len(KERNELS), 'KERNELS') - # print(*[I.name for I in INSTS]) - # print(*[repr(str(K)) for K in KERNELS]) + # print(*[I.name for I in INSTS]) + # print(*[repr(str(K)) for K in KERNELS]) - try: - outputs = solve_for_delta( - KERNELS, INSTS, MUOPS, PORTS, - const_delta_im = delta_IM, - const_delta_mp = delta_MP, - kernels_with_bottleneck = kernels_with_bottleneck, - saturation_margin = 0.05, - max_error = 0.06, - min_throughput = 0.05, - print_iis = True, - # print_iis = False, - ) - - muI_outputs = [o for o in outputs if len(o.kernel) == 1] - - print() - print('FOUND DECOMPOSITIONS:') - for o in muI_outputs: - print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':', - o.cpu.merged_muop_str(o.used_muops())) - print() - - ilp_outputs += muI_outputs - all_outputs += outputs - except ILP_Error as e: - print(e, file=sys.stderr) + try: + outputs = solve_for_delta( + KERNELS, INSTS, MUOPS, PORTS, + const_delta_im = delta_IM, + const_delta_mp = delta_MP, + kernels_with_bottleneck = kernels_with_bottleneck, + saturation_margin = 0.975, + max_error = 0.01, + min_throughput = 0.05, + # print_iis = True, + print_iis = False, + ) + + muI_outputs = [o for o in outputs if len(o.kernel) == 1] + + # print() + # print('FOUND DECOMPOSITIONS:') + # for o in muI_outputs: + # print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':', + # 
o.cpu.merged_muop_str(o.used_muops())) + # print() + + ilp_outputs += muI_outputs + all_outputs += outputs + except ILP_Error as e: + # print(e, file=sys.stderr) + raise if not ilp_outputs: exit() show_outputs(ilp_outputs, all_outputs) -def main_solve_for_complex_instructions(): - parser = argparse.ArgumentParser() +def solve_for_complex_instruction(complexI, muIs, measurements, + hardcoded_mapping: ty.Collection[ty.Iterable[int]] = None, *, + verbose: bool): + assert complexI not in muIs - parser.add_argument('FILE') - parser.add_argument('EQUIVALENCE_CLASSES') + complexI_run = measurements[complexI] + muI_runs = [measurements[muI] for muI in muIs] + combined_runs = [] - args = parser.parse_args() + muI_set = frozenset(muIs) + inst_set = frozenset([complexI]) | muI_set - measurements = Benchmark_Run_Summary_Aggregator(max_stddev=math.inf, min_samples=0) + for run in measurements: + run_insts = frozenset(run.kernel) - for F in [args.FILE]: - with open(F) as fd: - try: - for line in fd: - run = Benchmark_Run_Summary.from_json(line) + if len(run.kernel) > 1 and complexI in run.kernel and (run_insts & inst_set == run_insts): + combined_runs.append(run) - key = collections.Counter(run.kernel) - if len(key) == 1: - gcd = next(iter(key.values())) - else: - gcd = functools.reduce(math.gcd, key.values()) + combined_runs.sort(key=lambda r: [len(r.kernel), r.kernel]) - for i, cnt in key.items(): - assert cnt % gcd == 0, [cnt, gcd] - key[i] = cnt // gcd + ### ILP - kernel = tuple(sorted(sum([[i] * c for i, c in key.items()], []))) - if len(key) == 1: - run = dataclasses.replace(run, kernel=kernel) + KERNELS = [] + INSTS = [] + REAL_MUOPS = [] # filled in below + COMPLEX_MUOPS = [] # filled in below + REAL_PORTS = [Port(f'P{i}') for i in range(8)] + # REAL_PORTS = [Port(f'P{i}') for i in [0, 1, 2, 3, 5, 6]] + # REAL_PORTS = [Port(f'P{i}') for i in [0, 1, 5, 6]] + THROTTLES = [] # filled in below + kernels_with_bottleneck = set() - if kernel in measurements: - old_run = 
measurements[kernel] + def id2port(port_id: int): + ps = [p for p in REAL_PORTS if p.name == f'P{port_id}'] + assert len(ps) == 1, [port_id, ps] + return ps[0] - if run.mpc.p90 > old_run.mpc.p90: - assert abs(run.mpc.p90 - run.mpc.mean) < 0.1, run.kernel + NAME_2_INST = {} - measurements.remove_measurement(kernel) - measurements.force_add(run) - else: - measurements.add_measurement(run) - except json.JSONDecodeError as e: - print('error: malformed file', repr(F) + ':', e, file=sys.stderr) + delta_IM = collections.defaultdict(bool) + delta_MP = {} + delta_IP = {} + delta_IT = {} + mu_KP = {} - with open(args.EQUIVALENCE_CLASSES) as fd: - eq_json = json.load(fd) - eq_classes = [eq['insts'] for eq in eq_json] + def ports_used(run: Benchmark_Run) -> ty.FrozenSet[Port]: + if run.port_muops: + return frozenset(id2port(p) for p in run.ports_used()) + else: + ports = frozenset() - # muIs = [random.choice(eq) for eq in eq_classes] - muIs = [eq[0] for eq in eq_classes] + for i in run.kernel: + run = measurements[i] - for muI in list(muIs): - run = measurements[muI] + ports |= ports_used(run) - ## exclude movups/movaps - if run.mpc.mean >= len(run.ports_used()): - muIs.remove(muI) - continue + return ports - ## exclude div/sqrt/... 
- if run.mpc.mean <= 0.5: - muIs.remove(muI) - continue + if verbose: + print('COMPLEX INSTRUCTION:', complexI) + for run in [complexI_run]: + print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':', + f'IPC({run.ipc.mean:5.3}) fMPC({run.fmpc.mean:5.3}) uMPC({run.umpc.mean:5.3})', ':', + f'{" ".join(str(p) for p in sorted(ports_used(run)))}',) + print('SELECTED MUOP INSTRUCTIONS', len(muI_runs)) + for run in sorted(muI_runs, key=lambda r: r.kernel): + print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':', + f'IPC({run.ipc.mean:5.3}) fMPC({run.fmpc.mean:5.3}) uMPC({run.umpc.mean:5.3})', ':', + f'{" ".join(str(p) for p in sorted(ports_used(run)))}',) + print('SELECTED COMBINATIONS:', len(combined_runs)) + for run in sorted(combined_runs, key=lambda r: str(r.kernel)): + ports = frozenset() + for i in run.kernel: + if i in complexI: + continue + ports |= ports_used(measurements[i]) + print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(110), ':', + f'IPC({run.ipc.mean:5.3}) fMPC({run.fmpc.mean:5.3}) uMPC({run.umpc.mean:5.3})', + f': {" ".join(str(p) for p in sorted(ports))}') - # muIs = random.sample(muIs, 15) + for run in [complexI_run]: + assert len(run.kernel) == 1 - muIs = [ - 'ADD_GPR32i32_GPR32i32', - 'ADCX_GPR32i32_GPR32i32', - 'ADC_GPR32i32_IMMi32', - 'ADDPD_VR128f64x2_VR128f64x2', - 'ADOX_GPR32i32_GPR32i32', - 'AESDECLAST_VR128i32x4_VR128i32x4', - 'ANDN_GPR32i32_GPR32i32_GPR32i32', - 'ANDNPD_VR128u64x2_VR128u64x2', - 'BLENDVPD_VR128f64x2_VR128f64x2', - 'BSF_GPR32i32_GPR32i32', - 'BTC_GPR32i32_GPR32i32', - 'CMOVNZ_GPR32i32_GPR32i32', - 'INSERTPS_VR128f32x4_VR128f32x4_IMMu8', - 'LDDQU_VR128f64x2_MEM64i32x4', - 'MOV_GPR64i64_IMMi64', - 'XOR_GPR32i32_IMMi32', - # 'MOVAPD_VR128f64x2_VR128f64x2', - ] + assert run.num_unfused_muops > 1, [run.kernel, run.num_unfused_muops] - complexIs = [ - # 'ADC_EAXi32_IMMi32', - # 'ADD_GPR64i64_MEM64i64', - 'BEXTR_GPR64i64_GPR64i64_GPR64i64', - # 
'CVTTPD2DQ_VR128i32x4_VR128f64x2', - # 'PACKSSWB_VR128i16x8_MEM64i16x8_SSE2', - # 'PCMPGTB_VR128i8x16_MEM64i8x16_PENTIUMMMX', - ] + n = ' '.join(run.kernel) + i = Inst(name=n, num_fused_muops=run.num_fused_muops, num_unfused_muops=run.num_unfused_muops) - # print('!(' + '|'.join(I for I in muIs + complexIs) + ')') - # exit() + INSTS += [i] + NAME_2_INST[n] = i - all_outputs = [] - ilp_outputs = [] + muops = [Muop(f'Mx{i}') for i in range(run.num_unfused_muops)] + COMPLEX_MUOPS += muops - muI_runs = [measurements[muI] for muI in muIs] - combined_runs = [] + for m in muops: + delta_IM[i, m] = True - for complexI in complexIs: - complexI_run = measurements[complexI] + run_ports = ports_used(run) + assert run_ports - muI_set = frozenset(muIs) - inst_set = frozenset([complexI]) | muI_set + for p in REAL_PORTS: + delta_IP[i, p] = (p in run_ports) - data = collections.defaultdict(set) - for run in measurements: - run_insts = frozenset(run.kernel) + def set_hardcoded_mapping(mapping): + assert len(mapping) == run.num_unfused_muops - if len(run.kernel) > 1 and complexI in run.kernel and (run_insts & inst_set == run_insts): - # print(*sorted(run_insts)) - data[run_insts].add(run) + for muop, ports in enumerate(mapping): + assert len(ports) == len(set(ports)) - for _, runs in data.items(): - runs = sorted(runs, key=lambda r: len(r.kernel)) + ports = [REAL_PORTS[p] for p in ports] - for run in runs[:1] + runs[-2:-1]: - if run not in combined_runs: - combined_runs.append(run) + for port in REAL_PORTS: + delta_MP[muops[muop], port] = bool(port in ports) - combined_runs.sort(key=lambda r: [len(r.kernel), r.kernel]) + if hardcoded_mapping: + set_hardcoded_mapping(hardcoded_mapping) + elif 0: + if n == 'BEXTR_GPR64i64_GPR64i64_GPR64i64': + set_hardcoded_mapping([0, 6], [1, 5]) + set_hardcoded_mapping([0, 1, 6], [1, 5, 6]) + set_hardcoded_mapping([0, 1, 5, 6], [0, 1, 5, 6]) - ### ILP + if n == 'LEA_GPR16i16_ADDR64i64': + set_hardcoded_mapping([0, 1, 5, 6], [1, 5]) - KERNELS = [] - 
INSTS = [] - REAL_MUOPS = [] # filled in below - SLOWDOWN_MUOPS = [] # filled in below - COMPLEX_MUOPS = [Muop(f'Mx{i}') for i in range(complexI_run.num_muops)] - REAL_PORTS = [Port(f'P{i}') for i in range(8)] - # REAL_PORTS = [Port('P0'), Port('P1'), Port('P5'), Port('P6')] - SLOWDOWN_PORTS = [] # filled in below - SPEEDUP_PORTS = [Port(f'P{chr(ord("a") + i)}', type=Port_Type.SPEEDUP) for i in range(0)] - kernels_with_bottleneck = set() + if n == 'IMUL_GPR16i16_GPR16i16_IMMi8': + set_hardcoded_mapping([0, 1, 5, 6], [1]) - def id2port(port_id: int): - ps = [p for p in REAL_PORTS if p.name == f'P{port_id}'] - assert len(ps) == 1, [port_id, ps] - return ps[0] + for run in list(muI_runs): + assert len(run.kernel) == 1 - NAME_2_INST = {} + mpc = run.umpc.mean - delta_IM = {} - delta_MP = {} - delta_IP = {} - mu_KP = {} + assert mpc > 0.1, run.name - # MPC_THROTTLE_MUOP = Muop(name='MPC', is_virtual=True) - # MPC_THROTTLE_PORT = Port(name='MPC', max_throughput=4, type=Port_Type.SLOWDOWN) - # SLOWDOWN_MUOPS += [MPC_THROTTLE_MUOP] - # SLOWDOWN_PORTS += [MPC_THROTTLE_PORT] - # delta_MP[MPC_THROTTLE_MUOP, MPC_THROTTLE_PORT] = True + n = ' '.join(run.kernel) + i = Inst(name=n, num_fused_muops=run.num_fused_muops, num_unfused_muops=run.num_unfused_muops) - def ports_used(run: Benchmark_Run_Summary) -> ty.FrozenSet[Port]: - return frozenset(id2port(p) for p in run.ports_used()) + INSTS += [i] + NAME_2_INST[n] = i - print('COMPLEX INSTRUCTION:', complexI) - for run in [complexI_run]: - print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':', - f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', ':', - f'{" ".join(str(p) for p in sorted(ports_used(run)))}',) - print('SELECTED MUOP INSTRUCTIONS', len(muI_runs)) - for run in sorted(muI_runs, key=lambda r: r.kernel): - print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(60), ':', - f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', ':', - f'{" ".join(str(p) for p in 
sorted(ports_used(run)))}',) - print('SELECTED COMBINATIONS:', len(combined_runs)) - for run in sorted(combined_runs, key=lambda r: r.kernel): - print(' ', Benchmark_Spec.name_from_instruction_names(run.kernel).ljust(110), ':', - f'IPC({run.ipc.mean:5.3}) MPC({run.mpc.mean:5.3})', - f': {" ".join(str(p) for p in sorted(ports_used(run)))}') + run_ports = ports_used(run) + assert run_ports - for run in [complexI_run]: - assert len(run.kernel) == 1 + if n == 'MOV_MEM64i64_GPR64i64': + assert run.num_unfused_muops == 2, run.kernel - num_muops = round(run.muops.mean / run.instructions.mean) + muops = [Muop(name=f'n/{i}') for i in range(run.num_unfused_muops)] - assert num_muops > 1, [run.kernel, num_muops] + REAL_MUOPS += muops - n = ' '.join(run.kernel) - i = Inst(name=n, num_muops=num_muops) + for m in muops: + delta_IM[i, m] = True - INSTS += [i] - NAME_2_INST[n] = i + delta_MP[muops[0], REAL_PORTS[4]] = True + delta_MP[muops[1], REAL_PORTS[2]] = True + delta_MP[muops[1], REAL_PORTS[3]] = True + delta_MP[muops[1], REAL_PORTS[7]] = True + else: + assert run.num_unfused_muops == 1, run.kernel - for m in COMPLEX_MUOPS: - delta_IM[i, m] = True + m = Muop(name=n) - # assert run.kernel[0] == 'BEXTR_GPR64i64_GPR64i64_GPR64i64', run.kernel + REAL_MUOPS += [m] - # delta_MP[COMPLEX_MUOPS[0], id2port(0)] = True - # delta_MP[COMPLEX_MUOPS[0], id2port(6)] = True + delta_IM[i, m] = True - # delta_MP[COMPLEX_MUOPS[1], id2port(1)] = True - # delta_MP[COMPLEX_MUOPS[1], id2port(5)] = True + for p in REAL_PORTS: + delta_MP[m, p] = (p in run_ports) - # for M in COMPLEX_MUOPS: - # for P in REAL_PORTS: - # delta_MP.setdefault((M, P), False) + for p in REAL_PORTS: + delta_IP[i, p] = (p in run_ports) - # for p in SPEEDUP_PORTS: - # delta_MP[COMPLEX_MUOPS[0], p] = False - # delta_MP[COMPLEX_MUOPS[1], p] = False + slowdown = (mpc < 0.95) - # if MEASURE_PORTS: - # run_ports = ports_used(run) + if slowdown: + throttle = Port(name=f'slowdown {round(mpc, 2)}', max_throughput=mpc + 0.0, 
type=Port_Type.SLOWDOWN) - # for p in REAL_PORTS: - # delta_IP[i, p] = (p in run_ports) + THROTTLES.append(throttle) - # MPC throttle - # delta_IM[i, MPC_THROTTLE_MUOP] = True + delta_IT[i, throttle] = True - for run in list(muI_runs): - assert len(run.kernel) == 1 + MUOPS = REAL_MUOPS + COMPLEX_MUOPS + PORTS = REAL_PORTS + THROTTLES = THROTTLES + INSTS.sort(key=lambda i: i.name) + MUOPS.sort(key=lambda m: m.name) + PORTS.sort(key=lambda p: p.name) + THROTTLES.sort(key=lambda p: p.name) - num_muops = round(run.muops.mean / run.instructions.mean) - mpc = run.mpc.mean + for i in INSTS: + for m in MUOPS: + delta_IM.setdefault((i, m), False) - assert mpc > 0.1, run.name + for i in INSTS: + for t in THROTTLES: + delta_IT.setdefault((i, t), False) - assert num_muops == 1, run.kernel + for m in REAL_MUOPS: + for p in PORTS: + delta_MP.setdefault((m, p), False) - slowdown = (mpc < 0.95) + for run in sorted(set([complexI_run] + combined_runs), key=lambda r: r.name): + assert all(i in NAME_2_INST for i in run.kernel), run.kernel - n = ' '.join(run.kernel) - i = Inst(name=n, num_muops=num_muops) - m = Muop(name=n) + ipc = run.ipc.mean + fmpc = run.fmpc.mean + umpc = run.umpc.mean + insts = [NAME_2_INST[i] for i in run.kernel] - INSTS += [i] - REAL_MUOPS += [m] - NAME_2_INST[n] = i + K = Kernel(insts=insts, ipc=ipc, fmpc=fmpc, umpc=umpc) + KERNELS += [K] - delta_IM[i, m] = True + ## NOTE: there are benchmarks that don't saturate any port but have an uMPC > 1 + ## So far all I've seen contained at least three different instructions, + ## one of which must be BEXTR and one must be BSF. 
+ # if run.umpc.mean > 0.95: + # kernels_with_bottleneck.add(K) - run_ports = ports_used(run) + for port, stat in run.port_muops.items(): + usage = stat.p90 / run.cycles.mean - for p in REAL_PORTS: - delta_MP[m, p] = (p in run_ports) + if usage > 0.9: + kernels_with_bottleneck.add(K) + mu_KP[K, REAL_PORTS[port]] = 0.9 - if slowdown: - slowdown_m = Muop(name='slowdown_' + n, is_virtual=True) - slowdown_p = Port(name=f'slowdown {round(mpc, 2)}', max_throughput=mpc + 0.1, type=Port_Type.SLOWDOWN) + if usage < 0.1: + mu_KP[K, REAL_PORTS[port]] = 0.0 - SLOWDOWN_MUOPS.append(slowdown_m) - SLOWDOWN_PORTS.append(slowdown_p) + mu_KP[K, REAL_PORTS[port]] = usage + # print(f'mu_KP[{K}, {REAL_PORTS[port]}] = {usage}') - delta_IM[i, slowdown_m] = True - delta_MP[slowdown_m, slowdown_p] = True + KERNELS = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(K)))) + INSTS = Domain(Inst, INSTS) + MUOPS = Domain(Muop, MUOPS) + PORTS = Domain(Port, PORTS) + THROTTLES = Domain(Port, THROTTLES) - # MPC throttle - # delta_IM[i, MPC_THROTTLE_MUOP] = True + # for M in MUOPS: + # for P in [REAL_PORTS[2], REAL_PORTS[3], REAL_PORTS[4], REAL_PORTS[7]]: + # delta_MP[M, P] = False - MUOPS = REAL_MUOPS + COMPLEX_MUOPS + SLOWDOWN_MUOPS - PORTS = REAL_PORTS + SLOWDOWN_PORTS + SPEEDUP_PORTS - INSTS.sort(key=lambda i: i.name) - MUOPS.sort(key=lambda m: m.name) - PORTS.sort(key=lambda p: p.name) + if verbose: + print(complexI + ': SOLVE WITH', len(KERNELS), 'KERNELS') - for m in SLOWDOWN_MUOPS: - for p in PORTS: - delta_MP.setdefault((m, p), False) + try: + cpu = CPU_Model() + + for I, M in INSTS * MUOPS: + if delta_IM[I, M]: + cpu.add_im_edge(I, M) + + for M, P in MUOPS * PORTS: + if delta_MP[M, P]: + cpu.add_mp_edge(M, P) + + for I, T in INSTS * THROTTLES: + if delta_IT[I, T]: + cpu.add_it_edge(I, T) + + # outputs = solve_for_delta( + # KERNELS, INSTS, MUOPS, PORTS, THROTTLES, + # const_delta_im = delta_IM, + # const_delta_mp = delta_MP, + # const_delta_it = delta_IT, + # const_delta_ip 
= delta_IP, + # allow_errors = False, + # max_error = 0.1, + outputs = solve_for_throughput( + cpu, list(KERNELS), + # const_mu_KP = mu_KP, + kernels_with_bottleneck = kernels_with_bottleneck, + saturation_margin = 0.975, + min_throughput = 0.05, + print_iis = False, + verbose = verbose, + ) + + # print(complexI + ':', hardcoded_mapping, 'max_error', max([o.error for o in outputs])) + return outputs + except ILP_Error as e: + # print(complexI + ':', hardcoded_mapping, e, file=sys.stderr) + # return [] + raise e + + +def main_solve_for_complex_instructions(measurements: Benchmark_Run_Aggregator, + representatives: ty.List[str]): + muIs = representatives - for m in REAL_MUOPS: - for p in SLOWDOWN_PORTS: - delta_MP[m, p] = False + complexIs = [ + # 'ADC_EAXi32_IMMi32', + # 'ADD_GPR64i64_MEM64i64', + 'ADD_MEM64i64_GPR64i64', + # 'BSWAP_GPR64i64', + # 'BEXTR_GPR64i64_GPR64i64_GPR64i64', + # 'IMUL_GPR16i16_GPR16i16_IMMi8', + # 'LEA_GPR16i16_ADDR64i64', + # 'CVTTPD2DQ_VR128i32x4_VR128f64x2', + # 'PACKSSWB_VR128i16x8_MEM64i16x8_SSE2', + # 'PCMPGTB_VR128i8x16_MEM64i8x16_PENTIUMMMX', + # 'MOV_MEM64i64_GPR64i64', + ] - for run in sorted([complexI_run] + combined_runs, key=lambda r: r.name): - assert all(i in NAME_2_INST for i in run.kernel), run.kernel + complexIs = [c for c in complexIs if c not in muIs] - ipc = run.ipc.p90 - mpc = run.mpc.p90 - insts = [NAME_2_INST[i] for i in run.kernel] + log('SELECTED', len(complexIs), 'COMPLEX INSTRUCTIONS') + + # print('!(' + '|'.join(I for I in muIs + complexIs) + ')') + # exit() - K = Kernel(mpc=mpc, ipc=ipc, insts=insts) - KERNELS += [K] + all_outputs = [] - if len(run.kernel) == 1 and run.kernel[0] in complexIs: - KERNELS += [Kernel(mpc=mpc, ipc=ipc, insts=insts)] + class Candidate_Decompositions(ty.NamedTuple): + error: float + decompositions: ty.List[str] - if len(run.kernel) > 1 and run.mpc.mean > 0.95: - kernels_with_bottleneck.add(K) + decompositions: ty.Dict[str, Candidate_Decomposition] = {} - mu = 0 + results = [] - for 
port, stat in run.port_muops.items(): - # if stat.p90 / run.cycles.mean > 0.9: - # kernels_with_bottleneck.add(K) + def summarize_outputs(complexI: str, decomposition: str, outputs): + if decomposition: + print((complexI + ':').ljust(100), decomposition) + print('ERRORS: ', str(len([o for o in outputs if o.has_error])) + '/' + str(len(outputs))) - mu += (stat.mean / run.muops.mean) * run.mpc.mean + def show(name, key): + x = max(outputs, key=key, default='-') + print(name, key(x), f'(sum={sum([key(o) for o in outputs])} - {o.merged_muop_str()} - {o.kernel})') - if (stat.mean / run.muops.mean) < 0.05: - # mu_KP[K, id2port(port)] = 0 - pass - else: - mu_KP[K, id2port(port)] = (stat.mean / run.muops.mean) * run.mpc.mean + show('IPC+MPC ERROR: ', key=lambda o: o.error) + show('MAX_IPC_ERROR: ', key=lambda o: o.ipc_error) + show('MAX_fMPC_ERROR:', key=lambda o: o.fmpc_error) + show('MAX_uMPC_ERROR:', key=lambda o: o.umpc_error) + print() + print() - if run.port_muops: - ## MOVAPD et al :/ - # assert abs(mu - run.mpc.mean) < 0.05, f'{" ".join(run.kernel)} {mu:_.3f} {run.mpc.mean:_.3f}' - pass + for complexI in complexIs: + if 1: + run = measurements[complexI] + + ports = run.ports_used() + + mapping_sort_key = lambda m: [len(m), m] + sort_mapping = lambda m: tuple(sorted(m, key=mapping_sort_key)) + + if 0: # SLOOOOW + mappings = set() + + for mapping in itertools.product(filter(None, powerset(ports)), repeat=run.num_unfused_muops): + mapping_ports = functools.reduce(operator.__or__, map(frozenset, mapping)) + + if mapping_ports == ports: + print('?', *mapping) + mappings.add(sort_mapping(mapping)) + else: + port_sets = [ + (0,), + (1,), + (4,), + (5,), + (6,), + (0, 1), + (0, 5), + (0, 6), + (1, 5), + (2, 3,), + (0, 1, 5), + (2, 3, 7), + (0, 1, 5, 6), + ] + + mappings = itertools.product(port_sets, repeat=run.num_unfused_muops) + mappings = (m for m in mappings if functools.reduce(operator.__or__, map(frozenset, m)) == ports) + mappings = (sort_mapping(m) for m in 
mappings) + mappings = set(mappings) + + mappings = sorted(mappings, key=sort_mapping) + + for idx, mapping in enumerate(mappings, 1): + try: + outputs = solve_for_complex_instruction(complexI, muIs, measurements, mapping, + verbose=False) + + main_output = [o for o in outputs if len(o.kernel) == 1 and o.kernel[0].name == complexI] + assert len(main_output) == 1 + main_output = main_output[0] + + # new_err = sum([o.error for o in outputs]) + new_err = max([o.umpc_error for o in outputs]), sum([o.umpc_error for o in outputs]), max([o.fmpc_error for o in outputs]) + new_decomp = main_output.merged_muop_str() + + if complexI in decompositions: + old_error, old_decomps = decompositions[complexI] + + if new_err < old_error: + # print(' ', f'{decomp.decomposition} ({decomp.error}) < {old_decomp.decomposition} ({old_decomp.error})') + + decompositions[complexI] = Candidate_Decompositions(new_err, [new_decomp]) + elif new_err == old_error: + # print(' ', f'{decomp.decomposition} ({decomp.error}) == {old_decomp.decomposition} ({old_decomp.error})') + decompositions[complexI].decompositions.append(new_decomp) + else: + decompositions[complexI] = Candidate_Decompositions(new_err, [new_decomp]) + + result = [ + # len(sum(mapping, ())), + sum([o.error for o in outputs]), + max([o.error for o in outputs]), + max([o.ipc_error for o in outputs]), max([o.fmpc_error for o in outputs]), max([o.umpc_error for o in outputs]), + sum([o.ipc_error for o in outputs]), sum([o.fmpc_error for o in outputs]), sum([o.umpc_error for o in outputs]), + len([o for o in outputs if o.has_error]), + mapping, + complexI, + ] + # print(mapping, *result[:-1]) + + results.append(result) + print( + f'{idx}/{len(mappings)}', + ' ', complexI, + # f'err={round(new_err, 7)}', + f'err={new_err}', + '-', + new_decomp, + '/', + ' | '.join(decompositions[complexI].decompositions) + ) + except ILP_Error as e: + print('ILP_Error', e) + continue + else: + try: + outputs = solve_for_complex_instruction(complexI, muIs, 
measurements, + hardcoded_mapping=[(1, 5), (0, 1, 5, 6)], verbose=True) + except ILP_Error as e: + print('error:', e, file=sys.stderr) + continue - KERNELS = Domain(Kernel, sorted(KERNELS, key=lambda k: (len(list(k)), str(K)))) - INSTS = Domain(Inst, INSTS) - MUOPS = Domain(Muop, MUOPS) - PORTS = Domain(Port, PORTS) + main_output = [o for o in outputs if len(o.kernel) == 1 and o.kernel[0].name == complexI] + assert len(main_output) == 1 + main_output = main_output[0] - print('SOLVE WITH', len(KERNELS), 'KERNELS') + summarize_outputs(complexI, main_output.merged_muop_str(), outputs) - try: - outputs = solve_for_delta( - KERNELS, INSTS, MUOPS, PORTS, - const_delta_im = delta_IM, - const_delta_mp = delta_MP, - # const_delta_ip = delta_IP, - # const_mu_KP = mu_KP, - kernels_with_bottleneck = kernels_with_bottleneck, - saturation_margin = 0.05, - max_error = 0.05, - min_throughput = 0.05, - print_iis = True, - # print_iis = False, - ) - - # complex_outputs = [o for o in outputs if len(o.kernel) == 1 and o.kernel[0].num_muops != 1] - complex_outputs = [o for o in outputs if max(i.num_muops for i in o.kernel) > 1] - - # print() - # print('FOUND DECOMPOSITIONS:') - # for o in complex_outputs: - # print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':', - # o.cpu.merged_muop_str(o.used_muops())) - # print() - - ilp_outputs += complex_outputs all_outputs += outputs - except ILP_Error as e: - print(e, file=sys.stderr) - if not ilp_outputs: - exit() - for o in ilp_outputs: - print() - print('FOUND DECOMPOSITIONS:') - for o in complex_outputs: - print(' ', Benchmark_Spec.name_from_instruction_names(i.name for i in o.kernel) + ':', - o.cpu.merged_muop_str(o.used_muops())) - print() - print('ERRORS:', str(len([o for o in all_outputs if o.has_error])) + '/' + str(len(outputs))) + # if results: + # for result in sorted(results): + # print(*result) + # print() + # return + + ilp_outputs = [o for o in all_outputs if max(i.num_fused_muops for i in 
o.kernel) > 1] + + print() + print('FOUND DECOMPOSITIONS:') + for i, o in decompositions.items(): + print(' ', (i + ':').ljust(85), ' | '.join(o.decompositions), f'({o.error})') + print() + summarize_outputs(None, None, all_outputs) show_outputs(ilp_outputs, all_outputs) + die = 0 + + def assert_decompositions(inst, want_decomp): + nonlocal die, decompositions + + if inst in decompositions: + have_decomps = decompositions[inst].decompositions + + if [want_decomp] == have_decomps: + # YAY! + pass + elif want_decomp in have_decomps: + print('Non-unique decomposition:', inst.ljust(60), f'WANT: {want_decomp:15} HAVE: {" | ".join(have_decomps)}') + die = 1 + else: + print('Invalid decomposition:', inst.ljust(60), f'WANT: {want_decomp:15} HAVE: {" | ".join(have_decomps)}') + die = 1 + + assert_decompositions('IMUL_GPR16i16_GPR16i16_IMMi8', '1P1 + 1P0156') + assert_decompositions('BEXTR_GPR64i64_GPR64i64_GPR64i64', '1P06 + 1P15') + assert_decompositions('LEA_GPR16i16_ADDR64i64', '1P15 + 1P0156') + exit(die) + def show_outputs(ilp_outputs, all_outputs): ilp_outputs.sort(key=lambda ilp: tuple(ilp.kernel)) @@ -550,7 +666,7 @@ def show_outputs(ilp_outputs, all_outputs): import io C = R = math.ceil(math.sqrt(len(ilp_outputs))) - while C * (R - 1) >= len(ilp_outputs): + while R > 1 and C * (R - 1) >= len(ilp_outputs): R -= 1 fig = plt.figure() @@ -563,15 +679,24 @@ def show_outputs(ilp_outputs, all_outputs): # print() dot = pygraphviz.AGraph(direct=True) - for o in sorted(all_outputs, key=lambda o: [len(o.kernel), str(o.kernel)]): - if not o.has_error: + + for o in sorted(all_outputs, key=lambda o: [o.error, len(o.kernel), str(o.kernel)]): + if not any(i.num_unfused_muops > 1 for i in o.kernel): continue - # if not any('BEXTR' in i.name for i in o.kernel): + + # if len(o.kernel) != 2: + # print('skip', o.kernel) # continue - # if len(o.kernel) > 1 and not any('INSERT' in i.name or 'AES' in i.name for i in o.kernel): + # if not any('BEXTR' in i.name for i in o.kernel): # 
continue - # if len(o.kernel) != 1: + # if not any('ROL' in i.name for i in o.kernel): # continue + + if not o.has_error: + # if len(o.kernel) != 1: + # continue + pass + # continue o.add_to_dot(dot) dot.layout(prog='dot') dot.draw(f'test.dot') @@ -640,6 +765,86 @@ def show_outputs(ilp_outputs, all_outputs): pass +def powerset(iterable: ty.Iterable['T']) -> ty.Iterable[ty.Tuple['T', ...]]: + "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)" + s = list(iterable) + assert s + + it = itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1)) + return it + + +def log(*msg): + print(terminal.Info_Line.timestamp(), '-', *msg, file=sys.stderr) + + if __name__ == '__main__': - main_solve_for_complex_instructions() - # main_solve_for_muI_mapping() + parser = argparse.ArgumentParser() + + parser.set_defaults(command=None) + + subps = parser.add_subparsers() + + ############################################################################## + subp = subps.add_parser('complexI-mapping') + subp.set_defaults(command=main_solve_for_complex_instructions) + + subp.add_argument('MEASUREMENTS_FILE', type=pathlib.Path) + subp.add_argument('EQUIVALENCE_CLASSES', type=pathlib.Path) + + ############################################################################## + subp = subps.add_parser('muI-mapping') + subp.set_defaults(command=main_solve_for_muI_mapping) + + subp.add_argument('MEASUREMENTS_FILE', type=pathlib.Path) + subp.add_argument('EQUIVALENCE_CLASSES', type=pathlib.Path) + + ############################################################################## + + args = parser.parse_args() + + if not args.command: + parser.error('No command specified') + + measurements = Benchmark_Run_Aggregator(max_stddev=math.inf, min_samples=0) + + log('READ MEASUREMENTS FROM', args.MEASUREMENTS_FILE) + + try: + measurements.read_from_file(args.MEASUREMENTS_FILE) + except json.JSONDecodeError as e: + print('error: malformed file', repr(F) + ':', e, 
file=sys.stderr) + + log('READ', len(measurements), 'MEASUREMENTS') + log('READ EQUIVALENCE CLASSES FROM', args.EQUIVALENCE_CLASSES) + + with open(args.EQUIVALENCE_CLASSES) as fd: + eq_json = json.load(fd) + eq_classes = [eq['insts'] for eq in eq_json] + + # muIs = [random.choice(eq) for eq in eq_classes] + muIs = [eq[0] for eq in eq_classes] + + for muI in list(muIs): + run = measurements.get(muI) + if not run: + muIs.remove(muI) + continue + + ## exclude movups/movaps + if run.umpc.mean >= len(run.ports_used()): + muIs.remove(muI) + continue + + ## exclude div/sqrt/... + if run.umpc.mean <= 0.5: + muIs.remove(muI) + continue + + # muIs = random.sample(muIs, 15) + + log('READ', len(muIs), 'SIMPLE INSTRUCTIONS/EQUIVALENCE CLASSES') + + ############################################################################## + + args.command(measurements, muIs) -- GitLab