diff --git a/src/pipedream/benchmark/common.py b/src/pipedream/benchmark/common.py index 41ee5156338a1379cc97c31c4e397dee46f3c08e..5ceba573f9a6f3944e9c353898638e0e62e6e6db 100644 --- a/src/pipedream/benchmark/common.py +++ b/src/pipedream/benchmark/common.py @@ -366,9 +366,9 @@ class Perf_Counter_Spec: 'cycle counter', [ ## recent intel CPUs only - "CPU_CLK_UNHALTED", + "CPU_CLK_THREAD_UNHALTED:u=1", ## works on recent AMD - 'CYCLES_NOT_IN_HALT', + "CYCLES_NOT_IN_HALT", ## fallback "PAPI_TOT_CYC", ], @@ -1237,6 +1237,9 @@ class _Benchmark_Runner: out.insts(chunk) + if (benchmark.align_kernel): + out.align() + if gen_iaca_markers: out.emit_iaca_stop_marker() diff --git a/src/pipedream/benchmark/types.py b/src/pipedream/benchmark/types.py index 1ffd46f2ce9a964f1b0e8d4e015ec4401aa6ccc5..bd1fb5b0e5dde91649d165a0b8c6c7a201f8c2ce 100644 --- a/src/pipedream/benchmark/types.py +++ b/src/pipedream/benchmark/types.py @@ -52,7 +52,7 @@ class Benchmark_Spec(yaml.YAML_Serializable): unroll_factor: int, kernel_iterations: int, arch: ir.Architecture, instructions: ty.List[ir.Instruction], register_pools: ty.Optional[ty.Dict[ir.Instruction, int]], - loop_overhead: Loop_Overhead): + align_kernel: bool, loop_overhead: Loop_Overhead): """ ctor. @@ -64,6 +64,7 @@ class Benchmark_Spec(yaml.YAML_Serializable): :param arch asm.Architecture for the CPU architecture this benchmark is for :param instructions instructions in this benchmark kernel :param register_pools relative importance of the instruction in terms of amount of register reserved + :param align_kernel whether we add padding after each unrolling of `instructions` :param loop_overhead number of instructions dynamically executed to run loop around benchmark kernel (does not include instructions of kernel itself) """ @@ -77,6 +78,7 @@ class Benchmark_Spec(yaml.YAML_Serializable): self.arch = arch self.instructions = instructions self.register_pools = register_pools + self.align_kernel = align_kernel self.loop_overhead = loop_overhead @classmethod @@ -85,6 +87,7 @@ class Benchmark_Spec(yaml.YAML_Serializable): arch: ir.Architecture, instructions: ty.Sequence[ir.Instruction], register_pools: ty.Optional[ty.Dict[ir.Instruction, int]] = None, + align_kernel: bool = False, name: str = None, num_dynamic_instructions: int, unrolled_length: int = None, @@ -141,6 +144,7 @@ class Benchmark_Spec(yaml.YAML_Serializable): arch = arch, instructions = instructions, register_pools = register_pools, + align_kernel = align_kernel, loop_overhead = arch.loop_overhead(kernel_iterations), )