diff --git a/src/pipedream/benchmark/common.py b/src/pipedream/benchmark/common.py
index 41ee5156338a1379cc97c31c4e397dee46f3c08e..5ceba573f9a6f3944e9c353898638e0e62e6e6db 100644
--- a/src/pipedream/benchmark/common.py
+++ b/src/pipedream/benchmark/common.py
@@ -366,9 +366,9 @@ class Perf_Counter_Spec:
       'cycle counter',
       [
         ## recent intel CPUs only
-        "CPU_CLK_UNHALTED",
+        "CPU_CLK_THREAD_UNHALTED:u=1",
         ## works on recent AMD
-        'CYCLES_NOT_IN_HALT',
+        "CYCLES_NOT_IN_HALT",
         ## fallback
         "PAPI_TOT_CYC",
       ],
@@ -1237,6 +1237,9 @@ class _Benchmark_Runner:
 
         out.insts(chunk)
 
+        if (benchmark.align_kernel):
+          out.align()
+
       if gen_iaca_markers:
         out.emit_iaca_stop_marker()
 
diff --git a/src/pipedream/benchmark/types.py b/src/pipedream/benchmark/types.py
index 1ffd46f2ce9a964f1b0e8d4e015ec4401aa6ccc5..bd1fb5b0e5dde91649d165a0b8c6c7a201f8c2ce 100644
--- a/src/pipedream/benchmark/types.py
+++ b/src/pipedream/benchmark/types.py
@@ -52,7 +52,7 @@ class Benchmark_Spec(yaml.YAML_Serializable):
                unroll_factor: int, kernel_iterations: int,
                arch: ir.Architecture, instructions: ty.List[ir.Instruction],
                register_pools: ty.Optional[ty.Dict[ir.Instruction, int]],
-               loop_overhead: Loop_Overhead):
+               align_kernel: bool, loop_overhead: Loop_Overhead):
     """
       ctor.
 
@@ -64,6 +64,7 @@ class Benchmark_Spec(yaml.YAML_Serializable):
       :param arch                 asm.Architecture for the CPU architecture this benchmark is for
       :param instructions         instructions in this benchmark kernel
       :param register_pools       relative importance of the instruction in terms of amount of register reserved
+      :param align_kernel         whether to emit alignment padding after each unrolled copy of `instructions`
       :param loop_overhead        number of instructions dynamically executed to run loop around
                                   benchmark kernel (does not include instructions of kernel itself)
     """
@@ -77,6 +78,7 @@ class Benchmark_Spec(yaml.YAML_Serializable):
     self.arch              = arch
     self.instructions      = instructions
     self.register_pools    = register_pools
+    self.align_kernel      = align_kernel
     self.loop_overhead     = loop_overhead
 
   @classmethod
@@ -85,6 +87,7 @@ class Benchmark_Spec(yaml.YAML_Serializable):
                         arch: ir.Architecture,
                         instructions: ty.Sequence[ir.Instruction],
                         register_pools: ty.Optional[ty.Dict[ir.Instruction, int]] = None,
+                        align_kernel: bool = False,
                         name: str = None,
                         num_dynamic_instructions: int,
                         unrolled_length: int = None,
@@ -141,6 +144,7 @@ class Benchmark_Spec(yaml.YAML_Serializable):
       arch              = arch,
       instructions      = instructions,
       register_pools    = register_pools,
+      align_kernel      = align_kernel,
       loop_overhead     = arch.loop_overhead(kernel_iterations),
     )