#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "numpy",
#   "rich",
#   "scipy",
#   "quantiphy",
# ]
# ///

"""Script to run GoogleBenchmark binaries repeatedly and render results.

This script helps run benchmarks repeatedly and render the resulting
measurements in a way that effectively surfaces noisy benchmarks and provides
statistically significant information about the measurements.

There are two primary modes:

1) Running a single experiment benchmark binary repeatedly to understand that
   benchmark's performance.

2) Running both an experiment and a baseline benchmark binary that include the
   same benchmark names to understand the change in performance for each named
   benchmark.

Across all of these modes, when rendering a specific metric for a benchmark, we
also render the confidence intervals based on the specified `--alpha`
parameter.

For mode (1), when running a single benchmark binary, there is additional
support for passing regular expressions that describe a set of comparable
benchmarks for some main benchmark. When used, the comparable benchmarks for
each main one are rendered as a delta of the main rather than as completely
independent metrics.

For mode (2), when running an experiment and baseline binary, every benchmark
is rendered as a delta of the experiment vs. the baseline.

Whenever rendering a delta, this script flags statistically significant
(according to the provided `--alpha`) improvements or regressions, computes the
size of the improvement or regression, and displays the resulting p-value. The
script uses a non-parametric U-test for statistical significance, the same as
Go's benchmark comparison tools, based on the large body of evidence that
benchmark measurements rarely, if ever, adhere to a normal or other known
distribution. A non-parametric statistical model instead provides a much more
realistic basis for comparing two sets of measurements.

The reported metrics themselves are also classified into "speed" vs. "cost"
metrics in order to model whether larger is an improvement or a regression.

The script is run with `uv` rather than Python directly, which manages and
caches its dependencies. For installation instructions for `uv` see:

- Carbon's documentation:
  https://docs.carbon-lang.dev/docs/project/contribution_tools.html#optional-tools
- UV's documentation: https://docs.astral.sh/uv/getting-started/installation/
"""

from __future__ import annotations

__copyright__ = """
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""

import argparse
import json
import math
import numpy as np  # type: ignore
import re
import scipy as sp  # type: ignore
import subprocess
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from quantiphy import Quantity  # type: ignore
from rich.console import Console
from rich.padding import Padding
from rich.progress import track
from rich.table import Column, Table
from rich.text import Text
from rich.theme import Theme
from typing import Optional

def parse_args(args: Optional[list[str]] = None) -> argparse.Namespace:
    """Parses command-line arguments and flags."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--exp_benchmark",
        metavar="BINARY",
        required=True,
        type=Path,
        help="The experiment benchmark binary to run",
    )
    parser.add_argument(
        "--base_benchmark",
        metavar="BINARY",
        type=Path,
        help="""
The baseline benchmark binary to run.

Passing this flag will enable both a baseline and experiment, and change the
analysis to compute and display any statistically significant delta as well
as the before and after values of each benchmark run.
""".strip(),
    )
    parser.add_argument(
        "--benchmark_args",
        action="append",
        default=[],
        metavar="ARG",
        help="Extra arguments to both the experiment and baseline benchmark",
    )
    parser.add_argument(
        "--exp_benchmark_args",
        action="append",
        default=[],
        metavar="ARG",
        help="Extra arguments to the experiment benchmark",
    )
    parser.add_argument(
        "--base_benchmark_args",
        action="append",
        default=[],
        metavar="ARG",
        help="Extra arguments to the baseline benchmark",
    )
    parser.add_argument(
        "--benchmark_comparable_re",
        metavar="PATTERN",
        action="append",
        default=[],
        help="""
A regular expression used to match sets of benchmarks that should be compared
with each other. This flag may be specified multiple times with different
regular expressions to handle multiple different grouping schemes or
structures. May not be combined with `base_benchmark`.

Each regular expression is used to group together benchmark names distinguished
by a "tag" substring in the name. Either the regex as a whole or a `tag`
symbolic capture group within the regex designates this substring. Further, a
`main` symbolic capture group _must_ be included and should only match when the
specific substring is the main benchmark name; other matching names are viewed
as comparisons against it. When rendering, only the name matching the main
capture group will be rendered, with the others rendered as comparisons against
it based on the tag, and with statistical significance to evaluate the
comparison.

Example regex: `(?P<tag>(?P<main>Carbon)|Abseil|LLVM)HashBench`

This produces three tags, `Carbon`, `Abseil`, and `LLVM`. The main tag is
`Carbon`.

TODO: This is currently only supported without a base benchmark, to provide
relative comparisons within a single benchmark binary. There are good models
for handling this and surfacing delta-of-delta information with a base
benchmark binary.
""".strip(),
    )
    parser.add_argument(
        "--runs",
        default=10,
        metavar="N",
        type=int,
        help="Number of runs of the benchmark",
    )
    parser.add_argument(
        "--wall_time",
        action="store_true",
        help="Use wall-clock time instead of CPU time",
    )
    parser.add_argument(
        "--show_iterations",
        action="store_true",
        help="Show the iteration counts",
    )
    parser.add_argument(
        "--extra_metrics_filter",
        metavar="PATTERN",
        type=str,
        help="A regex filter on the names of extra metrics to display.",
    )
    parser.add_argument(
        "--alpha",
        default=0.05,
        metavar="𝛂",
        type=float,
        help="""
Threshold for P-values to be considered statistically significant. Also used to
compute the confidence intervals for individual metrics.
""".strip(),
    )
    parser.add_argument(
        "--output",
        choices=["console", "json"],
        default="console",
        help="""
Output format to use. Note that `json` output doesn't do any analysis of the
results, and just dumps the aggregate JSON data from the repeated runs.
""".strip(),
    )
    return parser.parse_args(args=args)

# Default arguments that will be passed even when arguments are passed with
# `--benchmark_args` to the script. These can be undone by overriding them in
# explicitly passed arguments.
DEFAULT_BENCHMARK_ARGS = [
    # Randomize the order in which the benchmarks run to avoid skewed results
    # due to a specific order.
    "--benchmark_enable_random_interleaving",
    # Reduce the default minimum time to 0.1s as it's more effective to use
    # multiple runs to improve confidence in measurements.
    "--benchmark_min_time=0.1s",
]
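
# For example (assuming GoogleBenchmark's usual behavior of a later flag
# overriding an earlier one), the 0.1s minimum above can be overridden for a
# single invocation with:
#   --benchmark_args=--benchmark_min_time=1s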

# Pre-compiled regexes to match metrics that measure _speed_: larger is better.
SPEED_METRIC_PATTERNS = [
    re.compile(p)
    for p in [
        r"(?i)rate",
        r"(?i).*per[\s_](second|ms|ns)",
    ]
]

# Pre-compiled regexes to match metrics that measure _cost_: smaller is better.
COST_METRIC_PATTERNS = [
    re.compile(p)
    for p in [
        r"(?i)cycles",
        r"(?i)instructions",
        r"(?i)time",
    ]
]

# Theme for use with the Rich `Console` printing.
THEME = Theme(
    {
        "base_median": "cyan",
        "exp_median": "magenta",
        "base_conf": "cyan",
        "exp_conf": "magenta",
        "slower": "bright_red",
        "faster": "bright_green",
    }
)

# The set of benchmark keys we ignore in the JSON data structure. Most of these
# are incidental, but a few are more surprising. See comments on specific
# entries for details.
IGNORED_BENCHMARK_KEYS = set(
    [
        "name",
        "family_index",
        "per_family_instance_index",
        "run_name",
        "run_type",
        "repetitions",
        "repetition_index",
        "threads",
        # We don't render `iterations` because we instead directly compute
        # statistical error bars using the multiple runs. This removes the
        # need for manually considering the iteration count.
        "iterations",
        # We ignore the time and time unit metrics here because we directly
        # access and special case these metrics in order to apply the unit to
        # the times.
        "real_time",
        "cpu_time",
        "time_unit",
    ]
)

class DeltaKind(Enum):
    """Models the relevant kinds of deltas that we end up wanting to render."""

    IMPROVEMENT = "[faster]👍[/faster]"
    NEUTRAL = "~"
    REGRESSION = "[slower]👎[/slower]"
    NOISE = ""

    def __str__(self) -> str:
        return self.value

@dataclass
class RenderedDelta:
    """Rendered delta and p-value for some metric."""

    kind: DeltaKind
    delta: str
    pvalue: str


@dataclass
class RenderedMetric:
    """Rendered non-delta metric and its confidence interval."""

    median: str
    conf: str


@dataclass
class BenchmarkRunMetrics:
    """The main data class used to collect metrics for benchmark runs.

    The data is read in using a JSON format that isn't organized in a
    convenient way to analyze and render, so we re-organize it into this data
    class and use that for analysis.

    Each object of this class corresponds to a specific named benchmark.
    """

    # The main metrics for this named benchmark, or the "experiment". This
    # field is always populated.
    exp: list[Quantity] = field(default_factory=lambda: [])

    # The metrics for this named benchmark in the base execution. May be empty
    # if no base execution was provided to compute a delta against.
    base: list[Quantity] = field(default_factory=lambda: [])

    # Any comparable benchmark metrics, indexed by the tag name to use when
    # rendering the comparison. May be empty if there are no comparable
    # benchmarks for the main one this represents.
    comps: defaultdict[str, list[Quantity]] = field(
        default_factory=lambda: defaultdict(list)
    )

@dataclass
class ComparableBenchmarkMapping:
    """Organizes any comparable benchmarks.

    Constructed with the list of benchmark names and regexes that describe
    comparable name structures.

    Names that match one of these regexes are organized into the main name in
    `main_benchmark_names`, and the comparable names in various mappings to
    allow computing comparison metrics between the main and comparable names.

    Names that don't match any of the regexes are just directly included in
    `main_benchmark_names`.
    """

    # Names that are considered "main" benchmarks after filtering.
    main_benchmark_names: list[str]

    # Maps a comparison benchmark name to its base name (tag removed).
    name_to_base: dict[str, str]

    # Maps a base name to its main benchmark name.
    base_to_main_name: dict[str, str]

    # Maps a comparison benchmark name to its tag.
    name_to_comp_tag: dict[str, str]

    # Maps a main benchmark name to a list of its comparison tags.
    main_name_to_comp_tags: dict[str, list[str]]

    def __init__(
        self,
        original_benchmark_names: list[str],
        comparable_re_strs: list[str],
        console: Console,
    ):
        """Identify main and comparable benchmarks."""
        self.main_benchmark_names = []
        self.name_to_base = {}
        self.base_to_main_name = {}
        self.name_to_comp_tag = {}
        self.main_name_to_comp_tags = {}

        comp_res = [
            re.compile(comparable_re_str)
            for comparable_re_str in comparable_re_strs
        ]
        for comp_re in comp_res:
            if "main" not in comp_re.groupindex:
                console.log(
                    "ERROR: No main capture group in the "
                    "`--benchmark_comparable_re` flag!"
                )
                sys.exit(1)

        for name in original_benchmark_names:
            comp_match = next(
                (m for comp_re in comp_res if (m := comp_re.search(name))),
                None,
            )
            if not comp_match:
                # Non-comparable benchmark.
                self.main_benchmark_names.append(name)
                continue

            tag_group = 0
            if "tag" in comp_match.re.groupindex:
                tag_group = comp_match.re.groupindex["tag"]
            tag = comp_match.group(tag_group)
            tag_begin, tag_end = comp_match.span(tag_group)
            base_name = name[:tag_begin] + name[tag_end:]
            self.name_to_base[name] = base_name
            if comp_match.group("main"):
                self.base_to_main_name[base_name] = name
                self.main_benchmark_names.append(name)
            else:
                self.name_to_comp_tag[name] = tag

        # Verify that for all the comparable benchmarks we actually found a
        # main benchmark name. We can't do this while processing initially as
        # we don't know the relative order of main and comparable benchmark
        # names.
        #
        # Also collect a list of all the comparison tags for a given main name.
        for comp, comp_tag in self.name_to_comp_tag.items():
            base_name = self.name_to_base[comp]
            # Use `.get` so a missing main benchmark produces the error below
            # rather than a raw `KeyError`.
            main_name = self.base_to_main_name.get(base_name)
            if not main_name:
                console.log(
                    f"ERROR: Comparable benchmark `{comp}` has no "
                    "corresponding main benchmark name!"
                )
                sys.exit(1)
            if comp_tag in self.main_name_to_comp_tags.get(main_name, []):
                console.log(
                    f"ERROR: Duplicate comparison tag `{comp_tag}` for main "
                    f"benchmark `{main_name}`!"
                )
                sys.exit(1)
            self.main_name_to_comp_tags.setdefault(main_name, []).append(
                comp_tag
            )
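
# Illustration (the benchmark names here are assumed for the example): with the
# regex `(?P<tag>(?P<main>Carbon)|Abseil|LLVM)HashBench` from the flag
# documentation and benchmarks named `CarbonHashBench`, `AbseilHashBench`, and
# `LLVMHashBench`, all three share the base name `HashBench`.
# `CarbonHashBench` becomes the main benchmark, and the other two are recorded
# under the tags `Abseil` and `LLVM` to be rendered as comparisons against it.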

def float_ratio(nom: float, denom: float) -> float:
    """Translate a ratio of floats into a float, handling divide by zero."""
    if denom != 0.0:
        return nom / denom
    elif nom > 0.0:
        return math.inf
    elif nom < 0.0:
        return -math.inf
    else:
        return 0.0


def render_fixed_width_float(x: float) -> str:
    """Renders a floating point value into a fixed width string."""
    if math.isinf(x):
        return f"{x:>4f}{'':<3}"
    frac, whole = math.modf(x)
    frac_str = f"{math.fabs(frac):<4.3f}"[1:]
    return f"{int(whole):> 3}{frac_str}"

def render_ratio(ratio: float) -> str:
    """Renders a ratio into a human-friendly string form.

    This uses a % for ratios with a magnitude of at most 1.0. Ratios with a
    larger magnitude are rendered as a fixed width floating point number with
    an `x` suffix.
    """
    if ratio > 1.0 or ratio < -1.0:
        return f"{render_fixed_width_float(ratio)}x"
    else:
        return f"{render_fixed_width_float(ratio * 100.0)}%"
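
# For example, `render_ratio(0.042)` renders as "  4.200%" while
# `render_ratio(2.5)` renders as "  2.500x", keeping the decimal point in a
# fixed column in both forms.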

def render_metric(
    alpha: float, times: list[Quantity], is_base: bool
) -> RenderedMetric:
    """Render a non-delta metric.

    Computes the string to use for both the metric itself and the string to
    show the confidence interval for that metric.

    Args:
        alpha: The alpha value to use for the confidence interval.
        times: The list of measurements.
        is_base:
            Whether to use the "baseline" or "experiment" theme in the
            rendered strings.
    """
    if is_base:
        style_prefix = "base_"
    else:
        style_prefix = "exp_"

    units = times[0].units
    if all(x == times[0] for x in times):
        with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
            return RenderedMetric(
                f"[{style_prefix}median]{times[0]:.3}[/{style_prefix}median]",
                "",
            )

    median = Quantity(np.median(times), units=units)
    median_test = sp.stats.quantile_test(times, q=median)
    median_ci = median_test.confidence_interval(confidence_level=(1.0 - alpha))
    ci_str = "?"
    if not math.isnan(median_ci.low) and not math.isnan(median_ci.high):
        low_delta = median - median_ci.low
        high_delta = median_ci.high - median
        assert low_delta >= 0.0 and high_delta >= 0.0
        delta = max(low_delta, high_delta)
        ci_str = render_ratio(float_ratio(delta, median))
    with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
        return RenderedMetric(
            f"[{style_prefix}median]{median:.3}[/{style_prefix}median]",
            f"[{style_prefix}conf]{ci_str:9}[/{style_prefix}conf]",
        )

def render_delta(
    metric: str, alpha: float, base: list[Quantity], exp: list[Quantity]
) -> RenderedDelta:
    """Render a delta metric.

    This handles computing the delta, its statistical significance, and
    whether that delta is an improvement or a regression based on the specific
    metric name.

    Args:
        metric:
            The name of the metric, to guide whether bigger or smaller is an
            improvement.
        alpha: The alpha value to use for the confidence interval.
        base: The baseline measurements.
        exp: The experiment measurements.
    """
    # Skip any delta when all the data is zero. This typically occurs for
    # uninteresting metrics or metrics that weren't collected for a given run.
    if all(b == 0 for b in base) and all(e == 0 for e in exp):
        return RenderedDelta(DeltaKind.NEUTRAL, "", "")

    if any(speed_pat.search(metric) for speed_pat in SPEED_METRIC_PATTERNS):
        bigger_style = "faster"
        smaller_style = "slower"
        bigger_kind = DeltaKind.IMPROVEMENT
        smaller_kind = DeltaKind.REGRESSION
    elif any(cost_pat.search(metric) for cost_pat in COST_METRIC_PATTERNS):
        bigger_style = "slower"
        smaller_style = "faster"
        bigger_kind = DeltaKind.REGRESSION
        smaller_kind = DeltaKind.IMPROVEMENT
    else:
        return RenderedDelta(DeltaKind.NEUTRAL, "", "")

    u_test = sp.stats.mannwhitneyu(base, exp)
    if u_test.pvalue >= alpha:
        return RenderedDelta(
            DeltaKind.NOISE, " ?? ", f"p={u_test.pvalue:.3}"
        )

    kind = DeltaKind.NEUTRAL
    base_median = np.median(base)
    exp_median = np.median(exp)
    exp_ratio = float_ratio(exp_median, base_median)
    # TODO: Maybe the threshold of "interesting" should be configurable instead
    # of being fixed at 0.1%.
    if exp_ratio >= 1.001:
        style = bigger_style
        kind = bigger_kind
    elif exp_ratio <= 0.999:
        style = smaller_style
        kind = smaller_kind
    else:
        style = "default"

    if exp_ratio >= 2.0 or exp_ratio <= 0.5:
        return RenderedDelta(
            kind,
            f"[{style}]{render_fixed_width_float(exp_ratio)}x[/{style}]",
            f"p={u_test.pvalue:.3}",
        )

    # Use a percent-delta for smaller ratios to make the delta more easily
    # understood by readers.
    exp_delta_percent = (
        float_ratio(exp_median - base_median, base_median) * 100.0
    )
    return RenderedDelta(
        kind,
        f"[{style}]{render_fixed_width_float(exp_delta_percent)}%[/{style}]",
        f"p={u_test.pvalue:.3}",
    )

def render_metric_column(
    metric: str,
    alpha: float,
    runs: list[BenchmarkRunMetrics],
) -> Table:
    """Render the column of the benchmark results table for a given metric.

    We render a single column for each metric, and use a careful line-oriented
    layout within the column to ensure "rows" line up for each individual
    benchmark. Within the column, we use a nested table to lay out the
    different rendered strings.

    A key goal of the rendering throughout is to arrange for rendered numbers
    to have the decimal point in a consistent column, so that readers can
    easily identify the position of the decimal point and the magnitude of the
    number rendered.

    Args:
        metric: The name of the metric to render.
        alpha: The alpha value to use for the confidence interval.
        runs: The list of benchmark runs.
    """
    t = Table.grid(
        Column(),
        # It might seem like we want the left column here to be right-aligned,
        # but we're going to carefully align the digits in the format string,
        # and we can't easily control the length of units. So we left-align to
        # simplify the digit layout.
        Column(justify="left"),
        Column(justify="center"),
        Column(justify="left"),
        padding=(0, 1),
    )
    for run in runs:
        if len(run.base) != 0:
            # We have a baseline run to compare against, so compute the delta
            # between it and the experiment as well as the specific baseline
            # run metric.
            rendered_delta = render_delta(metric, alpha, run.base, run.exp)
            rendered_base = render_metric(alpha, run.base, is_base=True)

            # Add the delta as the first row, then the baseline metric.
            t.add_row(
                str(rendered_delta.kind),
                rendered_delta.delta,
                "",
                rendered_delta.pvalue,
            )
            t.add_row("", rendered_base.median, "±", rendered_base.conf)

        # Now render the experiment metric and add its row.
        rendered_exp = render_metric(alpha, run.exp, is_base=False)
        t.add_row("", rendered_exp.median, "±", rendered_exp.conf)

        # If we have any comparable benchmarks, render each of them as first a
        # delta and then the specific comparable metric as its own kind of
        # baseline.
        #
        # TODO: At some point when we support combining baseline _runs_ with
        # comparable metrics, we'll need to change this to render both baseline
        # and experiment comparables and a delta-of-delta. But currently we
        # don't support combining these, which simplifies the rendering here.
        for _tag, comp in sorted(run.comps.items()):
            rendered_delta = render_delta(metric, alpha, comp, run.exp)
            t.add_row(
                str(rendered_delta.kind),
                rendered_delta.delta,
                "",
                rendered_delta.pvalue,
            )
            rendered_comp = render_metric(alpha, comp, is_base=True)
            t.add_row("", rendered_comp.median, "±", rendered_comp.conf)

        # Lastly, if we had a baseline run or any comparable metrics we will
        # have rendered multiple lines of data. Add a blank line so that these
        # form a visual group.
        if len(run.base) != 0 or len(run.comps) != 0:
            t.add_row()
    return t

def run_benchmark_binary(
    binary_path: Path,
    common_args: list[str],
    specific_args: list[str],
    num_runs: int,
    console: Console,
) -> list[dict]:
    """Runs a benchmark binary multiple times and collects results.

    The results are parsed out of the JSON output from each run, and returned
    as a list of dictionaries. Each dictionary represents one run.

    This will log the command being run, show a progress bar for each run
    performed, and then log de-duplicated `stderr` output from the runs.
    """
    # If the binary path has no directory components and exists as a relative
    # file, add `./` as a prefix. Otherwise, we want to pass the name unchanged
    # for `PATH` search.
    binary_str = str(binary_path)
    if len(binary_path.parts) == 1 and binary_path.exists():
        binary_str = f"./{binary_str}"

    run_cmd = (
        [binary_str]
        + DEFAULT_BENCHMARK_ARGS
        + common_args
        + specific_args
        # Pass the format flag last as it is required and can't be overridden.
        + ["--benchmark_format=json"]
    )
    console.log(f"Executing: {' '.join(run_cmd)}")
    runs_data = []
    unique_stderr: list[bytes] = []
    for _ in track(
        range(num_runs), description=f"Running {binary_path.name}..."
    ):
        p = subprocess.run(
            run_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        runs_data.append(json.loads(p.stdout))
        stderr = p.stderr.strip()
        if len(stderr) != 0 and stderr not in unique_stderr:
            unique_stderr.append(stderr)
    for stderr_output in unique_stderr:
        # Decode stderr, replacing errors in case of non-UTF-8 characters.
        console.log(
            f"{binary_path.name} stderr:\n"
            f"{stderr_output.decode('utf-8', errors='replace')}"
        )
    return runs_data

def print_run_context(
    console: Console,
    num_runs: int,
    exp_runs: list[dict],
    has_baseline: bool,
) -> None:
    """Prints the context from the benchmark runs.

    This replicates the useful context information from Google Benchmark's
    default output, such as CPU information and cache sizes.

    TODO: Print differently when the contexts of the base and experiment runs
    differ.

    Args:
        console: The rich console to print to.
        num_runs: The number of times the benchmarks were run.
        exp_runs: The results from the experiment benchmark runs.
        has_baseline: Whether a baseline benchmark was also run.
    """
    if has_baseline:
        runs_description = f"Ran baseline and experiment {num_runs} times"
    else:
        runs_description = f"Ran {num_runs} times"
    context = exp_runs[0]["context"]
    console.print(
        f"{runs_description} on "
        f"{context['num_cpus']} x {context['mhz_per_cpu']} MHz CPUs"
    )
    console.print("CPU caches:")
    for cache in context["caches"]:
        size = Quantity(cache["size"], binary=True)
        console.print(f"  L{cache['level']} {cache['type']} {size:b}")
    console.print(
        f"Load avg: {' '.join([str(load) for load in context['load_avg']])}"
    )

def get_benchmark_names_and_metrics(
    console: Console,
    parsed_args: argparse.Namespace,
    exp_runs: list[dict],
    base_runs: list[dict],
) -> tuple[list[str], list[str]]:
    """Extracts benchmark names and metrics from benchmark run results.

    This function determines the list of unique benchmark names and the
    metrics to be displayed based on the benchmark output and command-line
    arguments.

    Args:
        console: The rich console to print errors to.
        parsed_args: The parsed command-line arguments.
        exp_runs: A list of benchmark run results for the experiment binary.
        base_runs: A list of benchmark run results for the baseline binary.

    Returns:
        - The list of unique benchmark names, maintaining their order.
        - The list of metrics to display.
    """
    # Start with the base time and iteration metrics requested.
    metrics: list[str] = []
    if parsed_args.wall_time:
        metrics.append("real_time")
    else:
        metrics.append("cpu_time")
    if parsed_args.show_iterations:
        metrics.append("iterations")

    # Compile a regex for filtering extra metrics, if provided.
    if metrics_filter_str := parsed_args.extra_metrics_filter:
        metrics_filter = re.compile(metrics_filter_str)
    else:
        metrics_filter = None

    # We only need to inspect the first run to find all benchmark and metric
    # names. We combine benchmarks from both experiment and baseline runs to
    # get a complete set. Copy the list so that extending it doesn't mutate
    # the experiment run data itself.
    one_run_benchmarks = list(exp_runs[0]["benchmarks"])
    if parsed_args.base_benchmark:
        one_run_benchmarks += base_runs[0]["benchmarks"]
    benchmark_name_set: set[str] = set()
    benchmark_name_indices: dict[str, tuple[int, int]] = {}
    for benchmark in one_run_benchmarks:
        name = benchmark["name"]
        benchmark_name_set.add(name)
        indices = (
            benchmark["family_index"],
            benchmark["per_family_instance_index"],
        )
        if name not in benchmark_name_indices:
            benchmark_name_indices[name] = indices
        elif benchmark_name_indices[name] != indices:
            console.print(
                f"ERROR: Inconsistent indices {indices} and "
                f"{benchmark_name_indices[name]} for benchmark `{name}`."
            )
            sys.exit(1)

        # Add any extra metrics from this benchmark.
        for key in benchmark.keys():
            if key in metrics or key in IGNORED_BENCHMARK_KEYS:
                continue
            if metrics_filter and not re.search(metrics_filter, key):
                continue
            metrics.append(key)

    benchmark_names = sorted(
        list(benchmark_name_set), key=lambda name: benchmark_name_indices[name]
    )
    return benchmark_names, metrics

def collect_benchmark_metrics(
    benchmark_names: list[str],
    metrics: list[str],
    exp_runs: list[dict],
    base_runs: list[dict],
    comp_mapping: ComparableBenchmarkMapping,
) -> dict[str, dict[str, BenchmarkRunMetrics]]:
    """Collects and organizes all benchmark metrics from raw run data.

    This function takes the raw benchmark run data and organizes it into a
    structured format suitable for analysis and rendering. It initializes the
    main data structure, handles the mapping of comparable benchmarks, and
    populates the metrics for both experiment and baseline runs.

    Args:
        benchmark_names: The initial list of unique benchmark names.
        metrics: A list of all metric names to be collected.
        exp_runs: A list of benchmark run results for the experiment binary.
        base_runs: A list of benchmark run results for the baseline binary.
        comp_mapping: The mapping of comparable benchmarks.

    Returns:
        A dictionary where keys are metric names. The values are another
        dictionary where keys are benchmark names and values are
        `BenchmarkRunMetrics` objects containing the collected measurements.
    """
    # Initialize the data structure to hold all collected metrics.
    benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]] = {
        metric: {name: BenchmarkRunMetrics() for name in benchmark_names}
        for metric in metrics
    }

    # Populate metrics from the experiment runs.
    for run in exp_runs:
        for b in run["benchmarks"]:
            name = b["name"]
            for metric in metrics:
                # Time metrics have a `time_unit` field that needs to be
                # appended for correct parsing by the Quantity library.
                unit = b.get("time_unit", "") if "time" in metric else ""
                # If this is a comparable benchmark, add its metrics to the
                # `comps` list of its corresponding main benchmark.
                if maybe_comp_tag := comp_mapping.name_to_comp_tag.get(name):
                    main_name = comp_mapping.base_to_main_name[
                        comp_mapping.name_to_base[name]
                    ]
                    benchmark_metrics[metric][main_name].comps[
                        maybe_comp_tag
                    ].append(Quantity(f"{b[metric]}{unit}"))
                # Otherwise, add it to the `exp` list of its own entry if it's
                # a main benchmark.
                elif name in benchmark_names:
                    benchmark_metrics[metric][name].exp.append(
                        Quantity(f"{b[metric]}{unit}")
                    )

    # Populate metrics from the baseline runs.
    for run in base_runs:
        for b in run["benchmarks"]:
            name = b["name"]
            # Baseline runs don't have comparable benchmarks, so we only need
            # to populate the `base` list for main benchmarks.
            if name in benchmark_names:
                for metric in metrics:
                    unit = b.get("time_unit", "") if "time" in metric else ""
                    benchmark_metrics[metric][name].base.append(
                        Quantity(f"{b[metric]}{unit}")
                    )
    return benchmark_metrics

def print_metric_key(
    console: Console,
    alpha: float,
    has_baseline: bool,
    comp_mapping: ComparableBenchmarkMapping,
) -> None:
    """Prints a legend for the metrics table.

    This explains the format of the output table, including what the delta,
    median, and confidence interval values represent.

    Args:
        console: The rich console to print to.
        alpha: The alpha value for statistical significance.
        has_baseline: Whether a baseline benchmark was run.
        comp_mapping: The mapping of comparable benchmarks.
    """
    console.print("Metric key:")
    conf = int(100 * (1.0 - alpha))
    name = "BenchmarkName..."
    delta_icon = str(DeltaKind.IMPROVEMENT)
    delta = "[faster]<delta>[/faster]"
    p = "p=<U-test P-value>"
    base_median = "[base_median]<median>[/base_median]"
    base_conf = f"[base_conf]<% at {conf}th conf>[/base_conf]"
    exp_median = "[exp_median]<median>[/exp_median]"
    exp_conf = f"[exp_conf]<% at {conf}th conf>[/exp_conf]"
    key_table = Table.grid(
        Column(justify="right"),
        Column(),
        Column(),
        Column(),
        Column(),
        padding=(0, 1),
    )
    if has_baseline:
        key_table.add_row(name, delta_icon, delta, "", p)
        key_table.add_row("baseline:", "", base_median, "±", base_conf)
        key_table.add_row("experiment:", "", exp_median, "±", exp_conf)
    else:
        key_table.add_row(name, "", exp_median, "±", exp_conf)

    # Only display the comparable key if we have comparables to display.
    if comp_mapping.name_to_comp_tag:
        key_table.add_row("vs Comparable:", delta_icon, delta, p)
        key_table.add_row("", "", base_median, "±", base_conf)

    console.print(Padding(key_table, (0, 0, 1, 3)))

def print_results_table(
    console: Console,
    alpha: float,
    has_baseline: bool,
    metrics: list[str],
    benchmark_names: list[str],
    benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]],
    comp_mapping: ComparableBenchmarkMapping,
) -> None:
    """Builds and prints the main results table.

    This function constructs a rich `Table` to display the benchmark results,
    including deltas, medians, and confidence intervals for each metric. It
    then prints this to the provided `console`.

    Args:
        console: The rich console to print to.
        alpha: The alpha value for statistical significance.
        has_baseline: Whether a baseline benchmark was run.
        metrics: A list of metric names to be displayed as columns.
        benchmark_names: A list of main benchmark names for the rows.
        benchmark_metrics: A nested dictionary containing the collected
            metrics for each benchmark and metric.
        comp_mapping: The mapping of comparable benchmarks.
    """
    METRIC_TITLES = {
        "real_time": "Wall Time",
        "cpu_time": "CPU Time",
        "iterations": "Iterations",
    }
    name_width = max(
        len(name)
        for name in (
            benchmark_names
            + [
                f"vs {tag}:"
                for tag in comp_mapping.name_to_comp_tag.values()
            ]
            + ["experiment:"]
        )
    )

    table = Table(show_edge=False)
    # We want the benchmark name column justified right for the sub-labels,
    # but we fill each name out to the full column width so the names
    # themselves visually justify left; so force the heading to justify left,
    # unlike the column text. We also disable wrapping because we manually
    # fill the column and require line-precise layout.
    table.add_column(
        Text("Benchmark", justify="left"), justify="right", no_wrap=True
    )
    for metric in metrics:
        title = Text(METRIC_TITLES.get(metric, metric), justify="center")
        table.add_column(title, justify="left", no_wrap=True)

    name_t = Table.grid(Column(justify="right", no_wrap=True), expand=True)
    for name in benchmark_names:
        name_t.add_row(f"{name}{'.' * (name_width - len(name))}")
        if has_baseline:
            name_t.add_row("baseline:")
            name_t.add_row("experiment:")
            name_t.add_row()
        elif comp_tags := comp_mapping.main_name_to_comp_tags.get(name):
            for tag in comp_tags:
                name_t.add_row(f"vs {tag}:")
                name_t.add_row()
            name_t.add_row()
    row = [name_t]
    for metric in metrics:
        metric_runs = benchmark_metrics[metric]
        row.append(
            render_metric_column(
                metric, alpha, [metric_runs[name] for name in benchmark_names]
            )
        )
    table.add_row(*row)
    console.print(table)

def main() -> None:
    parsed_args = parse_args()
    console = Console(theme=THEME)
    Quantity.set_prefs(spacer=" ", map_sf=Quantity.map_sf_to_greek)
    if parsed_args.base_benchmark and parsed_args.benchmark_comparable_re:
        console.print(
            "ERROR: Cannot mix a base benchmark binary with benchmark "
            "comparisons."
        )
        sys.exit(1)

    # Run the benchmark(s) and collect the results into a data structure for
    # processing.
    num_runs = parsed_args.runs
    base_runs: list[dict] = []
    has_baseline = bool(parsed_args.base_benchmark)
    if has_baseline:
        base_runs = run_benchmark_binary(
            parsed_args.base_benchmark,
            parsed_args.benchmark_args,
            parsed_args.base_benchmark_args,
            num_runs,
            console,
        )
    exp_runs = run_benchmark_binary(
        parsed_args.exp_benchmark,
        parsed_args.benchmark_args,
        parsed_args.exp_benchmark_args,
        num_runs,
        console,
    )

    # If JSON output is requested, just dump the data without further
    # processing.
    if parsed_args.output == "json":
        console.log("Printing JSON results...")
        console.print_json(json.dumps(exp_runs))
        if has_baseline:
            console.print_json(json.dumps(base_runs))
        return

    print_run_context(console, num_runs, exp_runs, has_baseline)

    # Collect the benchmark names and metric names.
    benchmark_names, metrics = get_benchmark_names_and_metrics(
        console, parsed_args, exp_runs, base_runs
    )

    # Build any mappings between main benchmark names and comparables, and
    # reset our benchmark names to the main ones.
    comp_mapping = ComparableBenchmarkMapping(
        benchmark_names, parsed_args.benchmark_comparable_re, console
    )
    benchmark_names = comp_mapping.main_benchmark_names

    # Collect and organize the actual benchmark metrics from the raw JSON
    # structures across the runs. This pivots the data into an easy-to-analyze
    # and easy-to-render structure, but doesn't do the analysis itself.
    benchmark_metrics = collect_benchmark_metrics(
        benchmark_names, metrics, exp_runs, base_runs, comp_mapping
    )

    # Analyze and render a readable table of the collected metrics. This is
    # where we do statistical analysis and render confidence intervals,
    # significance, and other helpful indicators based on the collected data.
    # We also print a relevant key for reading and interpreting the rendered
    # data.
    alpha = parsed_args.alpha
    console.print(
        "Computing statistically significant deltas only where "
        f"the P-value < 𝛂 of {alpha}"
    )
    print_metric_key(console, alpha, has_baseline, comp_mapping)
    print_results_table(
        console,
        alpha,
        has_baseline,
        metrics,
        benchmark_names,
        benchmark_metrics,
        comp_mapping,
    )


if __name__ == "__main__":
    main()