- #!/usr/bin/env -S uv run --script
- # /// script
- # requires-python = ">=3.10"
- # dependencies = [
- # "numpy",
- # "rich",
- # "scipy",
- # "quantiphy",
- # ]
- # ///
- """Script to run GoogleBenchmark binaries repeatedly and render results.
- This script helps run benchmarks repeatedly and render the resulting
- measurements in a way that effectively surfaces noisy benchmarks and provides
- statistically significant information about the measurements.
- There are two primary modes:
- 1) Running a single experiment benchmark binary repeatedly to understand that
- benchmark's performance.
- 2) Running both an experiment and a baseline benchmark binary that include the
- same benchmark names to understand the change in performance for each named
- benchmark.
- Across both of these modes, when rendering a specific metric for a benchmark, we
- also render the confidence intervals based on the specified `--alpha` parameter.
- For mode (1) when running a single benchmark binary, there is additional support
- for passing regular expressions that describe a set of comparable benchmarks for
- some main benchmark. When used, the comparable benchmarks for each main one are
- rendered as deltas against the main rather than as completely independent metrics.
- For mode (2) when running an experiment and baseline binary, every benchmark is
- rendered as a delta of the experiment vs. the baseline.
- Whenever rendering a delta, this script will flag statistically significant
- (according to the provided `--alpha`) improvements or regressions, compute the
- size of the change, and display the resulting p-value. This script uses the
- non-parametric Mann-Whitney U-test for statistical significance, the same as
- Go's benchmark comparison tools, based on the large body of evidence that
- benchmark measurements rarely, if ever, adhere to a normal or other known
- distribution. A non-parametric statistical model instead provides a much more
- realistic basis for comparing two sets of measurements.
- The reported metrics themselves are also classified into "speed" vs. "cost"
- metrics in order to model whether larger is an improvement or a regression.
- This script is run with `uv` rather than directly with Python; `uv` manages and
- caches its dependencies. For installation instructions for `uv`, see:
- - Carbon's documentation:
- https://docs.carbon-lang.dev/docs/project/contribution_tools.html#optional-tools
- - UV's documentation: https://docs.astral.sh/uv/getting-started/installation/
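- Example invocations (the script filename and benchmark binary paths below are
- illustrative):
-   # Mode (1): repeatedly run a single experiment binary.
-   ./benchmark_stats.py --exp_benchmark=./my_benchmark --runs=20
-   # Mode (2): compare an experiment binary against a baseline binary.
-   ./benchmark_stats.py --exp_benchmark=./new_benchmark --base_benchmark=./old_benchmark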
- """
- from __future__ import annotations
- __copyright__ = """
- Part of the Carbon Language project, under the Apache License v2.0 with LLVM
- Exceptions. See /LICENSE for license information.
- SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- """
- import argparse
- import json
- import math
- import numpy as np # type: ignore
- import re
- import scipy as sp # type: ignore
- import subprocess
- import sys
- from collections import defaultdict
- from dataclasses import dataclass, field
- from enum import Enum
- from pathlib import Path
- from quantiphy import Quantity # type: ignore
- from rich.console import Console
- from rich.padding import Padding
- from rich.progress import track
- from rich.table import Column, Table
- from rich.text import Text
- from rich.theme import Theme
- from typing import Optional
- def parse_args(args: Optional[list[str]] = None) -> argparse.Namespace:
- """Parsers command-line arguments and flags."""
- parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument(
- "--exp_benchmark",
- metavar="BINARY",
- required=True,
- type=Path,
- help="The experiment benchmark binary to run",
- )
- parser.add_argument(
- "--base_benchmark",
- metavar="BINARY",
- type=Path,
- help="""
- The baseline benchmark binary to run.
- Passing this flag enables both a baseline and an experiment, and changes the
- analysis to compute and display any statistically significant delta as well
- as the before and after values of each benchmark run.
- """.strip(),
- )
- parser.add_argument(
- "--benchmark_args",
- action="append",
- default=[],
- metavar="ARG",
- help="Extra arguments to both the experiment and baseline benchmark",
- )
- parser.add_argument(
- "--exp_benchmark_args",
- action="append",
- default=[],
- metavar="ARG",
- help="Extra arguments to the experiment benchmark",
- )
- parser.add_argument(
- "--base_benchmark_args",
- action="append",
- default=[],
- metavar="ARG",
- help="Extra arguments to the baseline benchmark",
- )
- parser.add_argument(
- "--benchmark_comparable_re",
- metavar="PATTERN",
- action="append",
- default=[],
- help="""
- A regular expression that is used to match sets of benchmarks that should be
- compared with each other. This flag may be specified multiple times with
- different regular expressions to handle multiple different grouping schemes or
- structures. May not be combined with `--base_benchmark`.
- Each regular expression is used to group together benchmark names distinguished
- by a "tag" substring in the name. Either the regex as a whole or a `tag`
- symbolic capture group within the regex designates this substring. Further, a
- `main` symbolic capture group _must_ be included and must only match when the
- tag substring identifies the main benchmark; other matching names are treated
- as comparisons against it. When rendering, only the name matching the `main`
- capture group is shown directly, with the others rendered as comparisons
- against it, labeled by their tag and annotated with statistical significance.
- Example regex: `(?P<tag>(?P<main>Carbon)|Abseil|LLVM)HashBench`
- This produces three tags, `Carbon`, `Abseil`, and `LLVM`. The main tag is
- `Carbon`.
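- For instance (benchmark names here are illustrative), benchmarks named
- `CarbonHashBench/Keys`, `AbseilHashBench/Keys`, and `LLVMHashBench/Keys` would
- be grouped together: the Carbon variant is rendered as the main benchmark,
- with the other two rendered beneath it as `vs Abseil:` and `vs LLVM:` rows.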
- TODO: This is currently only supported without a base benchmark, providing
- relative comparisons within a single benchmark binary. There are good models
- for also handling this with a base benchmark binary and surfacing
- delta-of-delta information.
- """.strip(),
- )
- parser.add_argument(
- "--runs",
- default=10,
- metavar="N",
- type=int,
- help="Number of runs of the benchmark",
- )
- parser.add_argument(
- "--wall_time",
- action="store_true",
- help="Use wall-clock time instead of CPU time",
- )
- parser.add_argument(
- "--show_iterations",
- action="store_true",
- help="Show the iteration counts",
- )
- parser.add_argument(
- "--extra_metrics_filter",
- metavar="PATTERN",
- type=str,
- help="A regex filter on the names of extra metrics to display.",
- )
- parser.add_argument(
- "--alpha",
- default=0.05,
- metavar="𝛂",
- type=float,
- help="""
- Threshold for P-values to be considered statistically significant. Also used to
- compute the confidence intervals for individual metrics.
- """.strip(),
- )
- parser.add_argument(
- "--output",
- choices=["console", "json"],
- default="console",
- help="""
- Output format to use. Note that `json` output doesn't do any analysis of the
- results; it just dumps the aggregate JSON data from the repeated runs.
- """.strip(),
- )
- return parser.parse_args(args=args)
- # Default arguments that are always passed to the benchmark binaries, even when
- # extra arguments are supplied with `--benchmark_args`. These can be undone by
- # overriding them in explicitly passed arguments.
- DEFAULT_BENCHMARK_ARGS = [
- # Randomize the order in which the benchmarks run to avoid skewed results
- # due to a specific order.
- "--benchmark_enable_random_interleaving",
- # Reduce the default minimum time to 0.1s as it's more effective to use
- # multiple runs to improve confidence in measurements.
- "--benchmark_min_time=0.1s",
- ]
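- # For example (the value here is illustrative), passing
- # `--benchmark_args=--benchmark_min_time=1s` to this script appends that flag
- # after the defaults above, overriding the 0.1s minimum time.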
- # Pre-compiled regexes to match metrics that measure _speed_: larger is better.
- SPEED_METRIC_PATTERNS = [
- re.compile(p)
- for p in [
- r"(?i)rate",
- r"(?i).*per[\s_](second|ms|ns)",
- ]
- ]
- # Pre-compiled regexes to match metrics that measure _cost_: smaller is better.
- COST_METRIC_PATTERNS = [
- re.compile(p)
- for p in [
- r"(?i)cycles",
- r"(?i)instructions",
- r"(?i)time",
- ]
- ]
- # Theme for use with the Rich `Console` printing.
- THEME = Theme(
- {
- "base_median": "cyan",
- "exp_median": "magenta",
- "base_conf": "cyan",
- "exp_conf": "magenta",
- "slower": "bright_red",
- "faster": "bright_green",
- }
- )
- # The set of benchmark keys we ignore in the JSON data structure. Most of these
- # are incidental, but a few are more surprising. See comments on
- # specific entries for details.
- IGNORED_BENCHMARK_KEYS = set(
- [
- "name",
- "family_index",
- "per_family_instance_index",
- "run_name",
- "run_type",
- "repetitions",
- "repetition_index",
- "threads",
- # We don't render `iterations` by default because we instead directly compute
- # statistical error bars using the repeated runs. This removes the need to
- # manually consider the iteration count.
- "iterations",
- # We ignore the time and time unit metrics here because we directly
- # access and special case these metrics in order to apply the unit to
- # the times.
- "real_time",
- "cpu_time",
- "time_unit",
- ]
- )
- class DeltaKind(Enum):
- """Models the relevant kinds of deltas that we end up wanting to render."""
- IMPROVEMENT = "[faster]👍[/faster]"
- NEUTRAL = "~"
- REGRESSION = "[slower]👎[/slower]"
- NOISE = ""
- def __str__(self) -> str:
- return self.value
- @dataclass
- class RenderedDelta:
- """Rendered delta and pvalue for some metric."""
- kind: DeltaKind
- delta: str
- pvalue: str
- @dataclass
- class RenderedMetric:
- """Rendered non-delta metric and its confidence interval."""
- median: str
- conf: str
- @dataclass
- class BenchmarkRunMetrics:
- """The main data class used to collect metrics for benchmark runs.
- The data is read in using a JSON format that isn't organized in a convenient
- way to analyze and render, so we re-organize it into this data class and use
- that for analysis.
- Each object of this class corresponds to a specific named benchmark.
- """
- # The main metrics for this named benchmark, or the "experiment". This field
- # is always populated.
- exp: list[Quantity] = field(default_factory=lambda: [])
- # The metrics for this named benchmark in the base execution. May be empty
- # if no base execution was provided to compute a delta against.
- base: list[Quantity] = field(default_factory=lambda: [])
- # Any comparable benchmark metrics, indexed by the tag name to use when
- # rendering the comparison. May be empty if there are no comparable
- # benchmarks for the main one this represents.
- comps: defaultdict[str, list[Quantity]] = field(
- default_factory=lambda: defaultdict(list)
- )
- @dataclass
- class ComparableBenchmarkMapping:
- """Organizes any comparable benchmarks.
- Constructed with the list of benchmark names and regexes that describe
- comparable name structures.
- Names that match one of these regexes are organized into the main name in
- `main_benchmark_names`, and the comparable names in various mappings to
- allow computing comparison metrics between the main and comparable names.
- Names that don't match any of the regexes are just directly included in
- `main_benchmark_names`.
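- As an illustrative example, with the regex
- `(?P<tag>(?P<main>Carbon)|Abseil|LLVM)HashBench` and (hypothetical) benchmark
- names `CarbonHashBench/Keys` and `AbseilHashBench/Keys`, the mapping places
- `CarbonHashBench/Keys` in `main_benchmark_names`, maps `AbseilHashBench/Keys`
- to the tag `Abseil` in `name_to_comp_tag`, and maps both names to the shared
- base name `HashBench/Keys` in `name_to_base`.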
- """
- # Names that are considered "main" benchmarks after filtering.
- main_benchmark_names: list[str]
- # Maps a comparison benchmark name to its base name (tag removed).
- name_to_base: dict[str, str]
- # Maps a base name to its main benchmark name.
- base_to_main_name: dict[str, str]
- # Maps a comparison benchmark name to its tag.
- name_to_comp_tag: dict[str, str]
- # Maps a main benchmark name to a list of its comparison tags.
- main_name_to_comp_tags: dict[str, list[str]]
- def __init__(
- self,
- original_benchmark_names: list[str],
- comparable_re_strs: list[str],
- console: Console,
- ):
- """Identify main and comparable benchmarks."""
- self.main_benchmark_names = []
- self.name_to_base = {}
- self.base_to_main_name = {}
- self.name_to_comp_tag = {}
- self.main_name_to_comp_tags = {}
- comp_res = [
- re.compile(comparable_re_str)
- for comparable_re_str in comparable_re_strs
- ]
- for comp_re in comp_res:
- if "main" not in comp_re.groupindex:
- console.log(
- "ERROR: No main capture group in the "
- "`--benchmark_comparable_re` flag!"
- )
- sys.exit(1)
- for name in original_benchmark_names:
- comp_match = next(
- (m for comp_re in comp_res if (m := comp_re.search(name))), None
- )
- if not comp_match:
- # Non-comparable benchmark
- self.main_benchmark_names.append(name)
- continue
- tag_group = 0
- if "tag" in comp_match.re.groupindex:
- tag_group = comp_match.re.groupindex["tag"]
- tag = comp_match.group(tag_group)
- tag_begin, tag_end = comp_match.span(tag_group)
- base_name = name[:tag_begin] + name[tag_end:]
- self.name_to_base[name] = base_name
- if comp_match.group("main"):
- self.base_to_main_name[base_name] = name
- self.main_benchmark_names.append(name)
- else:
- self.name_to_comp_tag[name] = tag
- # Verify that for all the comparable benchmarks we actually found a main
- # benchmark name. We can't do this while processing initially as we
- # don't know the relative order of main and comparable benchmark names.
- #
- # Also collect a list of all the comparison tags for a given main name.
- for comp, comp_tag in self.name_to_comp_tag.items():
- base_name = self.name_to_base[comp]
- main_name = self.base_to_main_name.get(base_name)
- if not main_name:
- console.log(
- f"ERROR: Comparable benchmark `{comp}` has no corresponding"
- " main benchmark name!"
- )
- sys.exit(1)
- if comp_tag in self.main_name_to_comp_tags.get(main_name, []):
- console.log(
- f"ERROR: Duplicate comparison tag `{comp_tag}` for main "
- f"benchmark `{main_name}`!"
- )
- sys.exit(1)
- self.main_name_to_comp_tags.setdefault(main_name, []).append(
- comp_tag
- )
- def float_ratio(num: float, denom: float) -> float:
- """Computes the ratio of two floats, handling division by zero."""
- if denom != 0.0:
- return num / denom
- elif num > 0.0:
- return math.inf
- elif num < 0.0:
- return -math.inf
- else:
- return 0.0
- def render_fixed_width_float(x: float) -> str:
- """Renders a floating point value into a fixed width string."""
- if math.isinf(x):
- return f"{x:>4f}{'':<3}"
- (frac, whole) = math.modf(x)
- frac_str = f"{math.fabs(frac):<4.3f}"[1:]
- return f"{int(whole):> 3}{frac_str}"
- def render_ratio(ratio: float) -> str:
- """Renders a ratio into a human-friendly string form.
- This uses a % for ratios with a magnitude of at most 1.0. Ratios with a larger
- magnitude are rendered as a fixed-width floating point number with an `x`
- suffix.
- """
- if ratio > 1.0 or ratio < -1.0:
- return f"{render_fixed_width_float(ratio)}x"
- else:
- return f"{render_fixed_width_float(ratio * 100.0)}%"
- def render_metric(
- alpha: float, times: list[Quantity], is_base: bool
- ) -> RenderedMetric:
- """Render a non-delta metric.
- Computes the string to use for both the metric itself and the string to show
- the confidence interval for that metric.
- Args:
- alpha: The alpha value to use for the confidence interval.
- times: The list of measurements.
- is_base:
- Whether to use the "baseline" or "experiment" theme in the rendered
- strings.
- """
- if is_base:
- style_prefix = "base_"
- else:
- style_prefix = "exp_"
- units = times[0].units
- if all(x == times[0] for x in times):
- with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
- return RenderedMetric(
- f"[{style_prefix}median]{times[0]:.3}[/{style_prefix}median]",
- "",
- )
- median = Quantity(np.median(times), units=units)
- median_test = sp.stats.quantile_test(times, q=median)
- median_ci = median_test.confidence_interval(confidence_level=(1.0 - alpha))
- ci_str = "?"
- if not math.isnan(median_ci.low) and not math.isnan(median_ci.high):
- low_delta = median - median_ci.low
- high_delta = median_ci.high - median
- assert low_delta >= 0.0 and high_delta >= 0.0
- delta = max(low_delta, high_delta)
- ci_str = render_ratio(float_ratio(delta, median))
- with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
- return RenderedMetric(
- f"[{style_prefix}median]{median:.3}[/{style_prefix}median]",
- f"[{style_prefix}conf]{ci_str:9}[/{style_prefix}conf]",
- )
- def render_delta(
- metric: str, alpha: float, base: list[Quantity], exp: list[Quantity]
- ) -> RenderedDelta:
- """Render a delta metric.
- This handles computing the delta, its statistical significance, and
- whether that delta is an improvement or a regression based on the specific
- metric name.
- Args:
- metric:
- The name of the metric to guide whether bigger or smaller is an
- improvement.
- alpha: The alpha value to use for the confidence interval.
- base: The baseline measurements.
- exp: The experiment measurements.
- """
- # Skip any delta when all the data is zero. This typically occurs for
- # uninteresting metrics or metrics that weren't collected for a given run.
- if all(b == 0 for b in base) and all(e == 0 for e in exp):
- return RenderedDelta(DeltaKind.NEUTRAL, "", "")
- if any(speed_pat.search(metric) for speed_pat in SPEED_METRIC_PATTERNS):
- bigger_style = "faster"
- smaller_style = "slower"
- bigger_kind = DeltaKind.IMPROVEMENT
- smaller_kind = DeltaKind.REGRESSION
- elif any(cost_pat.search(metric) for cost_pat in COST_METRIC_PATTERNS):
- bigger_style = "slower"
- smaller_style = "faster"
- bigger_kind = DeltaKind.REGRESSION
- smaller_kind = DeltaKind.IMPROVEMENT
- else:
- return RenderedDelta(DeltaKind.NEUTRAL, "", "")
- u_test = sp.stats.mannwhitneyu(base, exp)
- if u_test.pvalue >= alpha:
- return RenderedDelta(
- DeltaKind.NOISE, " ?? ", f"p={u_test.pvalue:.3}"
- )
- kind = DeltaKind.NEUTRAL
- base_median = np.median(base)
- exp_median = np.median(exp)
- exp_ratio = float_ratio(exp_median, base_median)
- # TODO: Maybe the threshold of "interesting" should be configurable instead
- # of being fixed at 0.1%.
- if exp_ratio >= 1.001:
- style = bigger_style
- kind = bigger_kind
- elif exp_ratio <= 0.999:
- style = smaller_style
- kind = smaller_kind
- else:
- style = "default"
- if exp_ratio >= 2.0 or exp_ratio <= 0.5:
- return RenderedDelta(
- kind,
- f"[{style}]{render_fixed_width_float(exp_ratio)}x[/{style}]",
- f"p={u_test.pvalue:.3}",
- )
- # Use a percent-delta for smaller ratios to make the delta more easily
- # understood by readers.
- exp_delta_percent = (
- float_ratio(exp_median - base_median, base_median) * 100.0
- )
- return RenderedDelta(
- kind,
- f"[{style}]{render_fixed_width_float(exp_delta_percent)}%[/{style}]",
- f"p={u_test.pvalue:.3}",
- )
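- # As an illustrative example for a cost metric such as `cpu_time`: with a
- # baseline median of 100ns, an experiment median of 90ns, and a U-test p-value
- # below alpha, the delta renders as "-10.000%" and is flagged as an
- # improvement.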
- def render_metric_column(
- metric: str,
- alpha: float,
- runs: list[BenchmarkRunMetrics],
- ) -> Table:
- """Render the column of the benchmark results table for a given metric.
- We render a single column for each metric, and use a careful line-oriented
- layout within the column to ensure "rows" line up for each individual
- benchmark. Within the column, we use a nested table to layout the different
- rendered strings.
- A key goal of the rendering throughout is to arrange for rendered numbers to
- have the decimal point in a consistent column so that it isn't confusing for
- readers to identify the position of the decimal point and magnitude of the
- number rendered.
- Args:
- metric: The name of the metric to render.
- alpha: The alpha value to use for the confidence interval.
- runs: The list of benchmark runs.
- """
- t = Table.grid(
- Column(),
- # It might seem like we want the left column here to be right-aligned,
- # but we're going to carefully align the digits in the format string,
- # and we can't easily control the length of units. So we left-align to
- # simplify the digit layout.
- Column(justify="left"),
- Column(justify="center"),
- Column(justify="left"),
- padding=(0, 1),
- )
- for run in runs:
- if len(run.base) != 0:
- # We have a baseline run to compare against, so compute the delta
- # between it and the experiment as well as the specific baseline run
- # metric.
- rendered_delta = render_delta(metric, alpha, run.base, run.exp)
- rendered_base = render_metric(alpha, run.base, is_base=True)
- # Add the delta as the first row, then the baseline metric.
- t.add_row(
- str(rendered_delta.kind),
- rendered_delta.delta,
- "",
- rendered_delta.pvalue,
- )
- t.add_row("", rendered_base.median, "±", rendered_base.conf)
- # Now render the experiment metric and add its row.
- rendered_exp = render_metric(alpha, run.exp, is_base=False)
- t.add_row("", rendered_exp.median, "±", rendered_exp.conf)
- # If we have any comparable benchmarks, render each of them as first a
- # delta and then the specific comparable metric as its own kind of
- # baseline.
- #
- # TODO: At some point when we support combining baseline _runs_ with
- # comparable metrics, we'll need to change this to render both baseline
- # and experiment comparables and a delta-of-delta. But currently we
- # don't support combining these which simplifies the rendering here.
- for name, comp in sorted(run.comps.items()):
- rendered_delta = render_delta(metric, alpha, comp, run.exp)
- t.add_row(
- str(rendered_delta.kind),
- rendered_delta.delta,
- "",
- rendered_delta.pvalue,
- )
- rendered_comp = render_metric(alpha, comp, is_base=True)
- t.add_row("", rendered_comp.median, "±", rendered_comp.conf)
- # Lastly, if we had a baseline run or any comparable metrics we will
- # have rendered multiple lines of data. Add a blank line so that these
- # form a visual group.
- if len(run.base) != 0 or len(run.comps) != 0:
- t.add_row()
- return t
- def run_benchmark_binary(
- binary_path: Path,
- common_args: list[str],
- specific_args: list[str],
- num_runs: int,
- console: Console,
- ) -> list[dict]:
- """Runs a benchmark binary multiple times and collects results.
- The results are parsed out of the JSON output from each run, and returned as
- a list of dictionaries. Each dictionary represents one run.
- This will log the command being run, show a progress bar for each run
- performed, and then log de-duplicated `stderr` output from the runs.
- """
- # If the binary path has no directory components and exists as a relative
- # file, add `./` as a prefix. Otherwise, we want to pass the name unchanged
- # for `PATH` search.
- binary_str = str(binary_path)
- if len(binary_path.parts) == 1 and binary_path.exists():
- binary_str = f"./{binary_str}"
- run_cmd = (
- [binary_str]
- + DEFAULT_BENCHMARK_ARGS
- + common_args
- + specific_args
- # Pass the format flag last as it is required and can't be overridden.
- + ["--benchmark_format=json"]
- )
- console.log(f"Executing: {' '.join(run_cmd)}")
- runs_data = []
- unique_stderr: list[bytes] = []
- for _ in track(
- range(num_runs), description=f"Running {binary_path.name}..."
- ):
- p = subprocess.run(
- run_cmd,
- check=True,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
- runs_data.append(json.loads(p.stdout))
- stderr = p.stderr.strip()
- if len(stderr) != 0 and stderr not in unique_stderr:
- unique_stderr.append(stderr)
- for stderr_output in unique_stderr:
- # Decode stderr, replacing errors in case of non-UTF-8 characters.
- console.log(
- f"{binary_path.name} stderr:\n"
- f"{stderr_output.decode('utf-8', errors='replace')}"
- )
- return runs_data
- def print_run_context(
- console: Console,
- num_runs: int,
- exp_runs: list[dict],
- has_baseline: bool,
- ) -> None:
- """Prints the context from the benchmark runs.
- This replicates the useful context information from Google Benchmark's
- default output, such as CPU information and cache sizes.
- TODO: Print differently when the contexts of the base and experiment runs differ.
- Args:
- console: The rich console to print to.
- num_runs: The number of times the benchmarks were run.
- exp_runs: The results from the experiment benchmark runs.
- has_baseline: Whether a baseline benchmark was also run.
- """
- if has_baseline:
- runs_description = f"Ran baseline and experiment {num_runs} times"
- else:
- runs_description = f"Ran {num_runs} times"
- context = exp_runs[0]["context"]
- console.print(
- f"{runs_description} on "
- f"{context['num_cpus']} x {context['mhz_per_cpu']} MHz CPUs"
- )
- console.print("CPU caches:")
- for cache in context["caches"]:
- size = Quantity(cache["size"], binary=True)
- console.print(f" L{cache['level']} {cache['type']} {size:b}")
- console.print(
- f"Load avg: {' '.join([str(load) for load in context['load_avg']])}"
- )
- def get_benchmark_names_and_metrics(
- console: Console,
- parsed_args: argparse.Namespace,
- exp_runs: list[dict],
- base_runs: list[dict],
- ) -> tuple[list[str], list[str]]:
- """Extracts benchmark names and metrics from benchmark run results.
- This function determines the list of unique benchmark names and the metrics
- to be displayed based on the benchmark output and command-line arguments.
- Args:
- console: The rich console used to report errors.
- parsed_args: The parsed command-line arguments.
- exp_runs: A list of benchmark run results for the experiment binary.
- base_runs: A list of benchmark run results for the baseline binary.
- Returns:
- - The list of unique benchmark names, maintaining their order.
- - The list of metrics to display.
- """
- # Start with the base time and iteration metrics requested.
- metrics: list[str] = []
- if parsed_args.wall_time:
- metrics.append("real_time")
- else:
- metrics.append("cpu_time")
- if parsed_args.show_iterations:
- metrics.append("iterations")
- # Compile a regex for filtering extra metrics, if provided.
- if metrics_filter_str := parsed_args.extra_metrics_filter:
- metrics_filter = re.compile(metrics_filter_str)
- else:
- metrics_filter = None
- # We only need to inspect the first run to find all benchmark and metric
- # names. We combine benchmarks from both experiment and baseline runs to get
- # a complete set.
- # Copy the list so that appending the baseline entries below doesn't mutate
- # the experiment run data itself.
- one_run_benchmarks = list(exp_runs[0]["benchmarks"])
- if parsed_args.base_benchmark:
- one_run_benchmarks += base_runs[0]["benchmarks"]
- benchmark_name_set: set[str] = set()
- benchmark_name_indices: dict[str, tuple[int, int]] = {}
- for benchmark in one_run_benchmarks:
- name = benchmark["name"]
- benchmark_name_set.add(name)
- indices = (
- benchmark["family_index"],
- benchmark["per_family_instance_index"],
- )
- if name not in benchmark_name_indices:
- benchmark_name_indices[name] = indices
- else:
- if benchmark_name_indices[name] != indices:
- console.print(
- f"ERROR: Inconsintent indices {indices} and "
- f"{benchmark_name_indices[name]} for benchmark `{name}`."
- )
- sys.exit(1)
- # Add any extra metrics from this benchmark.
- for key in benchmark.keys():
- if key in metrics or key in IGNORED_BENCHMARK_KEYS:
- continue
- if metrics_filter and not re.search(metrics_filter, key):
- continue
- metrics.append(key)
- benchmark_names = sorted(
- list(benchmark_name_set), key=lambda name: benchmark_name_indices[name]
- )
- return benchmark_names, metrics
- def collect_benchmark_metrics(
- benchmark_names: list[str],
- metrics: list[str],
- exp_runs: list[dict],
- base_runs: list[dict],
- comp_mapping: ComparableBenchmarkMapping,
- ) -> dict[str, dict[str, BenchmarkRunMetrics]]:
- """Collects and organizes all benchmark metrics from raw run data.
- This function takes the raw benchmark run data and organizes it into a
- structured format suitable for analysis and rendering. It initializes the
- main data structure, handles the mapping of comparable benchmarks, and
- populates the metrics for both experiment and baseline runs.
- Args:
- benchmark_names: The initial list of unique benchmark names.
- metrics: A list of all metric names to be collected.
- exp_runs: A list of benchmark run results for the experiment binary.
- base_runs: A list of benchmark run results for the baseline binary.
- comp_mapping: The mapping of comparable benchmarks.
- Returns:
- A dictionary where keys are metric names. The values are another
- dictionary where keys are benchmark names and values are
- BenchmarkRunMetrics objects containing the collected measurements.
- """
- # Initialize the data structure to hold all collected metrics.
- benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]] = {
- metric: {name: BenchmarkRunMetrics() for name in benchmark_names}
- for metric in metrics
- }
- # Populate metrics from the experiment runs.
- for run in exp_runs:
- for b in run["benchmarks"]:
- name = b["name"]
- for metric in metrics:
- # Time metrics have a `time_unit` field that needs to be
- # appended for correct parsing by the Quantity library.
- unit = b.get("time_unit", "") if "time" in metric else ""
- # If this is a comparable benchmark, add its metrics to the
- # 'comps' list of its corresponding main benchmark.
- if maybe_comp_tag := comp_mapping.name_to_comp_tag.get(name):
- main_name = comp_mapping.base_to_main_name[
- comp_mapping.name_to_base[name]
- ]
- benchmark_metrics[metric][main_name].comps[
- maybe_comp_tag
- ].append(Quantity(f"{b[metric]}{unit}"))
- # Otherwise, add it to the 'exp' list of its own entry if it's
- # a main benchmark.
- elif name in benchmark_names:
- benchmark_metrics[metric][name].exp.append(
- Quantity(f"{b[metric]}{unit}")
- )
- # Populate metrics from the baseline runs.
- for run in base_runs:
- for b in run["benchmarks"]:
- name = b["name"]
- # Baseline runs don't have comparable benchmarks, so we only need
- # to populate the 'base' list for main benchmarks.
- if name in benchmark_names:
- for metric in metrics:
- unit = b.get("time_unit", "") if "time" in metric else ""
- benchmark_metrics[metric][name].base.append(
- Quantity(f"{b[metric]}{unit}")
- )
- return benchmark_metrics
- def print_metric_key(
- console: Console,
- alpha: float,
- has_baseline: bool,
- comp_mapping: ComparableBenchmarkMapping,
- ) -> None:
- """Prints a legend for the metrics table.
- This explains the format of the output table, including what the delta,
- median, and confidence interval values represent.
- Args:
- console: The rich console to print to.
- alpha: The alpha value for statistical significance.
- has_baseline: Whether a baseline benchmark was run.
- comp_mapping: The mapping of comparable benchmarks.
- """
- console.print("Metric key:")
- conf = int(100 * (1.0 - alpha))
- name = "BenchmarkName..."
- delta_icon = str(DeltaKind.IMPROVEMENT)
- delta = "[faster]<delta>[/faster]"
- p = "p=<U-test P-value>"
- base_median = "[base_median]<median>[/base_median]"
- base_conf = f"[base_conf]<% at {conf}th conf>[/base_conf]"
- exp_median = "[exp_median]<median>[/exp_median]"
- exp_conf = f"[exp_conf]<% at {conf}th conf>[/exp_conf]"
- key_table = Table.grid(
- Column(justify="right"),
- Column(),
- Column(),
- Column(),
- Column(),
- padding=(0, 1),
- )
- if has_baseline:
- key_table.add_row(name, delta_icon, delta, "", p)
- key_table.add_row("baseline:", "", base_median, "±", base_conf)
- key_table.add_row("experiment:", "", exp_median, "±", exp_conf)
- else:
- key_table.add_row(name, "", exp_median, "±", exp_conf)
- # Only display the comparable key if we have comparables to display.
- if comp_mapping.name_to_comp_tag:
- key_table.add_row("vs Comparable:", delta_icon, delta, "", p)
- key_table.add_row("", "", base_median, "±", base_conf)
- console.print(Padding(key_table, (0, 0, 1, 3)))
- def print_results_table(
- console: Console,
- alpha: float,
- has_baseline: bool,
- metrics: list[str],
- benchmark_names: list[str],
- benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]],
- comp_mapping: ComparableBenchmarkMapping,
- ) -> None:
- """Builds and prints the main results table.
- This function constructs a rich `Table` to display the benchmark results,
- including deltas, medians, and confidence intervals for each metric. It then
- prints this to the provided `console`.
- Args:
- console: The rich console to print to.
- metrics: A list of metric names to be displayed as columns.
- benchmark_names: A list of main benchmark names for the rows.
- alpha: The alpha value for statistical significance.
- benchmark_metrics: A nested dictionary containing the collected metrics
- for each benchmark and metric.
- has_baseline: Whether a baseline benchmark was run.
- comp_mapping: The mapping of comparable benchmarks.
- """
- METRIC_TITLES = {
- "real_time": "Wall Time",
- "cpu_time": "CPU Time",
- "iterations": "Iterations",
- }
- name_width = max(
- (
- len(name)
- for name in (
- benchmark_names
- + [
- f"vs {tag}:"
- for tag in comp_mapping.name_to_comp_tag.values()
- ]
- + ["experiment:"]
- )
- )
- )
- table = Table(show_edge=False)
- # We want the benchmark name column justified right for its sub-labels, but we
- # pad each name out to the full column width so the names themselves visually
- # appear left-justified; force the heading to justify left, unlike the column
- # text. We also disable wrapping because we manually fill the column and
- # require a line-precise layout.
- table.add_column(
- Text("Benchmark", justify="left"), justify="right", no_wrap=True
- )
- for metric in metrics:
- title = Text(METRIC_TITLES.get(metric, metric), justify="center")
- table.add_column(title, justify="left", no_wrap=True)
- name_t = Table.grid(Column(justify="right", no_wrap=True), expand=True)
- for name in benchmark_names:
- name_t.add_row(f"{name}{'.' * (name_width - len(name))}")
- if has_baseline:
- name_t.add_row("baseline:")
- name_t.add_row("experiment:")
- name_t.add_row()
- elif comp_tags := comp_mapping.main_name_to_comp_tags.get(name):
- for tag in comp_tags:
- name_t.add_row(f"vs {tag}:")
- name_t.add_row()
- name_t.add_row()
- row = [name_t]
- for metric in metrics:
- metric_runs = benchmark_metrics[metric]
- row.append(
- render_metric_column(
- metric, alpha, [metric_runs[name] for name in benchmark_names]
- )
- )
- table.add_row(*row)
- console.print(table)
- def main() -> None:
- parsed_args = parse_args()
- console = Console(theme=THEME)
- Quantity.set_prefs(spacer=" ", map_sf=Quantity.map_sf_to_greek)
- if parsed_args.base_benchmark and parsed_args.benchmark_comparable_re:
- console.print(
- "ERROR: Cannot mix a base benchmark binary with benchmark "
- "comparisons."
- )
- sys.exit(1)
- # Run the benchmark(s) and collect the results into a data structure for
- # processing.
- num_runs = parsed_args.runs
- base_runs: list[dict] = []
- has_baseline = bool(parsed_args.base_benchmark)
- if has_baseline:
- base_runs = run_benchmark_binary(
- parsed_args.base_benchmark,
- parsed_args.benchmark_args,
- parsed_args.base_benchmark_args,
- num_runs,
- console,
- )
- exp_runs = run_benchmark_binary(
- parsed_args.exp_benchmark,
- parsed_args.benchmark_args,
- parsed_args.exp_benchmark_args,
- num_runs,
- console,
- )
- # If JSON output is requested, just dump the data without further
- # processing.
- if parsed_args.output == "json":
- console.log("Printing JSON results...")
- console.print_json(json.dumps(exp_runs))
- if has_baseline:
- console.print_json(json.dumps(base_runs))
- return
- print_run_context(console, num_runs, exp_runs, has_baseline)
- # Collect the benchmark names and metric names.
- benchmark_names, metrics = get_benchmark_names_and_metrics(
- console, parsed_args, exp_runs, base_runs
- )
- # Build any mappings between main benchmark names and comparables, and reset
- # our benchmark names to the main ones.
- comp_mapping = ComparableBenchmarkMapping(
- benchmark_names, parsed_args.benchmark_comparable_re, console
- )
- benchmark_names = comp_mapping.main_benchmark_names
- # Collect and organize the actual benchmark metrics from the raw JSON
- # structures across the runs. This pivots the data into a structure that is
- # easy to analyze and render, but doesn't do the analysis itself.
- benchmark_metrics = collect_benchmark_metrics(
- benchmark_names, metrics, exp_runs, base_runs, comp_mapping
- )
- # Analyze and render a readable table of the collected metrics. This is
- # where we do statistical analysis and render confidence intervals,
- # significance, and other helpful indicators based on the collected data. We
- # also print a key for reading and interpreting the rendered data.
- alpha = parsed_args.alpha
- console.print(
- "Computing statistically significant deltas only where"
- f"the P-value < 𝛂 of {alpha}"
- )
- print_metric_key(console, alpha, has_baseline, comp_mapping)
- print_results_table(
- console,
- alpha,
- has_baseline,
- metrics,
- benchmark_names,
- benchmark_metrics,
- comp_mapping,
- )
- if __name__ == "__main__":
- main()