#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "numpy",
#   "rich",
#   "scipy",
#   "quantiphy",
# ]
# ///

"""Script to run GoogleBenchmark binaries repeatedly and render results.

This script helps run benchmarks repeatedly and render the resulting
measurements in a way that effectively surfaces noisy benchmarks and provides
statistically significant information about the measurements.

There are two primary modes:

1) Running a single experiment benchmark binary repeatedly to understand that
   benchmark's performance.

2) Running both an experiment and a baseline benchmark binary that include the
   same benchmark names to understand the change in performance for each named
   benchmark.

Across all of these modes, when rendering a specific metric for a benchmark, we
also render the confidence intervals based on the specified `--alpha`
parameter.

For mode (1), when running a single benchmark binary, there is additional
support for passing regular expressions that describe a set of comparable
benchmarks for some main benchmark. When used, the comparable benchmarks for
each main one are rendered as a delta of the main rather than as completely
independent metrics.

For mode (2), when running an experiment and baseline binary, every benchmark
is rendered as a delta of the experiment vs. the baseline.

Whenever rendering a delta, this script flags statistically significant
(according to the provided `--alpha`) improvements or regressions, computes the
size of the improvement or regression, and displays the resulting p-value. The
script uses a non-parametric U-test for statistical significance, the same as
Go's benchmark comparison tools, based on the large body of evidence that
benchmark measurements rarely, if ever, adhere to a normal or other known
distribution. A non-parametric statistical model instead provides a much more
realistic basis for comparing two sets of measurements.

The reported metrics themselves are also classified into "speed" vs. "cost"
metrics in order to model whether larger is an improvement or a regression.

The script is run with `uv` rather than Python directly, which manages and
caches its dependencies. For installation instructions for `uv` see:

- Carbon's documentation:
  https://docs.carbon-lang.dev/docs/project/contribution_tools.html#optional-tools
- UV's documentation: https://docs.astral.sh/uv/getting-started/installation/
"""

from __future__ import annotations

__copyright__ = """
Part of the Carbon Language project, under the Apache License v2.0 with LLVM
Exceptions. See /LICENSE for license information.
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""

import argparse
import json
import math
import numpy as np  # type: ignore
import re
import scipy as sp  # type: ignore
import subprocess
import sys
from collections import defaultdict
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from quantiphy import Quantity  # type: ignore
from rich.console import Console
from rich.padding import Padding
from rich.progress import track
from rich.table import Column, Table
from rich.text import Text
from rich.theme import Theme
from typing import Optional

def parse_args(args: Optional[list[str]] = None) -> argparse.Namespace:
    """Parses command-line arguments and flags."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--exp_benchmark",
        metavar="BINARY",
        required=True,
        type=Path,
        help="The experiment benchmark binary to run",
    )
    parser.add_argument(
        "--base_benchmark",
        metavar="BINARY",
        type=Path,
        help="""
The baseline benchmark binary to run.

Passing this flag will enable both a baseline and experiment, and change the
analysis to compute and display any statistically significant delta as well
as the before and after values of each benchmark run.
""".strip(),
    )
    parser.add_argument(
        "--benchmark_args",
        action="append",
        default=[],
        metavar="ARG",
        help="Extra arguments to both the experiment and baseline benchmark",
    )
    parser.add_argument(
        "--exp_benchmark_args",
        action="append",
        default=[],
        metavar="ARG",
        help="Extra arguments to the experiment benchmark",
    )
    parser.add_argument(
        "--base_benchmark_args",
        action="append",
        default=[],
        metavar="ARG",
        help="Extra arguments to the baseline benchmark",
    )
    parser.add_argument(
        "--benchmark_comparable_re",
        metavar="PATTERN",
        action="append",
        default=[],
        help="""
A regular expression used to match sets of benchmarks that should be compared
with each other. This flag may be specified multiple times with different
regular expressions to handle multiple different grouping schemes or
structures. May not be combined with `base_benchmark`.

Each regular expression is used to group together benchmark names distinguished
by a "tag" substring in the name. Either the regex as a whole or a `tag`
symbolic capture group within the regex designates this substring. Further, a
`main` symbolic capture group _must_ be included and should only match when the
specific substring is the main benchmark name; other matching names are viewed
as comparisons against it. When rendering, only the name matching the main
capture group will be rendered, with the others rendered as comparisons against
it based on the tag, and with statistical significance to evaluate the
comparison.

Example regex: `(?P<tag>(?P<main>Carbon)|Abseil|LLVM)HashBench`

This produces three tags, `Carbon`, `Abseil`, and `LLVM`. The main tag is
`Carbon`.

TODO: This is currently only supported without a base benchmark, to provide
relative comparisons within a single benchmark binary. There are good models
for handling this and surfacing delta-of-delta information with a base
benchmark binary.
""".strip(),
    )
    parser.add_argument(
        "--runs",
        default=10,
        metavar="N",
        type=int,
        help="Number of runs of the benchmark",
    )
    parser.add_argument(
        "--wall_time",
        action="store_true",
        help="Use wall-clock time instead of CPU time",
    )
    parser.add_argument(
        "--show_iterations",
        action="store_true",
        help="Show the iteration counts",
    )
    parser.add_argument(
        "--extra_metrics_filter",
        metavar="PATTERN",
        type=str,
        help="A regex filter on the names of extra metrics to display.",
    )
    parser.add_argument(
        "--alpha",
        default=0.05,
        metavar="𝛂",
        type=float,
        help="""
Threshold for P-values to be considered statistically significant. Also used to
compute the confidence intervals for individual metrics.
""".strip(),
    )
    parser.add_argument(
        "--output",
        choices=["console", "json"],
        default="console",
        help="""
Output format to use. Note that `json` output doesn't do any analysis of the
results, and just dumps the aggregate JSON data from the repeated runs.
""".strip(),
    )
    return parser.parse_args(args=args)

# Default arguments that will be passed even when arguments are passed with
# `--benchmark_args` to the script. These can be undone by overriding them in
# explicitly passed arguments.
DEFAULT_BENCHMARK_ARGS = [
    # Randomize the order in which the benchmarks run to avoid skewed results
    # due to a specific order.
    "--benchmark_enable_random_interleaving",
    # Reduce the default minimum time to 0.1s as it's more effective to use
    # multiple runs to improve confidence in measurements.
    "--benchmark_min_time=0.1s",
]
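
# For example (assuming GoogleBenchmark's usual behavior of a later flag
# overriding an earlier one), the 0.1s minimum above can be overridden for a
# single invocation with:
#   --benchmark_args=--benchmark_min_time=1s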

# Pre-compiled regexes to match metrics that measure _speed_: larger is better.
SPEED_METRIC_PATTERNS = [
    re.compile(p)
    for p in [
        r"(?i)rate",
        r"(?i).*per[\s_](second|ms|ns)",
    ]
]

# Pre-compiled regexes to match metrics that measure _cost_: smaller is better.
COST_METRIC_PATTERNS = [
    re.compile(p)
    for p in [
        r"(?i)cycles",
        r"(?i)instructions",
        r"(?i)time",
    ]
]

# Theme for use with the Rich `Console` printing.
THEME = Theme(
    {
        "base_median": "cyan",
        "exp_median": "magenta",
        "base_conf": "cyan",
        "exp_conf": "magenta",
        "slower": "bright_red",
        "faster": "bright_green",
    }
)

# The set of benchmark keys we ignore in the JSON data structure. Most of these
# are incidental, but a few are more surprising. See comments on specific
# entries for details.
IGNORED_BENCHMARK_KEYS = set(
    [
        "name",
        "family_index",
        "per_family_instance_index",
        "run_name",
        "run_type",
        "repetitions",
        "repetition_index",
        "threads",
        # We don't render `iterations` because we instead directly compute
        # statistical error bars using the multiple runs. This removes the
        # need for manually considering the iteration count.
        "iterations",
        # We ignore the time and time unit metrics here because we directly
        # access and special case these metrics in order to apply the unit to
        # the times.
        "real_time",
        "cpu_time",
        "time_unit",
    ]
)

class DeltaKind(Enum):
    """Models the relevant kinds of deltas that we end up wanting to render."""

    IMPROVEMENT = "[faster]👍[/faster]"
    NEUTRAL = "~"
    REGRESSION = "[slower]👎[/slower]"
    NOISE = ""

    def __str__(self) -> str:
        return self.value

@dataclass
class RenderedDelta:
    """Rendered delta and p-value for some metric."""

    kind: DeltaKind
    delta: str
    pvalue: str


@dataclass
class RenderedMetric:
    """Rendered non-delta metric and its confidence interval."""

    median: str
    conf: str


@dataclass
class BenchmarkRunMetrics:
    """The main data class used to collect metrics for benchmark runs.

    The data is read in using a JSON format that isn't organized in a
    convenient way to analyze and render, so we re-organize it into this data
    class and use that for analysis.

    Each object of this class corresponds to a specific named benchmark.
    """

    # The main metrics for this named benchmark, or the "experiment". This
    # field is always populated.
    exp: list[Quantity] = field(default_factory=lambda: [])

    # The metrics for this named benchmark in the base execution. May be empty
    # if no base execution was provided to compute a delta against.
    base: list[Quantity] = field(default_factory=lambda: [])

    # Any comparable benchmark metrics, indexed by the tag name to use when
    # rendering the comparison. May be empty if there are no comparable
    # benchmarks for the main one this represents.
    comps: defaultdict[str, list[Quantity]] = field(
        default_factory=lambda: defaultdict(list)
    )

@dataclass
class ComparableBenchmarkMapping:
    """Organizes any comparable benchmarks.

    Constructed with the list of benchmark names and regexes that describe
    comparable name structures.

    Names that match one of these regexes are organized into the main name in
    `main_benchmark_names`, and the comparable names in various mappings to
    allow computing comparison metrics between the main and comparable names.

    Names that don't match any of the regexes are just directly included in
    `main_benchmark_names`.
    """

    # Names that are considered "main" benchmarks after filtering.
    main_benchmark_names: list[str]

    # Maps a comparison benchmark name to its base name (tag removed).
    name_to_base: dict[str, str]

    # Maps a base name to its main benchmark name.
    base_to_main_name: dict[str, str]

    # Maps a comparison benchmark name to its tag.
    name_to_comp_tag: dict[str, str]

    # Maps a main benchmark name to a list of its comparison tags.
    main_name_to_comp_tags: dict[str, list[str]]

    def __init__(
        self,
        original_benchmark_names: list[str],
        comparable_re_strs: list[str],
        console: Console,
    ):
        """Identify main and comparable benchmarks."""
        self.main_benchmark_names = []
        self.name_to_base = {}
        self.base_to_main_name = {}
        self.name_to_comp_tag = {}
        self.main_name_to_comp_tags = {}

        comp_res = [
            re.compile(comparable_re_str)
            for comparable_re_str in comparable_re_strs
        ]
        for comp_re in comp_res:
            if "main" not in comp_re.groupindex:
                console.log(
                    "ERROR: No main capture group in the "
                    "`--benchmark_comparable_re` flag!"
                )
                sys.exit(1)

        for name in original_benchmark_names:
            comp_match = next(
                (m for comp_re in comp_res if (m := comp_re.search(name))),
                None,
            )
            if not comp_match:
                # Non-comparable benchmark.
                self.main_benchmark_names.append(name)
                continue

            tag_group = 0
            if "tag" in comp_match.re.groupindex:
                tag_group = comp_match.re.groupindex["tag"]
            tag = comp_match.group(tag_group)
            tag_begin, tag_end = comp_match.span(tag_group)
            base_name = name[:tag_begin] + name[tag_end:]
            self.name_to_base[name] = base_name
            if comp_match.group("main"):
                self.base_to_main_name[base_name] = name
                self.main_benchmark_names.append(name)
            else:
                self.name_to_comp_tag[name] = tag

        # Verify that for all the comparable benchmarks we actually found a
        # main benchmark name. We can't do this while processing initially as
        # we don't know the relative order of main and comparable benchmark
        # names.
        #
        # Also collect a list of all the comparison tags for a given main name.
        for comp, comp_tag in self.name_to_comp_tag.items():
            base_name = self.name_to_base[comp]
            # Use `.get` so a missing main benchmark produces the error below
            # rather than a raw `KeyError`.
            main_name = self.base_to_main_name.get(base_name)
            if not main_name:
                console.log(
                    f"ERROR: Comparable benchmark `{comp}` has no "
                    "corresponding main benchmark name!"
                )
                sys.exit(1)
            if comp_tag in self.main_name_to_comp_tags.get(main_name, []):
                console.log(
                    f"ERROR: Duplicate comparison tag `{comp_tag}` for main "
                    f"benchmark `{main_name}`!"
                )
                sys.exit(1)
            self.main_name_to_comp_tags.setdefault(main_name, []).append(
                comp_tag
            )
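
# Illustration (the benchmark names here are assumed for the example): with the
# regex `(?P<tag>(?P<main>Carbon)|Abseil|LLVM)HashBench` from the flag
# documentation and benchmarks named `CarbonHashBench`, `AbseilHashBench`, and
# `LLVMHashBench`, all three share the base name `HashBench`.
# `CarbonHashBench` becomes the main benchmark, and the other two are recorded
# under the tags `Abseil` and `LLVM` to be rendered as comparisons against it.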

def float_ratio(nom: float, denom: float) -> float:
    """Translate a ratio of floats into a float, handling divide by zero."""
    if denom != 0.0:
        return nom / denom
    elif nom > 0.0:
        return math.inf
    elif nom < 0.0:
        return -math.inf
    else:
        return 0.0


def render_fixed_width_float(x: float) -> str:
    """Renders a floating point value into a fixed width string."""
    if math.isinf(x):
        return f"{x:>4f}{'':<3}"
    frac, whole = math.modf(x)
    frac_str = f"{math.fabs(frac):<4.3f}"[1:]
    return f"{int(whole):> 3}{frac_str}"

def render_ratio(ratio: float) -> str:
    """Renders a ratio into a human-friendly string form.

    This uses a % for ratios with a magnitude of at most 1.0. Ratios with a
    larger magnitude are rendered as a fixed width floating point number with
    an `x` suffix.
    """
    if ratio > 1.0 or ratio < -1.0:
        return f"{render_fixed_width_float(ratio)}x"
    else:
        return f"{render_fixed_width_float(ratio * 100.0)}%"
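
# For example, `render_ratio(0.042)` renders as "  4.200%" while
# `render_ratio(2.5)` renders as "  2.500x", keeping the decimal point in a
# fixed column in both forms.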

def render_metric(
    alpha: float, times: list[Quantity], is_base: bool
) -> RenderedMetric:
    """Render a non-delta metric.

    Computes the string to use for both the metric itself and the string to
    show the confidence interval for that metric.

    Args:
        alpha: The alpha value to use for the confidence interval.
        times: The list of measurements.
        is_base:
            Whether to use the "baseline" or "experiment" theme in the
            rendered strings.
    """
    if is_base:
        style_prefix = "base_"
    else:
        style_prefix = "exp_"

    units = times[0].units
    if all(x == times[0] for x in times):
        with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
            return RenderedMetric(
                f"[{style_prefix}median]{times[0]:.3}[/{style_prefix}median]",
                "",
            )

    median = Quantity(np.median(times), units=units)
    median_test = sp.stats.quantile_test(times, q=median)
    median_ci = median_test.confidence_interval(confidence_level=(1.0 - alpha))
    ci_str = "?"
    if not math.isnan(median_ci.low) and not math.isnan(median_ci.high):
        low_delta = median - median_ci.low
        high_delta = median_ci.high - median
        assert low_delta >= 0.0 and high_delta >= 0.0
        delta = max(low_delta, high_delta)
        ci_str = render_ratio(float_ratio(delta, median))
    with Quantity.prefs(number_fmt="{whole:>3}{frac:<4} {units}"):
        return RenderedMetric(
            f"[{style_prefix}median]{median:.3}[/{style_prefix}median]",
            f"[{style_prefix}conf]{ci_str:9}[/{style_prefix}conf]",
        )

def render_delta(
    metric: str, alpha: float, base: list[Quantity], exp: list[Quantity]
) -> RenderedDelta:
    """Render a delta metric.

    This handles computing the delta, its statistical significance, and
    whether that delta is an improvement or a regression based on the specific
    metric name.

    Args:
        metric:
            The name of the metric, to guide whether bigger or smaller is an
            improvement.
        alpha: The alpha value to use for the confidence interval.
        base: The baseline measurements.
        exp: The experiment measurements.
    """
    # Skip any delta when all the data is zero. This typically occurs for
    # uninteresting metrics or metrics that weren't collected for a given run.
    if all(b == 0 for b in base) and all(e == 0 for e in exp):
        return RenderedDelta(DeltaKind.NEUTRAL, "", "")

    if any(speed_pat.search(metric) for speed_pat in SPEED_METRIC_PATTERNS):
        bigger_style = "faster"
        smaller_style = "slower"
        bigger_kind = DeltaKind.IMPROVEMENT
        smaller_kind = DeltaKind.REGRESSION
    elif any(cost_pat.search(metric) for cost_pat in COST_METRIC_PATTERNS):
        bigger_style = "slower"
        smaller_style = "faster"
        bigger_kind = DeltaKind.REGRESSION
        smaller_kind = DeltaKind.IMPROVEMENT
    else:
        return RenderedDelta(DeltaKind.NEUTRAL, "", "")

    u_test = sp.stats.mannwhitneyu(base, exp)
    if u_test.pvalue >= alpha:
        return RenderedDelta(
            DeltaKind.NOISE, " ?? ", f"p={u_test.pvalue:.3}"
        )

    kind = DeltaKind.NEUTRAL
    base_median = np.median(base)
    exp_median = np.median(exp)
    exp_ratio = float_ratio(exp_median, base_median)
    # TODO: Maybe the threshold of "interesting" should be configurable instead
    # of being fixed at 0.1%.
    if exp_ratio >= 1.001:
        style = bigger_style
        kind = bigger_kind
    elif exp_ratio <= 0.999:
        style = smaller_style
        kind = smaller_kind
    else:
        style = "default"

    if exp_ratio >= 2.0 or exp_ratio <= 0.5:
        return RenderedDelta(
            kind,
            f"[{style}]{render_fixed_width_float(exp_ratio)}x[/{style}]",
            f"p={u_test.pvalue:.3}",
        )

    # Use a percent-delta for smaller ratios to make the delta more easily
    # understood by readers.
    exp_delta_percent = (
        float_ratio(exp_median - base_median, base_median) * 100.0
    )
    return RenderedDelta(
        kind,
        f"[{style}]{render_fixed_width_float(exp_delta_percent)}%[/{style}]",
        f"p={u_test.pvalue:.3}",
    )

def render_metric_column(
    metric: str,
    alpha: float,
    runs: list[BenchmarkRunMetrics],
) -> Table:
    """Render the column of the benchmark results table for a given metric.

    We render a single column for each metric, and use a careful line-oriented
    layout within the column to ensure "rows" line up for each individual
    benchmark. Within the column, we use a nested table to lay out the
    different rendered strings.

    A key goal of the rendering throughout is to arrange for rendered numbers
    to have the decimal point in a consistent column, so that readers can
    easily identify the position of the decimal point and the magnitude of the
    number rendered.

    Args:
        metric: The name of the metric to render.
        alpha: The alpha value to use for the confidence interval.
        runs: The list of benchmark runs.
    """
    t = Table.grid(
        Column(),
        # It might seem like we want the left column here to be right-aligned,
        # but we're going to carefully align the digits in the format string,
        # and we can't easily control the length of units. So we left-align to
        # simplify the digit layout.
        Column(justify="left"),
        Column(justify="center"),
        Column(justify="left"),
        padding=(0, 1),
    )
    for run in runs:
        if len(run.base) != 0:
            # We have a baseline run to compare against, so compute the delta
            # between it and the experiment as well as the specific baseline
            # run metric.
            rendered_delta = render_delta(metric, alpha, run.base, run.exp)
            rendered_base = render_metric(alpha, run.base, is_base=True)

            # Add the delta as the first row, then the baseline metric.
            t.add_row(
                str(rendered_delta.kind),
                rendered_delta.delta,
                "",
                rendered_delta.pvalue,
            )
            t.add_row("", rendered_base.median, "±", rendered_base.conf)

        # Now render the experiment metric and add its row.
        rendered_exp = render_metric(alpha, run.exp, is_base=False)
        t.add_row("", rendered_exp.median, "±", rendered_exp.conf)

        # If we have any comparable benchmarks, render each of them as first a
        # delta and then the specific comparable metric as its own kind of
        # baseline.
        #
        # TODO: At some point when we support combining baseline _runs_ with
        # comparable metrics, we'll need to change this to render both baseline
        # and experiment comparables and a delta-of-delta. But currently we
        # don't support combining these, which simplifies the rendering here.
        for _tag, comp in sorted(run.comps.items()):
            rendered_delta = render_delta(metric, alpha, comp, run.exp)
            t.add_row(
                str(rendered_delta.kind),
                rendered_delta.delta,
                "",
                rendered_delta.pvalue,
            )
            rendered_comp = render_metric(alpha, comp, is_base=True)
            t.add_row("", rendered_comp.median, "±", rendered_comp.conf)

        # Lastly, if we had a baseline run or any comparable metrics we will
        # have rendered multiple lines of data. Add a blank line so that these
        # form a visual group.
        if len(run.base) != 0 or len(run.comps) != 0:
            t.add_row()
    return t

def run_benchmark_binary(
    binary_path: Path,
    common_args: list[str],
    specific_args: list[str],
    num_runs: int,
    console: Console,
) -> list[dict]:
    """Runs a benchmark binary multiple times and collects results.

    The results are parsed out of the JSON output from each run, and returned
    as a list of dictionaries. Each dictionary represents one run.

    This will log the command being run, show a progress bar for each run
    performed, and then log de-duplicated `stderr` output from the runs.
    """
    # If the binary path has no directory components and exists as a relative
    # file, add `./` as a prefix. Otherwise, we want to pass the name unchanged
    # for `PATH` search.
    binary_str = str(binary_path)
    if len(binary_path.parts) == 1 and binary_path.exists():
        binary_str = f"./{binary_str}"

    run_cmd = (
        [binary_str]
        + DEFAULT_BENCHMARK_ARGS
        + common_args
        + specific_args
        # Pass the format flag last as it is required and can't be overridden.
        + ["--benchmark_format=json"]
    )
    console.log(f"Executing: {' '.join(run_cmd)}")
    runs_data = []
    unique_stderr: list[bytes] = []
    for _ in track(
        range(num_runs), description=f"Running {binary_path.name}..."
    ):
        p = subprocess.run(
            run_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        runs_data.append(json.loads(p.stdout))
        stderr = p.stderr.strip()
        if len(stderr) != 0 and stderr not in unique_stderr:
            unique_stderr.append(stderr)
    for stderr_output in unique_stderr:
        # Decode stderr, replacing errors in case of non-UTF-8 characters.
        console.log(
            f"{binary_path.name} stderr:\n"
            f"{stderr_output.decode('utf-8', errors='replace')}"
        )
    return runs_data

def print_run_context(
    console: Console,
    num_runs: int,
    exp_runs: list[dict],
    has_baseline: bool,
) -> None:
    """Prints the context from the benchmark runs.

    This replicates the useful context information from Google Benchmark's
    default output, such as CPU information and cache sizes.

    TODO: Print differently when the contexts of the base and experiment runs
    differ.

    Args:
        console: The rich console to print to.
        num_runs: The number of times the benchmarks were run.
        exp_runs: The results from the experiment benchmark runs.
        has_baseline: Whether a baseline benchmark was also run.
    """
    if has_baseline:
        runs_description = f"Ran baseline and experiment {num_runs} times"
    else:
        runs_description = f"Ran {num_runs} times"
    context = exp_runs[0]["context"]
    console.print(
        f"{runs_description} on "
        f"{context['num_cpus']} x {context['mhz_per_cpu']} MHz CPUs"
    )
    console.print("CPU caches:")
    for cache in context["caches"]:
        size = Quantity(cache["size"], binary=True)
        console.print(f"  L{cache['level']} {cache['type']} {size:b}")
    console.print(
        f"Load avg: {' '.join([str(load) for load in context['load_avg']])}"
    )

def get_benchmark_names_and_metrics(
    console: Console,
    parsed_args: argparse.Namespace,
    exp_runs: list[dict],
    base_runs: list[dict],
) -> tuple[list[str], list[str]]:
    """Extracts benchmark names and metrics from benchmark run results.

    This function determines the list of unique benchmark names and the
    metrics to be displayed based on the benchmark output and command-line
    arguments.

    Args:
        console: The rich console to print errors to.
        parsed_args: The parsed command-line arguments.
        exp_runs: A list of benchmark run results for the experiment binary.
        base_runs: A list of benchmark run results for the baseline binary.

    Returns:
        - The list of unique benchmark names, maintaining their order.
        - The list of metrics to display.
    """
    # Start with the base time and iteration metrics requested.
    metrics: list[str] = []
    if parsed_args.wall_time:
        metrics.append("real_time")
    else:
        metrics.append("cpu_time")
    if parsed_args.show_iterations:
        metrics.append("iterations")

    # Compile a regex for filtering extra metrics, if provided.
    if metrics_filter_str := parsed_args.extra_metrics_filter:
        metrics_filter = re.compile(metrics_filter_str)
    else:
        metrics_filter = None

    # We only need to inspect the first run to find all benchmark and metric
    # names. We combine benchmarks from both experiment and baseline runs to
    # get a complete set. Copy the list so that extending it doesn't mutate
    # the experiment run data itself.
    one_run_benchmarks = list(exp_runs[0]["benchmarks"])
    if parsed_args.base_benchmark:
        one_run_benchmarks += base_runs[0]["benchmarks"]
    benchmark_name_set: set[str] = set()
    benchmark_name_indices: dict[str, tuple[int, int]] = {}
    for benchmark in one_run_benchmarks:
        name = benchmark["name"]
        benchmark_name_set.add(name)
        indices = (
            benchmark["family_index"],
            benchmark["per_family_instance_index"],
        )
        if name not in benchmark_name_indices:
            benchmark_name_indices[name] = indices
        elif benchmark_name_indices[name] != indices:
            console.print(
                f"ERROR: Inconsistent indices {indices} and "
                f"{benchmark_name_indices[name]} for benchmark `{name}`."
            )
            sys.exit(1)

        # Add any extra metrics from this benchmark.
        for key in benchmark.keys():
            if key in metrics or key in IGNORED_BENCHMARK_KEYS:
                continue
            if metrics_filter and not re.search(metrics_filter, key):
                continue
            metrics.append(key)

    benchmark_names = sorted(
        list(benchmark_name_set), key=lambda name: benchmark_name_indices[name]
    )
    return benchmark_names, metrics

def collect_benchmark_metrics(
    benchmark_names: list[str],
    metrics: list[str],
    exp_runs: list[dict],
    base_runs: list[dict],
    comp_mapping: ComparableBenchmarkMapping,
) -> dict[str, dict[str, BenchmarkRunMetrics]]:
    """Collects and organizes all benchmark metrics from raw run data.

    This function takes the raw benchmark run data and organizes it into a
    structured format suitable for analysis and rendering. It initializes the
    main data structure, handles the mapping of comparable benchmarks, and
    populates the metrics for both experiment and baseline runs.

    Args:
        benchmark_names: The initial list of unique benchmark names.
        metrics: A list of all metric names to be collected.
        exp_runs: A list of benchmark run results for the experiment binary.
        base_runs: A list of benchmark run results for the baseline binary.
        comp_mapping: The mapping of comparable benchmarks.

    Returns:
        A dictionary where keys are metric names. The values are another
        dictionary where keys are benchmark names and values are
        `BenchmarkRunMetrics` objects containing the collected measurements.
    """
    # Initialize the data structure to hold all collected metrics.
    benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]] = {
        metric: {name: BenchmarkRunMetrics() for name in benchmark_names}
        for metric in metrics
    }

    # Populate metrics from the experiment runs.
    for run in exp_runs:
        for b in run["benchmarks"]:
            name = b["name"]
            for metric in metrics:
                # Time metrics have a `time_unit` field that needs to be
                # appended for correct parsing by the Quantity library.
                unit = b.get("time_unit", "") if "time" in metric else ""
                # If this is a comparable benchmark, add its metrics to the
                # `comps` list of its corresponding main benchmark.
                if maybe_comp_tag := comp_mapping.name_to_comp_tag.get(name):
                    main_name = comp_mapping.base_to_main_name[
                        comp_mapping.name_to_base[name]
                    ]
                    benchmark_metrics[metric][main_name].comps[
                        maybe_comp_tag
                    ].append(Quantity(f"{b[metric]}{unit}"))
                # Otherwise, add it to the `exp` list of its own entry if it's
                # a main benchmark.
                elif name in benchmark_names:
                    benchmark_metrics[metric][name].exp.append(
                        Quantity(f"{b[metric]}{unit}")
                    )

    # Populate metrics from the baseline runs.
    for run in base_runs:
        for b in run["benchmarks"]:
            name = b["name"]
            # Baseline runs don't have comparable benchmarks, so we only need
            # to populate the `base` list for main benchmarks.
            if name in benchmark_names:
                for metric in metrics:
                    unit = b.get("time_unit", "") if "time" in metric else ""
                    benchmark_metrics[metric][name].base.append(
                        Quantity(f"{b[metric]}{unit}")
                    )
    return benchmark_metrics

def print_metric_key(
    console: Console,
    alpha: float,
    has_baseline: bool,
    comp_mapping: ComparableBenchmarkMapping,
) -> None:
    """Prints a legend for the metrics table.

    This explains the format of the output table, including what the delta,
    median, and confidence interval values represent.

    Args:
        console: The rich console to print to.
        alpha: The alpha value for statistical significance.
        has_baseline: Whether a baseline benchmark was run.
        comp_mapping: The mapping of comparable benchmarks.
    """
    console.print("Metric key:")
    conf = int(100 * (1.0 - alpha))
    name = "BenchmarkName..."
    delta_icon = str(DeltaKind.IMPROVEMENT)
    delta = "[faster]<delta>[/faster]"
    p = "p=<U-test P-value>"
    base_median = "[base_median]<median>[/base_median]"
    base_conf = f"[base_conf]<% at {conf}th conf>[/base_conf]"
    exp_median = "[exp_median]<median>[/exp_median]"
    exp_conf = f"[exp_conf]<% at {conf}th conf>[/exp_conf]"
    key_table = Table.grid(
        Column(justify="right"),
        Column(),
        Column(),
        Column(),
        Column(),
        padding=(0, 1),
    )
    if has_baseline:
        key_table.add_row(name, delta_icon, delta, "", p)
        key_table.add_row("baseline:", "", base_median, "±", base_conf)
        key_table.add_row("experiment:", "", exp_median, "±", exp_conf)
    else:
        key_table.add_row(name, "", exp_median, "±", exp_conf)

    # Only display the comparable key if we have comparables to display.
    if comp_mapping.name_to_comp_tag:
        key_table.add_row("vs Comparable:", delta_icon, delta, p)
        key_table.add_row("", "", base_median, "±", base_conf)

    console.print(Padding(key_table, (0, 0, 1, 3)))

def print_results_table(
    console: Console,
    alpha: float,
    has_baseline: bool,
    metrics: list[str],
    benchmark_names: list[str],
    benchmark_metrics: dict[str, dict[str, BenchmarkRunMetrics]],
    comp_mapping: ComparableBenchmarkMapping,
) -> None:
    """Builds and prints the main results table.

    This function constructs a rich `Table` to display the benchmark results,
    including deltas, medians, and confidence intervals for each metric. It
    then prints this to the provided `console`.

    Args:
        console: The rich console to print to.
        alpha: The alpha value for statistical significance.
        has_baseline: Whether a baseline benchmark was run.
        metrics: A list of metric names to be displayed as columns.
        benchmark_names: A list of main benchmark names for the rows.
        benchmark_metrics: A nested dictionary containing the collected
            metrics for each benchmark and metric.
        comp_mapping: The mapping of comparable benchmarks.
    """
    METRIC_TITLES = {
        "real_time": "Wall Time",
        "cpu_time": "CPU Time",
        "iterations": "Iterations",
    }
    name_width = max(
        len(name)
        for name in (
            benchmark_names
            + [
                f"vs {tag}:"
                for tag in comp_mapping.name_to_comp_tag.values()
            ]
            + ["experiment:"]
        )
    )

    table = Table(show_edge=False)
    # We want the benchmark name column justified right for the sub-labels,
    # but we fill each name out to the full column width so the names
    # themselves visually justify left; so force the heading to justify left,
    # unlike the column text. We also disable wrapping because we manually
    # fill the column and require line-precise layout.
    table.add_column(
        Text("Benchmark", justify="left"), justify="right", no_wrap=True
    )
    for metric in metrics:
        title = Text(METRIC_TITLES.get(metric, metric), justify="center")
        table.add_column(title, justify="left", no_wrap=True)

    name_t = Table.grid(Column(justify="right", no_wrap=True), expand=True)
    for name in benchmark_names:
        name_t.add_row(f"{name}{'.' * (name_width - len(name))}")
        if has_baseline:
            name_t.add_row("baseline:")
            name_t.add_row("experiment:")
            name_t.add_row()
        elif comp_tags := comp_mapping.main_name_to_comp_tags.get(name):
            for tag in comp_tags:
                name_t.add_row(f"vs {tag}:")
                name_t.add_row()
            name_t.add_row()
    row = [name_t]
    for metric in metrics:
        metric_runs = benchmark_metrics[metric]
        row.append(
            render_metric_column(
                metric, alpha, [metric_runs[name] for name in benchmark_names]
            )
        )
    table.add_row(*row)
    console.print(table)

def main() -> None:
    parsed_args = parse_args()
    console = Console(theme=THEME)
    Quantity.set_prefs(spacer=" ", map_sf=Quantity.map_sf_to_greek)
    if parsed_args.base_benchmark and parsed_args.benchmark_comparable_re:
        console.print(
            "ERROR: Cannot mix a base benchmark binary with benchmark "
            "comparisons."
        )
        sys.exit(1)

    # Run the benchmark(s) and collect the results into a data structure for
    # processing.
    num_runs = parsed_args.runs
    base_runs: list[dict] = []
    has_baseline = bool(parsed_args.base_benchmark)
    if has_baseline:
        base_runs = run_benchmark_binary(
            parsed_args.base_benchmark,
            parsed_args.benchmark_args,
            parsed_args.base_benchmark_args,
            num_runs,
            console,
        )
    exp_runs = run_benchmark_binary(
        parsed_args.exp_benchmark,
        parsed_args.benchmark_args,
        parsed_args.exp_benchmark_args,
        num_runs,
        console,
    )

    # If JSON output is requested, just dump the data without further
    # processing.
    if parsed_args.output == "json":
        console.log("Printing JSON results...")
        console.print_json(json.dumps(exp_runs))
        if has_baseline:
            console.print_json(json.dumps(base_runs))
        return

    print_run_context(console, num_runs, exp_runs, has_baseline)

    # Collect the benchmark names and metric names.
    benchmark_names, metrics = get_benchmark_names_and_metrics(
        console, parsed_args, exp_runs, base_runs
    )

    # Build any mappings between main benchmark names and comparables, and
    # reset our benchmark names to the main ones.
    comp_mapping = ComparableBenchmarkMapping(
        benchmark_names, parsed_args.benchmark_comparable_re, console
    )
    benchmark_names = comp_mapping.main_benchmark_names

    # Collect and organize the actual benchmark metrics from the raw JSON
    # structures across the runs. This pivots the data into an easy-to-analyze
    # and easy-to-render structure, but doesn't do the analysis itself.
    benchmark_metrics = collect_benchmark_metrics(
        benchmark_names, metrics, exp_runs, base_runs, comp_mapping
    )

    # Analyze and render a readable table of the collected metrics. This is
    # where we do statistical analysis and render confidence intervals,
    # significance, and other helpful indicators based on the collected data.
    # We also print a relevant key for reading and interpreting the rendered
    # data.
    alpha = parsed_args.alpha
    console.print(
        "Computing statistically significant deltas only where "
        f"the P-value < 𝛂 of {alpha}"
    )
    print_metric_key(console, alpha, has_baseline, comp_mapping)
    print_results_table(
        console,
        alpha,
        has_baseline,
        metrics,
        benchmark_names,
        benchmark_metrics,
        comp_mapping,
    )


if __name__ == "__main__":
    main()