1 год назад · a9c815c9f4
--- a/common/BUILD
+++ b/common/BUILD
@@ -38,7 +38,10 @@ cc_library(
 
				 cc_library(
			
 
				     name = "benchmark_main",
			
 
				     srcs = ["benchmark_main.cpp"],
			
 
				+    hdrs = ["benchmark_main.h"],
			
 
				     deps = [
			
 
				+        ":check",
			
 
				+        ":exe_path",
			
 
				         ":init_llvm",
			
 
				         "@abseil-cpp//absl/flags:parse",
			
 
				         "@google_benchmark//:benchmark",
			
--- a/common/benchmark_main.cpp
+++ b/common/benchmark_main.cpp
@@ -2,17 +2,41 @@
 
				 // Exceptions. See /LICENSE for license information.
			
 
				 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				 
			
 
				+#include "common/benchmark_main.h"
			
 
				+
			
 
				 #include <benchmark/benchmark.h>
			
 
				 
			
 
				+#include <string>
			
 
				+
			
 
				 #include "absl/flags/parse.h"
			
 
				+#include "common/check.h"
			
 
				+#include "common/exe_path.h"
			
 
				 #include "common/init_llvm.h"
			
 
				 #include "llvm/ADT/ArrayRef.h"
			
 
				 #include "llvm/ADT/StringRef.h"
			
 
				 
			
 
				// Set to true by `main` once it has recorded the executable path below.
				static bool after_main = false;
				// View of the executable path computed in `main`. The backing string is a
				// local variable of `main`, so this view is only valid while `main` runs.
				static llvm::StringRef exe_path;

				namespace Carbon::Testing {

				// Returns the executable path that `main` recorded for this benchmark binary.
				//
				// Check-fails when called before `main` has been entered, because the path
				// has not been computed yet at that point.
				auto GetBenchmarkExePath() -> llvm::StringRef {
				  CARBON_CHECK(after_main)
				      << "Must not query the executable path until after `main` is entered!";
				  return exe_path;
				}

				}  // namespace Carbon::Testing
			
 
				+
			
 
				+// TODO: Refactor this to share code with `gtest_main.cpp`.
			
 
				 auto main(int orig_argc, char** orig_argv) -> int {
			
 
				   // Do LLVM's initialization first, this will also transform UTF-16 to UTF-8.
			
 
				   Carbon::InitLLVM init_llvm(orig_argc, orig_argv);
			
 
				 
			
 
				+  std::string exe_path_storage = Carbon::FindExecutablePath(orig_argv[0]);
			
 
				+  exe_path = exe_path_storage;
			
 
				+  after_main = true;
			
 
				+
			
 
				   // Inject a flag to override the defaults for benchmarks. This can still be
			
 
				   // disabled by user arguments.
			
 
				   llvm::SmallVector<char*> injected_argv_storage(orig_argv,
			
--- a/common/benchmark_main.h
+++ b/common/benchmark_main.h
@@ -0,0 +1,22 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#ifndef CARBON_COMMON_BENCHMARK_MAIN_H_
			
 
				+#define CARBON_COMMON_BENCHMARK_MAIN_H_
			
 
				+
			
 
				+#include "llvm/ADT/StringRef.h"
			
 
				+
			
 
				+// When using the Carbon `main` function for benchmarks, we export some extra
			
 
				+// information about the test binary that can be accessed with this header.
			
 
				+//
			
 
				+// TODO: Refactor this to share code with `gtest_main.h`.
			
 
				+
			
 
				+namespace Carbon::Testing {
			
 
				+
			
 
				+// Returns the executable path of the benchmark binary. Must only be called
				+// after `main` has been entered; calling it earlier fails a `CARBON_CHECK`.
			
 
				+auto GetBenchmarkExePath() -> llvm::StringRef;
			
 
				+
			
 
				+}  // namespace Carbon::Testing
			
 
				+
			
 
				+#endif  // CARBON_COMMON_BENCHMARK_MAIN_H_
			
--- a/testing/base/BUILD
+++ b/testing/base/BUILD
@@ -5,7 +5,7 @@
 
				 # Trivial, single-file testing libraries. More complex libraries should get
			
 
				 # their own directory.
			
 
				 
			
 
				-load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
			
 
				+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
			
 
				 
			
 
				 package(default_visibility = ["//visibility:public"])
			
 
				 
			
@@ -40,6 +40,49 @@ cc_test(
 
				     ],
			
 
				 )
			
 
				 
			
 
				# Library that generates synthetic source code, built from `source_gen.cpp`
				# and exposing `source_gen.h`. Marked `testonly`.
				cc_library(
				    name = "source_gen_lib",
				    testonly = 1,
				    srcs = ["source_gen.cpp"],
				    hdrs = ["source_gen.h"],
				    deps = [
				        "//common:check",
				        "//common:map",
				        "//common:set",
				        "//toolchain/lex:token_kind",
				        "@abseil-cpp//absl/random",
				        "@llvm-project//llvm:Support",
				    ],
				)
			
 
				+
			
 
				# Test target built from `source_gen_test.cpp` against `:source_gen_lib`,
				# using the shared `:gtest_main` entry point.
				cc_test(
				    name = "source_gen_test",
				    size = "small",
				    srcs = ["source_gen_test.cpp"],
				    deps = [
				        ":gtest_main",
				        ":source_gen_lib",
				        "//common:set",
				        "//toolchain/driver",
				        "@googletest//:gtest",
				        "@llvm-project//llvm:Support",
				    ],
				)
			
 
				+
			
 
				# Executable built from `source_gen_main.cpp` on top of `:source_gen_lib`.
				# Marked `testonly` like the library it wraps.
				cc_binary(
				    name = "source_gen",
				    testonly = 1,
				    srcs = ["source_gen_main.cpp"],
				    deps = [
				        ":source_gen_lib",
				        "//common:bazel_working_dir",
				        "//common:command_line",
				        "//common:init_llvm",
				        "//common:ostream",
				        "@llvm-project//llvm:Support",
				    ],
				)
			
 
				+
			
 
				 cc_library(
			
 
				     name = "test_raw_ostream",
			
 
				     testonly = 1,
			
--- a/testing/base/source_gen.cpp
+++ b/testing/base/source_gen.cpp
@@ -0,0 +1,695 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "testing/base/source_gen.h"
			
 
				+
			
 
				+#include <numeric>
			
 
				+
			
 
				+#include "llvm/ADT/ArrayRef.h"
			
 
				+#include "llvm/ADT/Sequence.h"
			
 
				+#include "llvm/ADT/SmallVector.h"
			
 
				+#include "llvm/ADT/StringExtras.h"
			
 
				+#include "llvm/Support/FormatVariadic.h"
			
 
				+#include "toolchain/lex/token_kind.h"
			
 
				+
			
 
				+namespace Carbon::Testing {
			
 
				+
			
 
				+auto SourceGen::Global() -> SourceGen& {
			
 
				+  static SourceGen global_gen;
			
 
				+  return global_gen;
			
 
				+}
			
 
				+
			
 
				+SourceGen::SourceGen(Language language) : language_(language) {}
			
 
				+
			
 
				// Formatting heuristics for generated code. They are tuned loosely for what
				// should keep Carbon code readable and may fit C++ less well, but the same
				// numbers are used for every language to keep things simple and to keep the
				// output of different languages directly comparable.

				// Functions with at most this many parameters stay on a single line.
				static constexpr int NumSingleLineFunctionParams = 3;
				// Methods with at most this many parameters stay on a single line.
				static constexpr int NumSingleLineMethodParams = 2;
				// Once parameters are wrapped, at most this many are placed on each line.
				static constexpr int MaxParamsPerLine = 4;
			
 
				+
			
 
				+static auto EstimateAvgFunctionDeclLines(SourceGen::FunctionDeclParams params)
			
 
				+    -> double {
			
 
				+  // Currently model a uniform distribution [0, max] parameters. Assume a line
			
 
				+  // break before the first parameter for >3 and after every 4th.
			
 
				+  int param_lines = 0;
			
 
				+  for (int num_params : llvm::seq_inclusive(0, params.max_params)) {
			
 
				+    if (num_params > NumSingleLineFunctionParams) {
			
 
				+      param_lines += (num_params + MaxParamsPerLine - 1) / MaxParamsPerLine;
			
 
				+    }
			
 
				+  }
			
 
				+  return 1.0 + static_cast<double>(param_lines) / (params.max_params + 1);
			
 
				+}
			
 
				+
			
 
				+static auto EstimateAvgMethodDeclLines(SourceGen::MethodDeclParams params)
			
 
				+    -> double {
			
 
				+  // Currently model a uniform distribution [0, max] parameters. Assume a line
			
 
				+  // break before the first parameter for >2 and after every 4th.
			
 
				+  int param_lines = 0;
			
 
				+  for (int num_params : llvm::seq_inclusive(0, params.max_params)) {
			
 
				+    if (num_params > NumSingleLineMethodParams) {
			
 
				+      param_lines += (num_params + MaxParamsPerLine - 1) / MaxParamsPerLine;
			
 
				+    }
			
 
				+  }
			
 
				+  return 1.0 + static_cast<double>(param_lines) / (params.max_params + 1);
			
 
				+}
			
 
				+
			
 
				+// Note that this should match the heuristics used when formatting.
			
 
				+// TODO: See top-level TODO about line estimates and formatting.
			
 
				+static auto EstimateAvgClassDefLines(SourceGen::ClassParams params) -> double {
			
 
				+  // Comment line, and class open line.
			
 
				+  double avg = 2.0;
			
 
				+
			
 
				+  // One comment line and blank line per function, plus the function lines.
			
 
				+  avg +=
			
 
				+      (2.0 + EstimateAvgFunctionDeclLines(params.public_function_decl_params)) *
			
 
				+      params.public_function_decls;
			
 
				+  avg += (2.0 + EstimateAvgMethodDeclLines(params.public_method_decl_params)) *
			
 
				+         params.public_method_decls;
			
 
				+  avg += (2.0 +
			
 
				+          EstimateAvgFunctionDeclLines(params.private_function_decl_params)) *
			
 
				+         params.private_function_decls;
			
 
				+  avg += (2.0 + EstimateAvgMethodDeclLines(params.private_method_decl_params)) *
			
 
				+         params.private_method_decls;
			
 
				+
			
 
				+  // A blank line and all the fields (if any).
			
 
				+  if (params.private_field_decls > 0) {
			
 
				+    avg += 1.0 + params.private_field_decls;
			
 
				+  }
			
 
				+
			
 
				+  // No need to account for the class close line, we have an extra blank line
			
 
				+  // count for the last of the above.
			
 
				+  return avg;
			
 
				+}
			
 
				+
			
 
				+auto SourceGen::GenAPIFileDenseDecls(int target_lines, DenseDeclParams params)
			
 
				+    -> std::string {
			
 
				+  std::string source;
			
 
				+  llvm::raw_string_ostream os(source);
			
 
				+
			
 
				+  // Figure out how many classes fit in our target lines, each separated by a
			
 
				+  // blank line. We need to account the comment lines below to start the file.
			
 
				+  // Note that we want a blank line after our file comment block, so every class
			
 
				+  // needs a blank line.
			
 
				+  constexpr int NumFileCommentLines = 4;
			
 
				+  double avg_class_lines = EstimateAvgClassDefLines(params.class_params);
			
 
				+  CARBON_CHECK(target_lines > NumFileCommentLines + avg_class_lines)
			
 
				+      << "Not enough target lines to generate a single class!";
			
 
				+  int num_classes = static_cast<double>(target_lines - NumFileCommentLines) /
			
 
				+                    (avg_class_lines + 1);
			
 
				+  int expected_lines =
			
 
				+      NumFileCommentLines + num_classes * (avg_class_lines + 1);
			
 
				+
			
 
				+  os << "// Generated " << (!IsCpp() ? "Carbon" : "C++") << " source file.\n";
			
 
				+  os << llvm::formatv("// {0} target lines: {1} classes, {2} expected lines",
			
 
				+                      target_lines, num_classes, expected_lines)
			
 
				+     << "\n";
			
 
				+  os << "//\n// Generating as an API file with dense declarations.\n";
			
 
				+
			
 
				+  auto class_gen_state = GetClassGenState(num_classes, params.class_params);
			
 
				+  for ([[maybe_unused]] int _ : llvm::seq(num_classes)) {
			
 
				+    os << "\n";
			
 
				+    GenerateClassDef(params.class_params, class_gen_state, os);
			
 
				+  }
			
 
				+
			
 
				+  // Make sure we consumed all the state.
			
 
				+  CARBON_CHECK(class_gen_state.public_function_param_counts.empty());
			
 
				+  CARBON_CHECK(class_gen_state.public_method_param_counts.empty());
			
 
				+  CARBON_CHECK(class_gen_state.private_function_param_counts.empty());
			
 
				+  CARBON_CHECK(class_gen_state.private_method_param_counts.empty());
			
 
				+  CARBON_CHECK(class_gen_state.class_names.empty());
			
 
				+
			
 
				+  return source;
			
 
				+}
			
 
				+
			
 
				+auto SourceGen::GetShuffledIdentifiers(int number, int min_length,
			
 
				+                                       int max_length, bool uniform)
			
 
				+    -> llvm::SmallVector<llvm::StringRef> {
			
 
				+  llvm::SmallVector<llvm::StringRef> idents =
			
 
				+      GetIdentifiers(number, min_length, max_length, uniform);
			
 
				+  std::shuffle(idents.begin(), idents.end(), rng_);
			
 
				+  return idents;
			
 
				+}
			
 
				+
			
 
				+auto SourceGen::GetShuffledUniqueIdentifiers(int number, int min_length,
			
 
				+                                             int max_length, bool uniform)
			
 
				+    -> llvm::SmallVector<llvm::StringRef> {
			
 
				+  CARBON_CHECK(min_length >= 4)
			
 
				+      << "Cannot trivially guarantee enough distinct, unique identifiers for "
			
 
				+         "lengths <= 3";
			
 
				+  llvm::SmallVector<llvm::StringRef> idents =
			
 
				+      GetUniqueIdentifiers(number, min_length, max_length, uniform);
			
 
				+  std::shuffle(idents.begin(), idents.end(), rng_);
			
 
				+  return idents;
			
 
				+}
			
 
				+
			
 
				+auto SourceGen::GetIdentifiers(int number, int min_length, int max_length,
			
 
				+                               bool uniform)
			
 
				+    -> llvm::SmallVector<llvm::StringRef> {
			
 
				+  llvm::SmallVector<llvm::StringRef> idents = GetIdentifiersImpl(
			
 
				+      number, min_length, max_length, uniform,
			
 
				+      [this](int length, int length_count,
			
 
				+             llvm::SmallVectorImpl<llvm::StringRef>& dest) {
			
 
				+        auto length_idents = GetSingleLengthIdentifiers(length, length_count);
			
 
				+        dest.append(length_idents.begin(), length_idents.end());
			
 
				+      });
			
 
				+
			
 
				+  return idents;
			
 
				+}
			
 
				+
			
 
				+auto SourceGen::GetUniqueIdentifiers(int number, int min_length, int max_length,
			
 
				+                                     bool uniform)
			
 
				+    -> llvm::SmallVector<llvm::StringRef> {
			
 
				+  CARBON_CHECK(min_length >= 4)
			
 
				+      << "Cannot trivially guarantee enough distinct, unique identifiers for "
			
 
				+         "lengths <= 3";
			
 
				+  llvm::SmallVector<llvm::StringRef> idents =
			
 
				+      GetIdentifiersImpl(number, min_length, max_length, uniform,
			
 
				+                         [this](int length, int length_count,
			
 
				+                                llvm::SmallVectorImpl<llvm::StringRef>& dest) {
			
 
				+                           AppendUniqueIdentifiers(length, length_count, dest);
			
 
				+                         });
			
 
				+
			
 
				+  return idents;
			
 
				+}
			
 
				+
			
 
				+auto SourceGen::GetSingleLengthIdentifiers(int length, int number)
			
 
				+    -> llvm::ArrayRef<llvm::StringRef> {
			
 
				+  llvm::SmallVector<llvm::StringRef>& idents =
			
 
				+      identifiers_by_length_.Insert(length, {}).value();
			
 
				+
			
 
				+  if (static_cast<int>(idents.size()) < number) {
			
 
				+    idents.reserve(number);
			
 
				+    for ([[maybe_unused]] int _ : llvm::seq<int>(idents.size(), number)) {
			
 
				+      auto ident_storage =
			
 
				+          llvm::MutableArrayRef(reinterpret_cast<char*>(storage_.Allocate(
			
 
				+                                    /*Size=*/length, /*Alignment=*/1)),
			
 
				+                                length);
			
 
				+      GenerateRandomIdentifier(ident_storage);
			
 
				+      llvm::StringRef new_id(ident_storage.data(), length);
			
 
				+      idents.push_back(new_id);
			
 
				+    }
			
 
				+    CARBON_CHECK(static_cast<int>(idents.size()) == number);
			
 
				+  }
			
 
				+  return llvm::ArrayRef(idents).slice(0, number);
			
 
				+}
			
 
				+
			
 
				+static auto IdentifierStartChars() -> llvm::ArrayRef<char> {
			
 
				+  static llvm::SmallVector<char> chars = [] {
			
 
				+    llvm::SmallVector<char> chars;
			
 
				+    for (char c : llvm::seq_inclusive('A', 'Z')) {
			
 
				+      chars.push_back(c);
			
 
				+    }
			
 
				+    for (char c : llvm::seq_inclusive('a', 'z')) {
			
 
				+      chars.push_back(c);
			
 
				+    }
			
 
				+    return chars;
			
 
				+  }();
			
 
				+  return chars;
			
 
				+}
			
 
				+
			
 
				+static auto IdentifierChars() -> llvm::ArrayRef<char> {
			
 
				+  static llvm::SmallVector<char> chars = [] {
			
 
				+    llvm::ArrayRef<char> start_chars = IdentifierStartChars();
			
 
				+    llvm::SmallVector<char> chars(start_chars.begin(), start_chars.end());
			
 
				+    chars.push_back('_');
			
 
				+    for (char c : llvm::seq_inclusive('0', '9')) {
			
 
				+      chars.push_back(c);
			
 
				+    }
			
 
				+    return chars;
			
 
				+  }();
			
 
				+  return chars;
			
 
				+}
			
 
				+
			
 
				+constexpr static llvm::StringRef NonCarbonCppKeywords[] = {
			
 
				+    "asm", "do",     "double", "float", "int",      "long",
			
 
				+    "new", "signed", "try",    "unix",  "unsigned", "xor",
			
 
				+};
			
 
				+
			
 
				+// Returns a random identifier string of the specified length.
			
 
				+//
			
 
				+// Ensures this is a valid identifier, avoiding any overlapping syntaxes or
			
 
				+// keywords both in Carbon and C++.
			
 
				+//
			
 
				+// This routine is somewhat expensive and so is useful to cache and reduce the
			
 
				+// frequency of calls. However, each time it is called it computes a completely
			
 
				+// new random identifier and so can be useful to eventually find a distinct
			
 
				+// identifier when needed.
			
 
				+auto SourceGen::GenerateRandomIdentifier(
			
 
				+    llvm::MutableArrayRef<char> dest_storage) -> void {
			
 
				+  llvm::ArrayRef<char> start_chars = IdentifierStartChars();
			
 
				+  llvm::ArrayRef<char> chars = IdentifierChars();
			
 
				+
			
 
				+  auto ident = llvm::StringRef(dest_storage.data(), dest_storage.size());
			
 
				+  do {
			
 
				+    dest_storage[0] =
			
 
				+        start_chars[absl::Uniform<int>(rng_, 0, start_chars.size())];
			
 
				+    for (int i : llvm::seq<int>(1, dest_storage.size())) {
			
 
				+      dest_storage[i] = chars[absl::Uniform<int>(rng_, 0, chars.size())];
			
 
				+    }
			
 
				+  } while (
			
 
				+      // TODO: Clean up and simplify this code. With some small refactorings and
			
 
				+      // post-processing we should be able to make this both easier to read and
			
 
				+      // less inefficient.
			
 
				+      llvm::any_of(
			
 
				+          Lex::TokenKind::KeywordTokens,
			
 
				+          [ident](auto token) { return ident == token.fixed_spelling(); }) ||
			
 
				+      llvm::is_contained(NonCarbonCppKeywords, ident) ||
			
 
				+      (llvm::is_contained({'i', 'u', 'f'}, ident[0]) &&
			
 
				+       llvm::all_of(ident.substr(1),
			
 
				+                    [](const char c) { return llvm::isDigit(c); })));
			
 
				+}
			
 
				+
			
 
				+// Appends a number of unique, random identifiers with a particular length to
			
 
				+// the provided destination vector.
			
 
				+//
			
 
				+// Uses, and when necessary grows, a cached sequence of random identifiers with
			
 
				+// the specified length. Because these are cached, this is efficient to call
			
 
				+// repeatedly, but will not produce a different sequence of identifiers.
			
 
				+auto SourceGen::AppendUniqueIdentifiers(
			
 
				+    int length, int number, llvm::SmallVectorImpl<llvm::StringRef>& dest)
			
 
				+    -> void {
			
 
				+  auto& [count, unique_idents] =
			
 
				+      unique_identifiers_by_length_.Insert(length, {}).value();
			
 
				+
			
 
				+  // See if we need to grow our pool of unique identifiers with the requested
			
 
				+  // length.
			
 
				+  if (count < number) {
			
 
				+    // We'll need to insert exactly the requested new unique identifiers. All
			
 
				+    // our other inserts will find an existing entry.
			
 
				+    unique_idents.GrowForInsertCount(count - number);
			
 
				+
			
 
				+    // Generate the needed number of identifiers.
			
 
				+    for ([[maybe_unused]] int _ : llvm::seq<int>(count, number)) {
			
 
				+      // Allocate stable storage for the identifier so we can form stable
			
 
				+      // `StringRef`s to it.
			
 
				+      auto ident_storage =
			
 
				+          llvm::MutableArrayRef(reinterpret_cast<char*>(storage_.Allocate(
			
 
				+                                    /*Size=*/length, /*Alignment=*/1)),
			
 
				+                                length);
			
 
				+      // Repeatedly generate novel identifiers of this length until we find a
			
 
				+      // new unique one.
			
 
				+      for (;;) {
			
 
				+        GenerateRandomIdentifier(ident_storage);
			
 
				+        auto result =
			
 
				+            unique_idents.Insert(llvm::StringRef(ident_storage.data(), length));
			
 
				+        if (result.is_inserted()) {
			
 
				+          break;
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+    count = number;
			
 
				+  }
			
 
				+  // Append all the identifiers directly out of the set. We make no guarantees
			
 
				+  // about the relative order so we just use the non-deterministic order of the
			
 
				+  // set and avoid additional storage.
			
 
				+  //
			
 
				+  // TODO: It's awkward the `ForEach` here can't early-exit. This just walks the
			
 
				+  // whole set which is harmless if inefficient. We should add early exiting
			
 
				+  // the loop support to `Set` and update this code.
			
 
				+  unique_idents.ForEach([&](llvm::StringRef ident) {
			
 
				+    if (number > 0) {
			
 
				+      dest.push_back(ident);
			
 
				+      --number;
			
 
				+    }
			
 
				+  });
			
 
				+  CARBON_CHECK(number == 0);
			
 
				+}
			
 
				+
			
 
				// Per-length weights used to produce the default (non-uniform) distribution
				// of identifier lengths.
				//
				// The array is indexed by `length - 1`: the weight for identifiers of length
				// 1 is stored at index 0, and the weight for length 64 at index 63.
				static constexpr std::array<int, 64> IdentifierLengthCounts = [] {
				  std::array<int, 64> counts = {};

				  // Assigns `count` to every length in the 1-based range
				  // [first_length, last_length].
				  auto set_lengths = [&counts](int first_length, int last_length, int count) {
				    for (int length = first_length; length <= last_length; ++length) {
				      counts[length - 1] = count;
				    }
				  };

				  // For non-uniform distribution, we simulate a distribution roughly based on
				  // the observed histogram of identifier lengths, but smoothed a bit and
				  // reduced to small counts so that we cycle through all the lengths
				  // reasonably quickly. We want sampling of even 10% of NumTokens from this
				  // in a round-robin form to not be skewed overly much. This still inherently
				  // compresses the long tail as we'd rather have coverage even though it
				  // distorts the distribution a bit.
				  //
				  // The distribution here comes from a script that analyzes source code run
				  // over a few directories of LLVM. The script renders a visual ascii-art
				  // histogram along with the data for each bucket, and that output is
				  // included in comments above each bucket size below to help visualize the
				  // rough shape we're aiming for.
				  //
				  // 1 characters   [3976]  ███████████████████████████████▊
				  set_lengths(1, 1, 40);
				  // 2 characters   [3724]  █████████████████████████████▊
				  set_lengths(2, 2, 40);
				  // 3 characters   [4173]  █████████████████████████████████▍
				  set_lengths(3, 3, 40);
				  // 4 characters   [5000]  ████████████████████████████████████████
				  set_lengths(4, 4, 50);
				  // 5 characters   [1568]  ████████████▌
				  set_lengths(5, 5, 20);
				  // 6 characters   [2226]  █████████████████▊
				  set_lengths(6, 6, 20);
				  // 7 characters   [2380]  ███████████████████
				  set_lengths(7, 7, 20);
				  // 8 characters   [1786]  ██████████████▎
				  set_lengths(8, 8, 18);
				  // 9 characters   [1397]  ███████████▏
				  set_lengths(9, 9, 12);
				  // 10 characters  [ 739]  █████▉
				  set_lengths(10, 10, 12);
				  // 11 characters  [ 779]  ██████▎
				  set_lengths(11, 11, 12);
				  // 12 characters  [1344]  ██████████▊
				  set_lengths(12, 12, 12);
				  // 13 characters  [ 498]  ████
				  set_lengths(13, 13, 5);
				  // 14 characters  [ 284]  ██▎
				  set_lengths(14, 14, 3);
				  // 15 characters  [ 172]  █▍
				  // 16 characters  [ 278]  ██▎
				  // 17 characters  [ 191]  █▌
				  // 18 characters  [ 207]  █▋
				  set_lengths(15, 18, 2);
				  // 19 - 64 characters are all <100 but non-zero, and we map them to 1 for
				  // coverage despite slightly over weighting the tail.
				  set_lengths(19, 64, 1);
				  return counts;
				}();
			
 
				+
			
 
				// Adds up every element of `range`, starting from zero and accumulating in
				// an `int`.
				template <typename T>
				static auto Sum(const T& range) -> int {
				  int total = 0;
				  for (auto it = range.begin(), end = range.end(); it != end; ++it) {
				    total = total + *it;
				  }
				  return total;
				}
			
 
				+
			
 
				+// A template function that implements the common logic of `GetIdentifiers` and
			
 
				+// `GetUniqueIdentifiers`. Most parameters correspond to the parameters of those
			
 
				+// functions. Additionally, an `AppendFunc` callable is provided to implement
			
 
				+// the appending operation.
			
 
				+//
			
 
				+// The main functionality provided here is collecting the correct number of
			
 
				+// identifiers from each of the lengths in the range [min_length, max_length]
			
 
				+// and either in our default representative distribution or a uniform
			
 
				+// distribution.
			
 
				+auto SourceGen::GetIdentifiersImpl(int number, int min_length, int max_length,
			
 
				+                                   bool uniform,
			
 
				+                                   llvm::function_ref<AppendFn> append)
			
 
				+    -> llvm::SmallVector<llvm::StringRef> {
			
 
				+  CARBON_CHECK(min_length <= max_length);
			
 
				+  CARBON_CHECK(uniform || max_length <= 64)
			
 
				+      << "Cannot produce a meaningful non-uniform distribution of lengths "
			
 
				+         "longer than 64 as those are exceedingly rare in our observed data "
			
 
				+         "sets.";
			
 
				+
			
 
				+  llvm::SmallVector<llvm::StringRef> idents;
			
 
				+  idents.reserve(number);
			
 
				+
			
 
				+  // First, compute the total weight of the distribution so we know how many
			
 
				+  // identifiers we'll get each time we collect from it.
			
 
				+  int num_lengths = max_length - min_length + 1;
			
 
				+  auto length_counts =
			
 
				+      llvm::ArrayRef(IdentifierLengthCounts).slice(min_length - 1, num_lengths);
			
 
				+  int count_sum = uniform ? num_lengths : Sum(length_counts);
			
 
				+  CARBON_CHECK(count_sum >= 1);
			
 
				+
			
 
				+  int number_rem = number % count_sum;
			
 
				+
			
 
				+  // Finally, walk through each length in the distribution.
			
 
				+  for (int length : llvm::seq_inclusive(min_length, max_length)) {
			
 
				+    // Scale how many identifiers we want of this length if computing a
			
 
				+    // non-uniform distribution. For uniform, we always take one.
			
 
				+    int scale = uniform ? 1 : IdentifierLengthCounts[length - 1];
			
 
				+
			
 
				+    // Now we can compute how many identifiers of this length to request.
			
 
				+    int length_count = (number / count_sum) * scale;
			
 
				+    if (number_rem > 0) {
			
 
				+      int rem_adjustment = std::min(scale, number_rem);
			
 
				+      length_count += rem_adjustment;
			
 
				+      number_rem -= rem_adjustment;
			
 
				+    }
			
 
				+    append(length, length_count, idents);
			
 
				+  }
			
 
				+  CARBON_CHECK(number_rem == 0)
			
 
				+      << "Unexpected number remaining: " << number_rem;
			
 
				+  CARBON_CHECK(static_cast<int>(idents.size()) == number)
			
 
				+      << "Ended up with " << idents.size()
			
 
				+      << " identifiers instead of the requested " << number;
			
 
				+
			
 
				+  return idents;
			
 
				+}
			
 
				+
			
 
				+// Returns a shuffled sequence of integers in the range [min, max].
			
 
				+//
			
 
				+// The order of the returned integers is random, but each integer in the range
			
 
				+// appears the same number of times in the result, with the number of
			
 
				+// appearances rounded up for lower numbers and rounded down for higher numbers
			
 
				+// in order to exactly produce `number` results.
			
 
				+auto SourceGen::GetShuffledInts(int number, int min, int max)
			
 
				+    -> llvm::SmallVector<int> {
			
 
				+  llvm::SmallVector<int> ints;
			
 
				+  ints.reserve(number);
			
 
				+
			
 
				+  // Evenly distribute to each value between min and max.
			
 
				+  int num_values = max - min + 1;
			
 
				+  for (int i : llvm::seq_inclusive(min, max)) {
			
 
				+    int i_count = number / num_values;
			
 
				+    i_count += i < (min + (number % num_values));
			
 
				+    ints.append(i_count, i);
			
 
				+  }
			
 
				+  CARBON_CHECK(static_cast<int>(ints.size()) == number);
			
 
				+
			
 
				+  std::shuffle(ints.begin(), ints.end(), rng_);
			
 
				+  return ints;
			
 
				+}
			
 
				+
			
 
				+// Given a number of class definitions and the params with which to generate
			
 
				+// them, builds the state that will be used while generating that many classes.
			
 
				+//
			
 
				+// We build the state first and across all the class definitions that will be
			
 
				+// generated so that we can distribute random components across all the
			
 
				+// definitions.
			
 
				+auto SourceGen::GetClassGenState(int number, ClassParams params)
			
 
				+    -> ClassGenState {
			
 
				+  ClassGenState state;
			
 
				+  state.public_function_param_counts =
			
 
				+      GetShuffledInts(number * params.public_function_decls, 0,
			
 
				+                      params.public_function_decl_params.max_params);
			
 
				+  state.public_method_param_counts =
			
 
				+      GetShuffledInts(number * params.public_method_decls, 0,
			
 
				+                      params.public_method_decl_params.max_params);
			
 
				+  state.private_function_param_counts =
			
 
				+      GetShuffledInts(number * params.private_function_decls, 0,
			
 
				+                      params.private_function_decl_params.max_params);
			
 
				+  state.private_method_param_counts =
			
 
				+      GetShuffledInts(number * params.private_method_decls, 0,
			
 
				+                      params.private_method_decl_params.max_params);
			
 
				+
			
 
				+  state.class_names = GetShuffledUniqueIdentifiers(number, /*min_length=*/5);
			
 
				+  int num_members =
			
 
				+      number * (params.public_function_decls + params.public_method_decls +
			
 
				+                params.private_function_decls + params.private_method_decls +
			
 
				+                params.private_field_decls);
			
 
				+  state.member_names = GetShuffledIdentifiers(num_members, /*min_length=*/4);
			
 
				+  int num_params = Sum(state.public_function_param_counts) +
			
 
				+                   Sum(state.public_method_param_counts) +
			
 
				+                   Sum(state.private_function_param_counts) +
			
 
				+                   Sum(state.private_method_param_counts);
			
 
				+  state.param_names = GetShuffledIdentifiers(num_params);
			
 
				+  return state;
			
 
				+}
			
 
				+
			
 
				+// A helper to pop series of unique identifiers off a sequence of random
			
 
				+// identifiers that may have duplicates.
			
 
				+//
			
 
				+// This is particularly designed to work with the sequences of non-unique
			
 
				+// identifiers produced by `GetShuffledIdentifiers` with the important property
			
 
				+// that while popping off unique identifiers found in the shuffled list, we
			
 
				+// don't change the distribution of identifier lengths.
			
 
				+//
			
 
				+// The uniqueness is only per-instance of the class, and so an instance can be
			
 
				+// used to extract a series of names that share a scope.
			
 
				+//
			
 
				+// It works by scanning the sequence to extract each unique identifier found,
			
 
				+// swapping it to the back and popping it off the list. This does shuffle the
			
 
				+// order, but it isn't expected to do so in an interesting way.
			
 
				+//
			
 
				+// It also provides a fallback path in case there are no unique identifiers left
			
 
				+// which computes fresh, random identifiers with the same length as the next one
			
 
				+// in the sequence until a unique one is found.
			
 
				+//
			
 
				+// For simplicity of the fallback path, the lifetime of the identifiers produced
			
 
				+// is bound to the lifetime of the popper instance, and not the generator as a
			
 
				+// whole. If this is ever a problematic constraint, we can start copying
			
 
				+// fallback identifiers into the generator's storage.
			
 
				+class SourceGen::UniqueIdentifierPopper {
			
 
				+ public:
			
 
				+  explicit UniqueIdentifierPopper(SourceGen& gen,
			
 
				+                                  llvm::SmallVectorImpl<llvm::StringRef>& data)
			
 
				+      : gen_(&gen), data_(&data), it_(data_->rbegin()) {}
			
 
				+
			
 
				+  // Pop the next unique identifier that can be found in the data, or synthesize
			
 
				+  // one with a valid length. Always consumes exactly one identifier from the
			
 
				+  // data.
			
 
				+  //
			
 
				+  // Note that the lifetime of the underlying identifier is that of the popper
			
 
				+  // and not the underlying data.
			
 
				+  auto Pop() -> llvm::StringRef {
			
 
				+    for (auto end = data_->rend(); it_ != end; ++it_) {
			
 
				+      auto insert = set_.Insert(*it_);
			
 
				+      if (!insert.is_inserted()) {
			
 
				+        continue;
			
 
				+      }
			
 
				+
			
 
				+      if (it_ != data_->rbegin()) {
			
 
				+        std::swap(*data_->rbegin(), *it_);
			
 
				+      }
			
 
				+      CARBON_CHECK(insert.key() == data_->back());
			
 
				+      return data_->pop_back_val();
			
 
				+    }
			
 
				+
			
 
				+    // Out of unique elements. Overwrite the back, preserving its length,
			
 
				+    // generating a new identifiers until we find a unique one and return that.
			
 
				+    // This ensures we continue to consume the structure and produce the same
			
 
				+    // size identifiers even in the fallback.
			
 
				+    int length = data_->pop_back_val().size();
			
 
				+    auto fallback_ident_storage =
			
 
				+        llvm::MutableArrayRef(reinterpret_cast<char*>(gen_->storage_.Allocate(
			
 
				+                                  /*Size=*/length, /*Alignment=*/1)),
			
 
				+                              length);
			
 
				+    for (;;) {
			
 
				+      gen_->GenerateRandomIdentifier(fallback_ident_storage);
			
 
				+      auto fallback_id = llvm::StringRef(fallback_ident_storage.data(), length);
			
 
				+      if (set_.Insert(fallback_id).is_inserted()) {
			
 
				+        return fallback_id;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+ private:
			
 
				+  SourceGen* gen_;
			
 
				+  llvm::SmallVectorImpl<llvm::StringRef>* data_;
			
 
				+  llvm::SmallVectorImpl<llvm::StringRef>::reverse_iterator it_;
			
 
				+  Set<llvm::StringRef> set_;
			
 
				+};
			
 
				+
			
 
				+// Generates a function declaration and writes it to the provided stream.
			
 
				+//
			
 
				+// The declaration can be configured with a function name, private modifier,
			
 
				+// whether it is a method, the parameter count, an how indented it is.
			
 
				+//
			
 
				+// This is also provided a collection of identifiers to consume as parameter
			
 
				+// names -- it will use a unique popper to extract unique parameter names from
			
 
				+// this collection.
			
 
				+auto SourceGen::GenerateFunctionDecl(
			
 
				+    llvm::StringRef name, bool is_private, bool is_method, int param_count,
			
 
				+    llvm::StringRef indent, llvm::SmallVectorImpl<llvm::StringRef>& param_names,
			
 
				+    llvm::raw_ostream& os) -> void {
			
 
				+  os << indent << "// TODO: make better comment text\n";
			
 
				+  if (!IsCpp()) {
			
 
				+    os << indent << (is_private ? "private " : "") << "fn " << name;
			
 
				+
			
 
				+    if (is_method) {
			
 
				+      os << "[self: Self]";
			
 
				+    }
			
 
				+  } else {
			
 
				+    os << indent;
			
 
				+    if (!is_method) {
			
 
				+      os << "static ";
			
 
				+    }
			
 
				+    os << "auto " << name;
			
 
				+  }
			
 
				+
			
 
				+  os << "(";
			
 
				+
			
 
				+  if (param_count >
			
 
				+      (is_method ? NumSingleLineMethodParams : NumSingleLineFunctionParams)) {
			
 
				+    os << "\n" << indent << "    ";
			
 
				+  }
			
 
				+  UniqueIdentifierPopper unique_param_names(*this, param_names);
			
 
				+  for (int i : llvm::seq(param_count)) {
			
 
				+    if (i > 0) {
			
 
				+      if ((i % MaxParamsPerLine) == 0) {
			
 
				+        os << ",\n" << indent << "    ";
			
 
				+      } else {
			
 
				+        os << ", ";
			
 
				+      }
			
 
				+    }
			
 
				+    if (!IsCpp()) {
			
 
				+      os << unique_param_names.Pop() << ": i32";
			
 
				+    } else {
			
 
				+      os << "int " << unique_param_names.Pop();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  os << ")" << (IsCpp() ? " -> void" : "") << ";\n";
			
 
				+}
			
 
				+
			
 
				+// Generate a class definition and write it to the provided stream.
			
 
				+//
			
 
				+// The structure of the definition is guided by the `params` provided, and it
			
 
				+// consumes the provided state.
			
 
				+auto SourceGen::GenerateClassDef(const ClassParams& params,
			
 
				+                                 ClassGenState& state, llvm::raw_ostream& os)
			
 
				+    -> void {
			
 
				+  os << "// TODO: make better comment text\n";
			
 
				+  os << "class " << state.class_names.pop_back_val() << " {\n";
			
 
				+  if (IsCpp()) {
			
 
				+    os << " public:\n";
			
 
				+  }
			
 
				+
			
 
				+  UniqueIdentifierPopper unique_member_names(*this, state.member_names);
			
 
				+  llvm::ListSeparator line_sep("\n");
			
 
				+  for ([[maybe_unused]] int _ : llvm::seq(params.public_function_decls)) {
			
 
				+    os << line_sep;
			
 
				+    GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/false,
			
 
				+                         /*is_method=*/false,
			
 
				+                         state.public_function_param_counts.pop_back_val(),
			
 
				+                         /*indent=*/"  ", state.param_names, os);
			
 
				+  }
			
 
				+  for ([[maybe_unused]] int _ : llvm::seq(params.public_method_decls)) {
			
 
				+    os << line_sep;
			
 
				+    GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/false,
			
 
				+                         /*is_method=*/true,
			
 
				+                         state.public_method_param_counts.pop_back_val(),
			
 
				+                         /*indent=*/"  ", state.param_names, os);
			
 
				+  }
			
 
				+
			
 
				+  if (IsCpp()) {
			
 
				+    os << "\n private:\n";
			
 
				+    // Reset the separator.
			
 
				+    line_sep = llvm::ListSeparator("\n");
			
 
				+  }
			
 
				+
			
 
				+  for ([[maybe_unused]] int _ : llvm::seq(params.private_function_decls)) {
			
 
				+    os << line_sep;
			
 
				+    GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/true,
			
 
				+                         /*is_method=*/false,
			
 
				+                         state.private_function_param_counts.pop_back_val(),
			
 
				+                         /*indent=*/"  ", state.param_names, os);
			
 
				+  }
			
 
				+  for ([[maybe_unused]] int _ : llvm::seq(params.private_method_decls)) {
			
 
				+    os << line_sep;
			
 
				+    GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/true,
			
 
				+                         /*is_method=*/true,
			
 
				+                         state.private_method_param_counts.pop_back_val(),
			
 
				+                         /*indent=*/"  ", state.param_names, os);
			
 
				+  }
			
 
				+  os << line_sep;
			
 
				+  for ([[maybe_unused]] int _ : llvm::seq(params.private_field_decls)) {
			
 
				+    if (!IsCpp()) {
			
 
				+      os << "  private var " << unique_member_names.Pop() << ": i32;\n";
			
 
				+    } else {
			
 
				+      os << "  int " << unique_member_names.Pop() << ";\n";
			
 
				+    }
			
 
				+  }
			
 
				+  os << "}" << (IsCpp() ? ";" : "") << "\n";
			
 
				+}
			
 
				+
			
 
				+}  // namespace Carbon::Testing
			
--- a/testing/base/source_gen.h
+++ b/testing/base/source_gen.h
@@ -0,0 +1,247 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#ifndef CARBON_TESTING_BASE_SOURCE_GEN_H_
			
 
				+#define CARBON_TESTING_BASE_SOURCE_GEN_H_
			
 
				+
			
 
				+#include <string>
			
 
				+
			
 
				+#include "absl/random/random.h"
			
 
				+#include "common/map.h"
			
 
				+#include "common/set.h"
			
 
				+#include "llvm/ADT/ArrayRef.h"
			
 
				+#include "llvm/ADT/StringRef.h"
			
 
				+#include "llvm/Support/Allocator.h"
			
 
				+
			
 
				+namespace Carbon::Testing {
			
 
				+
			
 
				+// Provides source code generation facilities.
			
 
				+//
			
 
				+// This class works to generate valid but random & meaningless source code in
			
 
				+// interesting patterns for benchmarking. It is very incomplete. A high level
			
 
				+// set of long-term goals:
			
 
				+//
			
 
				+// - Generate interesting patterns and structures of code that have emerged as
			
 
				+//   toolchain performance bottlenecks in practice in C++ codebases.
			
 
				+// - Generate code that includes most Carbon language features (and whatever
			
 
				+//   reasonable C++ analogs could be used for comparative purposes):
			
 
				+//   - Functions
			
 
				+//   - Classes with class functions, methods, and fields
			
 
				+//   - Interfaces
			
 
				+//   - Checked generics and templates
			
 
				+//   - Nested and unnested impls
			
 
				+//   - Nested classes
			
 
				+//   - Inline and out-of-line function and method definitions
			
 
				+//   - Imports and exports
			
 
				+//   - API files and impl files.
			
 
				+// - Be random but deterministic. The goal is benchmarking and so while this
			
 
				+//   code should strive for not producing trivially predictable patterns, it
			
 
				+//   should also strive to be consistent and suitable for benchmarking. Wherever
			
 
				+//   possible, it should permute the order and content without randomizing the
			
 
				+//   total count, size, or complexity.
			
 
				+//
			
 
				+// Note that the default and primary generation target is interesting Carbon
			
 
				+// source code. We have a best-effort to alternatively generate comparable C++
			
 
				+// constructs to the Carbon ones for comparative benchmarking, but there is no
			
 
				+// goal to cover all the interesting C++ patterns we might want to benchmark,
			
 
				+// and we don't aim for perfectly synthesizing C++ analogs. We can always drop
			
 
				+// fidelity for the C++ code path if needed for simplicity.
			
 
				+//
			
 
				+// TODO: There are numerous places where we hard code a fixed quantity. Instead,
			
 
				+// we should build a rich but general system to easily encode a discrete
			
 
				+// distribution that is sampled. We have a specialized version of this for
			
 
				+// identifiers that should be generalized.
			
 
				+class SourceGen {
			
 
				+ public:
			
 
				+  enum class Language {
			
 
				+    Carbon,
			
 
				+    Cpp,
			
 
				+  };
			
 
				+
			
 
				+  struct FunctionDeclParams {
			
 
				+    // TODD: Arbitrary default, should switch to a distribution from data.
			
 
				+    int max_params = 4;
			
 
				+  };
			
 
				+
			
 
				+  struct MethodDeclParams {
			
 
				+    // TODD: Arbitrary default, should switch to a distribution from data.
			
 
				+    int max_params = 4;
			
 
				+  };
			
 
				+
			
 
				+  // Parameters used to generate a class in a generated file.
			
 
				+  //
			
 
				+  // Currently, this uses a fixed number of each kind of declaration, with
			
 
				+  // arbitrary defaults chosen. The defaults currently skew towards large
			
 
				+  // classes with lots of nested declarations.
			
 
				+  // TODO: Switch these to distributions based on data.
			
 
				+  //
			
 
				+  // TODO: Add support for generating definitions and parameters to control
			
 
				+  // them.
			
 
				+  struct ClassParams {
			
 
				+    int public_function_decls = 4;
			
 
				+    FunctionDeclParams public_function_decl_params = {.max_params = 8};
			
 
				+
			
 
				+    int public_method_decls = 10;
			
 
				+    MethodDeclParams public_method_decl_params;
			
 
				+
			
 
				+    int private_function_decls = 2;
			
 
				+    FunctionDeclParams private_function_decl_params = {.max_params = 6};
			
 
				+
			
 
				+    int private_method_decls = 8;
			
 
				+    MethodDeclParams private_method_decl_params = {.max_params = 6};
			
 
				+
			
 
				+    int private_field_decls = 6;
			
 
				+  };
			
 
				+
			
 
				+  // Parameters used to generate a file with dense declarations.
			
 
				+  struct DenseDeclParams {
			
 
				+    // TODO: Add more parameters to control generating top-level constructs
			
 
				+    // other than class definitions.
			
 
				+
			
 
				+    // Parameters used when generating class definitions.
			
 
				+    ClassParams class_params = {};
			
 
				+  };
			
 
				+
			
 
				+  // Access a global instance of this type to generate Carbon code for
			
 
				+  // benchmarks, tests, or other places where sharing a common instance is
			
 
				+  // useful. Note that there is nothing thread safe about this instance or type.
			
 
				+  static auto Global() -> SourceGen&;
			
 
				+
			
 
				+  // Construct a source generator for the provided language, by default Carbon.
			
 
				+  explicit SourceGen(Language language = Language::Carbon);
			
 
				+
			
 
				+  // Generate an API file with dense classes containing function forward
			
 
				+  // declarations.
			
 
				+  //
			
 
				+  // Accepts a number of `target_lines` for the resulting source code. This is a
			
 
				+  // rough approximation used to scale all the other constructs up and down
			
 
				+  // accordingly. For C++ source generation, we work to generate the same number
			
 
				+  // of constructs as Carbon would for the given line count over keeping the
			
 
				+  // actual line count close to the target.
			
 
				+  //
			
 
				+  // TODO: Currently, the formatting and line breaks of generating code are
			
 
				+  // extremely rough still, and those are a large factor in adherence to
			
 
				+  // `target_lines`. Long term, the goal is to get as close as we can to any
			
 
				+  // automatically formatted code while still keeping the stability of
			
 
				+  // benchmarking.
			
 
				+  auto GenAPIFileDenseDecls(int target_lines, DenseDeclParams params)
			
 
				+      -> std::string;
			
 
				+
			
 
				+  // Get some number of randomly shuffled identifiers.
			
 
				+  //
			
 
				+  // The identifiers start with a character [A-Za-z], other characters may also
			
 
				+  // include [0-9_]. Both Carbon and C++ keywords are excluded along with any
			
 
				+  // other non-identifier syntaxes that overlap to ensure all of these can be
			
 
				+  // used as identifiers.
			
 
				+  //
			
 
				+  // The order will be different for each call to this function, but the
			
 
				+  // specific identifiers may remain the same in order to reduce the cost of
			
 
				+  // repeated calls. However, the sum of the identifier sizes returned is
			
 
				+  // guaranteed to be the same for every call with the same number of
			
 
				+  // identifiers so that benchmarking all of these identifiers has predictable
			
 
				+  // and stable cost.
			
 
				+  //
			
 
				+  // Optionally, callers can request a minimum and maximum length. By default,
			
 
				+  // the length distribution used across the identifiers will mirror the
			
 
				+  // observed distribution of identifiers in C++ source code and our expectation
			
 
				+  // of them in Carbon source code. The maximum length in this default
			
 
				+  // distribution cannot be more than 64.
			
 
				+  //
			
 
				+  // Callers can request a uniform distribution across [min_length, max_length],
			
 
				+  // and when it is requested there is no limit on `max_length`.
			
 
				+  auto GetShuffledIdentifiers(int number, int min_length = 1,
			
 
				+                              int max_length = 64, bool uniform = false)
			
 
				+      -> llvm::SmallVector<llvm::StringRef>;
			
 
				+
			
 
				+  // Same as `GetShuffledIdentifiers`, but ensures there are no collisions.
			
 
				+  auto GetShuffledUniqueIdentifiers(int number, int min_length = 4,
			
 
				+                                    int max_length = 64, bool uniform = false)
			
 
				+      -> llvm::SmallVector<llvm::StringRef>;
			
 
				+
			
 
				+  // Returns a collection of un-shuffled identifiers, otherwise the same as
			
 
				+  // `GetShuffledIdentifiers`.
			
 
				+  //
			
 
				+  // Usually, benchmarks should use the shuffled version. However, this is
			
 
				+  // useful when there is already a post-processing step to shuffle things as it
			
 
				+  // is *dramatically* more efficient, especially in debug builds.
			
 
				+  auto GetIdentifiers(int number, int min_length = 1, int max_length = 64,
			
 
				+                      bool uniform = false)
			
 
				+      -> llvm::SmallVector<llvm::StringRef>;
			
 
				+
			
 
				+  // Returns a collection of un-shuffled unique identifiers, otherwise the same
			
 
				+  // as `GetShuffledUniqueIdentifiers`.
			
 
				+  //
			
 
				+  // Usually, benchmarks should use the shuffled version. However, this is
			
 
				+  // useful when there is already a post-processing step to shuffle things.
			
 
				+  auto GetUniqueIdentifiers(int number, int min_length = 1, int max_length = 64,
			
 
				+                            bool uniform = false)
			
 
				+      -> llvm::SmallVector<llvm::StringRef>;
			
 
				+
			
 
				+  // Returns a shared collection of random identifiers of a specific length.
			
 
				+  //
			
 
				+  // For a single, exact length, we have an even cheaper routine to return
			
 
				+  // access to a shared collection of identifiers. The order of these is a
			
 
				+  // single fixed random order for a given execution. The returned array
			
 
				+  // reference is only valid until the next call any method on this generator.
			
 
				+  auto GetSingleLengthIdentifiers(int length, int number)
			
 
				+      -> llvm::ArrayRef<llvm::StringRef>;
			
 
				+
			
 
				+ private:
			
 
				+  // The shuffled state used to generate some number of classes.
			
 
				+  //
			
 
				+  // This state encodes all the shuffled entropy used for generating a number of
			
 
				+  // class definitions. While generating definitions, the state here will be
			
 
				+  // consumed until empty.
			
 
				+  struct ClassGenState {
			
 
				+    llvm::SmallVector<int> public_function_param_counts;
			
 
				+    llvm::SmallVector<int> public_method_param_counts;
			
 
				+    llvm::SmallVector<int> private_function_param_counts;
			
 
				+    llvm::SmallVector<int> private_method_param_counts;
			
 
				+
			
 
				+    llvm::SmallVector<llvm::StringRef> class_names;
			
 
				+    llvm::SmallVector<llvm::StringRef> member_names;
			
 
				+    llvm::SmallVector<llvm::StringRef> param_names;
			
 
				+  };
			
 
				+
			
 
				+  class UniqueIdentifierPopper;
			
 
				+  friend UniqueIdentifierPopper;
			
 
				+
			
 
				+  using AppendFn = auto(int length, int number,
			
 
				+                        llvm::SmallVectorImpl<llvm::StringRef>& dest) -> void;
			
 
				+
			
 
				+  auto IsCpp() -> bool { return language_ == Language::Cpp; }
			
 
				+
			
 
				+  auto GenerateRandomIdentifier(llvm::MutableArrayRef<char> dest_storage)
			
 
				+      -> void;
			
 
				+  auto AppendUniqueIdentifiers(int length, int number,
			
 
				+                               llvm::SmallVectorImpl<llvm::StringRef>& dest)
			
 
				+      -> void;
			
 
				+  auto GetIdentifiersImpl(int number, int min_length, int max_length,
			
 
				+                          bool uniform, llvm::function_ref<AppendFn> append)
			
 
				+      -> llvm::SmallVector<llvm::StringRef>;
			
 
				+
			
 
				+  auto GetShuffledInts(int number, int min, int max) -> llvm::SmallVector<int>;
			
 
				+
			
 
				+  auto GetClassGenState(int number, ClassParams params) -> ClassGenState;
			
 
				+
			
 
				+  auto GenerateFunctionDecl(llvm::StringRef name, bool is_private,
			
 
				+                            bool is_method, int param_count,
			
 
				+                            llvm::StringRef indent,
			
 
				+                            llvm::SmallVectorImpl<llvm::StringRef>& param_names,
			
 
				+                            llvm::raw_ostream& os) -> void;
			
 
				+  auto GenerateClassDef(const ClassParams& params, ClassGenState& state,
			
 
				+                        llvm::raw_ostream& os) -> void;
			
 
				+
			
 
				+  absl::BitGen rng_;
			
 
				+  llvm::BumpPtrAllocator storage_;
			
 
				+
			
 
				+  Map<int, llvm::SmallVector<llvm::StringRef>> identifiers_by_length_;
			
 
				+  Map<int, std::pair<int, Set<llvm::StringRef>>> unique_identifiers_by_length_;
			
 
				+
			
 
				+  Language language_;
			
 
				+};
			
 
				+
			
 
				+}  // namespace Carbon::Testing
			
 
				+
			
 
				+#endif  // CARBON_TESTING_BASE_SOURCE_GEN_H_
			
--- a/testing/base/source_gen_main.cpp
+++ b/testing/base/source_gen_main.cpp
@@ -0,0 +1,115 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "common/bazel_working_dir.h"
			
 
				+#include "common/command_line.h"
			
 
				+#include "common/init_llvm.h"
			
 
				+#include "common/ostream.h"
			
 
				+#include "llvm/ADT/ArrayRef.h"
			
 
				+#include "llvm/ADT/SmallVector.h"
			
 
				+#include "llvm/ADT/StringRef.h"
			
 
				+#include "llvm/Support/FileSystem.h"
			
 
				+#include "testing/base/source_gen.h"
			
 
				+
			
 
				+namespace Carbon::Testing {
			
 
				+namespace {
			
 
				+
			
 
				+constexpr CommandLine::CommandInfo Info = {
			
 
				+    .name = "source_gen",
			
 
				+    .help = R"""(
			
 
				+A source generator for Carbon.
			
 
				+)""",
			
 
				+};
			
 
				+
			
 
				+constexpr CommandLine::ArgInfo OutputArgInfo = {
			
 
				+    .name = "output",
			
 
				+    .value_name = "FILE",
			
 
				+    .help = R"""(
			
 
				+Writes the generate source code to a file rather than stdout.
			
 
				+)""",
			
 
				+};
			
 
				+
			
 
				+constexpr CommandLine::ArgInfo LinesArgInfo = {
			
 
				+    .name = "lines",
			
 
				+    .value_name = "N",
			
 
				+    .help = R"""(
			
 
				+The number of lines of code to target for a generated source file.
			
 
				+)""",
			
 
				+};
			
 
				+
			
 
				+constexpr CommandLine::ArgInfo LanguageArgInfo = {
			
 
				+    .name = "language",
			
 
				+    //.value_name = "[carbon|cpp]",
			
 
				+    .help = R"""(
			
 
				+The language of source code to generate. The C++ source generation is best
			
 
				+effort to try to provide as much comparable benchmarking as possible, but the
			
 
				+primary language focus is generating Carbon.
			
 
				+)""",
			
 
				+};
			
 
				+
			
 
				+auto Run(llvm::ArrayRef<llvm::StringRef> args) -> bool {
			
 
				+  // Default to outputting to stdout and writing 10k lines of source code.
			
 
				+  llvm::StringRef output_filename = "-";
			
 
				+  int lines = 10'000;
			
 
				+  SourceGen::Language language;
			
 
				+
			
 
				+  CommandLine::ParseResult parsed_args = CommandLine::Parse(
			
 
				+      args, llvm::outs(), llvm::errs(), Info,
			
 
				+      [&](CommandLine::CommandBuilder& b) {
			
 
				+        b.AddStringOption(OutputArgInfo,
			
 
				+                          [&](auto& arg_b) { arg_b.Set(&output_filename); });
			
 
				+        b.AddIntegerOption(LinesArgInfo,
			
 
				+                           [&](auto& arg_b) { arg_b.Set(&lines); });
			
 
				+        b.AddOneOfOption(LanguageArgInfo, [&](auto& arg_b) {
			
 
				+          arg_b.SetOneOf(
			
 
				+              {
			
 
				+                  arg_b.OneOfValue("carbon", SourceGen::Language::Carbon)
			
 
				+                      .Default(true),
			
 
				+                  arg_b.OneOfValue("cpp", SourceGen::Language::Cpp),
			
 
				+              },
			
 
				+              &language);
			
 
				+        });
			
 
				+
			
 
				+        // No-op action as there is only one operation for this command.
			
 
				+        b.Do([] {});
			
 
				+      });
			
 
				+  if (parsed_args == CommandLine::ParseResult::Error) {
			
 
				+    return false;
			
 
				+  } else if (parsed_args == CommandLine::ParseResult::MetaSuccess) {
			
 
				+    // Fully handled by the CLI library.
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  std::optional<llvm::raw_fd_ostream> output_file;
			
 
				+  llvm::raw_fd_ostream* output = &llvm::outs();
			
 
				+  if (output_filename != "-") {
			
 
				+    std::error_code ec;
			
 
				+    output_file.emplace(output_filename, ec, llvm::sys::fs::OF_None);
			
 
				+    if (ec) {
			
 
				+      llvm::errs() << "ERROR: Unable to open output file '" << output_filename
			
 
				+                   << "': " << ec.message() << "\n";
			
 
				+      return false;
			
 
				+    }
			
 
				+    output = &*output_file;
			
 
				+  }
			
 
				+
			
 
				+  SourceGen gen(language);
			
 
				+  *output << gen.GenAPIFileDenseDecls(lines, SourceGen::DenseDeclParams{});
			
 
				+  output->flush();
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+}  // namespace
			
 
				+}  // namespace Carbon::Testing
			
 
				+
			
 
				+auto main(int argc, char** argv) -> int {
			
 
				+  // Do LLVM's initialization first, this will also transform UTF-16 to UTF-8.
			
 
				+  Carbon::InitLLVM init_llvm(argc, argv);
			
 
				+
			
 
				+  Carbon::SetWorkingDirForBazel();
			
 
				+
			
 
				+  llvm::SmallVector<llvm::StringRef> args(argv + 1, argv + argc);
			
 
				+  bool success = Carbon::Testing::Run(args);
			
 
				+  return success ? EXIT_SUCCESS : EXIT_FAILURE;
			
 
				+}
			
--- a/testing/base/source_gen_test.cpp
+++ b/testing/base/source_gen_test.cpp
@@ -0,0 +1,196 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "testing/base/source_gen.h"
			
 
				+
			
 
				+#include <gmock/gmock.h>
			
 
				+#include <gtest/gtest.h>
			
 
				+
			
 
				+#include "common/set.h"
			
 
				+#include "testing/base/gtest_main.h"
			
 
				+#include "toolchain/driver/driver.h"
			
 
				+
			
 
				+namespace Carbon::Testing {
			
 
				+namespace {
			
 
				+
			
 
				+using ::testing::AllOf;
			
 
				+using ::testing::ContainerEq;
			
 
				+using ::testing::Contains;
			
 
				+using ::testing::Each;
			
 
				+using ::testing::Eq;
			
 
				+using ::testing::Ge;
			
 
				+using ::testing::Gt;
			
 
				+using ::testing::Le;
			
 
				+using ::testing::MatchesRegex;
			
 
				+using ::testing::SizeIs;
			
 
				+
			
 
				// Adds up `size()` over every element of `range` and returns the total.
				// Templated so that neither the outer nor the inner range type is fixed.
				template <typename T>
				static auto SumSizes(const T& range) -> ssize_t {
				  ssize_t total = 0;
				  for (const auto& element : range) {
				    total += element.size();
				  }
				  return total;
				}
			
 
				+
			
 
				+TEST(SourceGenTest, Identifiers) {
			
 
				+  SourceGen gen;
			
 
				+
			
 
				+  auto idents = gen.GetShuffledIdentifiers(1000);
			
 
				+  EXPECT_THAT(idents.size(), Eq(1000));
			
 
				+  for (llvm::StringRef ident : idents) {
			
 
				+    EXPECT_THAT(ident, MatchesRegex("[A-Za-z][A-Za-z0-9_]*"));
			
 
				+  }
			
 
				+
			
 
				+  // We should have at least one identifier of each length [1, 64]. The exact
			
 
				+  // distribution is an implementation detail designed to vaguely match the
			
 
				+  // expected distribution in source code.
			
 
				+  for (int size : llvm::seq_inclusive(1, 64)) {
			
 
				+    EXPECT_THAT(idents, Contains(SizeIs(size)));
			
 
				+  }
			
 
				+
			
 
				+  // Check that identifiers 4 characters or shorter are more common than longer
			
 
				+  // lengths. This is a very rough way of double checking that we got the
			
 
				+  // intended distribution.
			
 
				+  for (int short_size : llvm::seq_inclusive(1, 4)) {
			
 
				+    int short_count = llvm::count_if(idents, [&](auto ident) {
			
 
				+      return static_cast<int>(ident.size()) == short_size;
			
 
				+    });
			
 
				+    for (int long_size : llvm::seq_inclusive(5, 64)) {
			
 
				+      EXPECT_THAT(short_count, Gt(llvm::count_if(idents, [&](auto ident) {
			
 
				+                    return static_cast<int>(ident.size()) == long_size;
			
 
				+                  })));
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Check that repeated calls are different in interesting ways, but have the
			
 
				+  // exact same total bytes.
			
 
				+  ssize_t idents_size_sum = SumSizes(idents);
			
 
				+  for ([[maybe_unused]] int _ : llvm::seq(10)) {
			
 
				+    auto idents2 = gen.GetShuffledIdentifiers(1000);
			
 
				+    EXPECT_THAT(idents2, SizeIs(1000));
			
 
				+    // Should be (at least) a different shuffle of identifiers.
			
 
				+    EXPECT_THAT(idents2, Not(ContainerEq(idents)));
			
 
				+    // But the sum of lengths should be identical.
			
 
				+    EXPECT_THAT(SumSizes(idents2), Eq(idents_size_sum));
			
 
				+  }
			
 
				+
			
 
				+  // Check length constraints have the desired effect.
			
 
				+  idents =
			
 
				+      gen.GetShuffledIdentifiers(1000, /*min_length=*/10, /*max_length=*/20);
			
 
				+  EXPECT_THAT(idents, Each(SizeIs(AllOf(Ge(10), Le(20)))));
			
 
				+}
			
 
				+
			
 
				+TEST(SourceGenTest, UniformIdentifiers) {
			
 
				+  SourceGen gen;
			
 
				+  // Check that uniform identifier length results in exact coverage of each
			
 
				+  // possible length for an easy case, both without and with a remainder.
			
 
				+  auto idents =
			
 
				+      gen.GetShuffledIdentifiers(100, /*min_length=*/10, /*max_length=*/19,
			
 
				+                                 /*uniform=*/true);
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(10)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(11)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(12)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(13)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(14)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(15)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(16)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(17)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(18)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(19)).Times(10));
			
 
				+
			
 
				+  idents = gen.GetShuffledIdentifiers(97, /*min_length=*/10, /*max_length=*/19,
			
 
				+                                      /*uniform=*/true);
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(10)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(11)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(12)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(13)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(14)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(15)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(16)).Times(10));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(17)).Times(9));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(18)).Times(9));
			
 
				+  EXPECT_THAT(idents, Contains(SizeIs(19)).Times(9));
			
 
				+}
			
 
				+
			
 
				+// Largely covered by `Identifiers` and `UniformIdentifiers`, but need to check
			
 
				+// for uniqueness specifically.
			
 
				+TEST(SourceGenTest, UniqueIdentifiers) {
			
 
				+  SourceGen gen;
			
 
				+
			
 
				+  auto unique = gen.GetShuffledUniqueIdentifiers(1000);
			
 
				+  EXPECT_THAT(unique.size(), Eq(1000));
			
 
				+  Set<llvm::StringRef> set;
			
 
				+  for (llvm::StringRef ident : unique) {
			
 
				+    EXPECT_THAT(ident, MatchesRegex("[A-Za-z][A-Za-z0-9_]*"));
			
 
				+    EXPECT_TRUE(set.Insert(ident).is_inserted())
			
 
				+        << "Colliding identifier: " << ident;
			
 
				+  }
			
 
				+
			
 
				+  // Check single length specifically where uniqueness is the most challenging.
			
 
				+  set.Clear();
			
 
				+  unique = gen.GetShuffledUniqueIdentifiers(1000, /*min_length=*/4,
			
 
				+                                            /*max_length=*/4);
			
 
				+  for (llvm::StringRef ident : unique) {
			
 
				+    EXPECT_TRUE(set.Insert(ident).is_inserted())
			
 
				+        << "Colliding identifier: " << ident;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Check that the source code doesn't have compiler errors.
			
 
				+auto TestCompile(llvm::StringRef source) -> bool {
			
 
				+  llvm::vfs::InMemoryFileSystem fs;
			
 
				+  InstallPaths installation(
			
 
				+      InstallPaths::MakeForBazelRunfiles(Testing::GetTestExePath()));
			
 
				+  Driver driver(fs, &installation, llvm::outs(), llvm::errs());
			
 
				+
			
 
				+  // Load the prelude into our VFS.
			
 
				+  //
			
 
				+  // TODO: Factor this and analogous code in file_test into a Driver helper.
			
 
				+  auto prelude =
			
 
				+      Driver::FindPreludeFiles(installation.core_package(), llvm::errs());
			
 
				+  CARBON_CHECK(!prelude.empty());
			
 
				+  for (const auto& path : prelude) {
			
 
				+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> file =
			
 
				+        llvm::MemoryBuffer::getFile(path);
			
 
				+    CARBON_CHECK(file) << file.getError().message();
			
 
				+    CARBON_CHECK(fs.addFile(path, /*ModificationTime=*/0, std::move(*file)))
			
 
				+        << "Duplicate file: " << path;
			
 
				+  }
			
 
				+
			
 
				+  fs.addFile("test.carbon", /*ModificationTime=*/0,
			
 
				+             llvm::MemoryBuffer::getMemBuffer(source));
			
 
				+  return driver.RunCommand({"compile", "--phase=check", "test.carbon"}).success;
			
 
				+}
			
 
				+
			
 
				+TEST(SourceGenTest, GenAPIFileDenseDeclsTest) {
			
 
				+  SourceGen gen;
			
 
				+
			
 
				+  std::string source =
			
 
				+      gen.GenAPIFileDenseDecls(1000, SourceGen::DenseDeclParams{});
			
 
				+  // Should be within 1% of the requested line count.
			
 
				+  EXPECT_THAT(source, Contains('\n').Times(AllOf(Ge(950), Le(1050))));
			
 
				+
			
 
				+  // Make sure we generated valid Carbon code.
			
 
				+  EXPECT_TRUE(TestCompile(source));
			
 
				+}
			
 
				+
			
 
				+TEST(SourceGenTest, GenAPIFileDenseDeclsCppTest) {
			
 
				+  SourceGen gen(SourceGen::Language::Cpp);
			
 
				+
			
 
				+  // Generate a 1000-line file which is enough to have a reasonably accurate
			
 
				+  // line count estimate and have a few classes.
			
 
				+  std::string source =
			
 
				+      gen.GenAPIFileDenseDecls(1000, SourceGen::DenseDeclParams{});
			
 
				+  // Should be within 10% of the requested line count.
			
 
				+  EXPECT_THAT(source, Contains('\n').Times(AllOf(Ge(900), Le(1100))));
			
 
				+
			
 
				+  // TODO: When the driver supports compiling C++ code as easily as Carbon, we
			
 
				+  // should test that the generated C++ code is valid.
			
 
				+}
			
 
				+
			
 
				+}  // namespace
			
 
				+}  // namespace Carbon::Testing
			
--- a/toolchain/driver/BUILD
+++ b/toolchain/driver/BUILD
@@ -52,6 +52,24 @@ cc_test(
 
				     ],
			
 
				 )
			
 
				 
			
 
				# Benchmark binary built from `compile_benchmark.cpp`.
				cc_binary(
				    name = "compile_benchmark",
				    testonly = True,
				    srcs = ["compile_benchmark.cpp"],
				    deps = [
				        ":driver",
				        "//common:benchmark_main",
				        "//testing/base:source_gen_lib",
				        "@llvm-project//llvm:Support",
				    ],
				)

				# Shell test wrapping the benchmark binary, which is provided as test data.
				sh_test(
				    name = "compile_benchmark_test",
				    srcs = ["compile_benchmark_test.sh"],
				    data = [":compile_benchmark"],
				)
			
 
				+
			
 
				 cc_library(
			
 
				     name = "driver",
			
 
				     srcs = ["driver.cpp"],
			
--- a/toolchain/driver/compile_benchmark.cpp
+++ b/toolchain/driver/compile_benchmark.cpp
@@ -0,0 +1,166 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				#include <benchmark/benchmark.h>

				#include <algorithm>
				#include <string>

				#include "common/benchmark_main.h"
				#include "testing/base/source_gen.h"
				#include "toolchain/driver/driver.h"
			
 
				+
			
 
				+namespace Carbon::Testing {
			
 
				+namespace {
			
 
				+
			
 
				+// Helper used to benchmark compilation across different phases.
			
 
				+//
			
 
				+// Handles setting up the compiler's driver, locating the prelude, and managing
			
 
				+// a VFS in which the compilations occur.
			
 
				+class CompileBenchmark {
			
 
				+ public:
			
 
				+  CompileBenchmark()
			
 
				+      : installation_(
			
 
				+            InstallPaths::MakeForBazelRunfiles(GetBenchmarkExePath())),
			
 
				+        driver_(fs_, &installation_, llvm::outs(), llvm::errs()) {
			
 
				+    // Load the prelude into our VFS.
			
 
				+    //
			
 
				+    // TODO: Factor this and analogous code in file_test into a Driver helper.
			
 
				+    auto prelude =
			
 
				+        Driver::FindPreludeFiles(installation_.core_package(), llvm::errs());
			
 
				+    CARBON_CHECK(!prelude.empty());
			
 
				+    for (const auto& path : prelude) {
			
 
				+      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> file =
			
 
				+          llvm::MemoryBuffer::getFile(path);
			
 
				+      CARBON_CHECK(file) << file.getError().message();
			
 
				+      CARBON_CHECK(fs_.addFile(path, /*ModificationTime=*/0, std::move(*file)))
			
 
				+          << "Duplicate file: " << path;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Setup a set of source files in the VFS for the driver. Each string input is
			
 
				+  // materialized into a virtual file and a list of the virtual filenames is
			
 
				+  // returned.
			
 
				+  auto SetUpFiles(llvm::ArrayRef<std::string> sources)
			
 
				+      -> llvm::OwningArrayRef<std::string> {
			
 
				+    llvm::OwningArrayRef<std::string> file_names(sources.size());
			
 
				+    for (ssize_t i : llvm::seq<ssize_t>(sources.size())) {
			
 
				+      file_names[i] = llvm::formatv("file_{0}.carbon", i).str();
			
 
				+      fs_.addFile(file_names[i], /*ModificationTime=*/0,
			
 
				+                  llvm::MemoryBuffer::getMemBuffer(sources[i]));
			
 
				+    }
			
 
				+    return file_names;
			
 
				+  }
			
 
				+
			
 
				+  auto driver() -> Driver& { return driver_; }
			
 
				+  auto gen() -> SourceGen& { return gen_; }
			
 
				+
			
 
				+ private:
			
 
				+  llvm::vfs::InMemoryFileSystem fs_;
			
 
				+  const InstallPaths installation_;
			
 
				+  Driver driver_;
			
 
				+
			
 
				+  SourceGen gen_;
			
 
				+};
			
 
				+
			
 
				+// An enumerator used to select compilation phases to benchmark.
			
 
				+enum class Phase {
			
 
				+  Lex,
			
 
				+  Parse,
			
 
				+  Check,
			
 
				+};
			
 
				+
			
 
				+// Maps the enumerator for a compilation phase into a specific `compile` command
			
 
				+// line flag.
			
 
				+static auto PhaseFlag(Phase phase) -> llvm::StringRef {
			
 
				+  switch (phase) {
			
 
				+    case Phase::Lex:
			
 
				+      return "--phase=lex";
			
 
				+    case Phase::Parse:
			
 
				+      return "--phase=parse";
			
 
				+    case Phase::Check:
			
 
				+      return "--phase=check";
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				// Benchmark on multiple files of the same size but with different source code
				// in order to avoid branch prediction perfectly learning a particular file's
				// structure and shape, and to get closer to a cache-cold benchmark number which
				// is what we generally expect to care about in practice. We enforce an upper
				// bound to avoid excessive benchmark time and a lower bound to avoid anchoring
				// on a single source file that may have unrepresentative content.
				//
				// For simplicity, we compute a number of files from the target line count as a
				// heuristic: a budget of 1024 * 1024 total lines divided by `target_lines`,
				// clamped to [1, 8] in debug builds and to [8, 1024] otherwise.
				//
				// A non-positive `target_lines` is treated as 1 so the division is always
				// well defined.
				static auto ComputeFileCount(int target_lines) -> int {
				  const int lines_per_file = std::max(1, target_lines);
				  const int files_in_budget = (1024 * 1024) / lines_per_file;
				#ifndef NDEBUG
				  // Use a smaller number of files in debug builds where compiles are slower.
				  return std::clamp(files_in_budget, 1, 8);
				#else
				  return std::clamp(files_in_budget, 8, 1024);
				#endif
				}
			
 
				+
			
 
				+template <Phase P>
			
 
				+static auto BM_CompileAPIFileDenseDecls(benchmark::State& state) -> void {
			
 
				+  CompileBenchmark bench;
			
 
				+  int target_lines = state.range(0);
			
 
				+  int num_files = ComputeFileCount(target_lines);
			
 
				+  llvm::OwningArrayRef<std::string> sources(num_files);
			
 
				+
			
 
				+  // Create a collection of random source files. Average the actual number of
			
 
				+  // lines resulting so we can use that to compute the compilation speed as a
			
 
				+  // line-rate counter.
			
 
				+  double avg_lines = 0.0;
			
 
				+  for (std::string& source : sources) {
			
 
				+    source = bench.gen().GenAPIFileDenseDecls(target_lines,
			
 
				+                                              SourceGen::DenseDeclParams{});
			
 
				+    avg_lines += llvm::count(source, '\n');
			
 
				+  }
			
 
				+  avg_lines /= sources.size();
			
 
				+
			
 
				+  // Setup the sources as files for compilation.
			
 
				+  llvm::OwningArrayRef<std::string> file_names = bench.SetUpFiles(sources);
			
 
				+  CARBON_CHECK(static_cast<int>(file_names.size()) == num_files);
			
 
				+
			
 
				+  // We benchmark in batches of files to avoid benchmarking any peculiarities of
			
 
				+  // a single file.
			
 
				+  while (state.KeepRunningBatch(num_files)) {
			
 
				+    for (ssize_t i = 0; i < num_files;) {
			
 
				+      // We block optimizing `i` as that has proven both more effective at
			
 
				+      // blocking the loop from being optimized away and avoiding disruption of
			
 
				+      // the generated code that we're benchmarking.
			
 
				+      benchmark::DoNotOptimize(i);
			
 
				+
			
 
				+      bool success = bench.driver()
			
 
				+                         .RunCommand({"compile", PhaseFlag(P), file_names[i]})
			
 
				+                         .success;
			
 
				+      CARBON_DCHECK(success);
			
 
				+
			
 
				+      // We use the compilation success to step through the file names,
			
 
				+      // establishing a dependency between each lookup. This doesn't fully allow
			
 
				+      // us to measure latency rather than throughput, but minimizes any skew in
			
 
				+      // measurements from speculating the start of the next compilation.
			
 
				+      i += static_cast<ssize_t>(success);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Compute the line-rate of these compilations.
			
 
				+  state.counters["Lines"] = benchmark::Counter(
			
 
				+      avg_lines, benchmark::Counter::kIsIterationInvariantRate);
			
 
				+}
			
 
				+
			
 
				+// Benchmark from 256-line test cases through 256k line test cases, and for each
			
 
				+// phase of compilation.
			
 
				+BENCHMARK(BM_CompileAPIFileDenseDecls<Phase::Lex>)
			
 
				+    ->RangeMultiplier(4)
			
 
				+    ->Range(256, static_cast<int64_t>(256 * 1024));
			
 
				+BENCHMARK(BM_CompileAPIFileDenseDecls<Phase::Parse>)
			
 
				+    ->RangeMultiplier(4)
			
 
				+    ->Range(256, static_cast<int64_t>(256 * 1024));
			
 
				+BENCHMARK(BM_CompileAPIFileDenseDecls<Phase::Check>)
			
 
				+    ->RangeMultiplier(4)
			
 
				+    ->Range(256, static_cast<int64_t>(256 * 1024));
			
 
				+
			
 
				+}  // namespace
			
 
				+}  // namespace Carbon::Testing
			
--- a/toolchain/driver/compile_benchmark_test.sh
+++ b/toolchain/driver/compile_benchmark_test.sh
@@ -0,0 +1,13 @@
 
				#!/usr/bin/env bash
				#
				# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
				# Exceptions. See /LICENSE for license information.
				# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

				# Fail fast on errors and on unset variables instead of building a bogus path.
				set -euo pipefail

				# The benchmark binary is located relative to the test's runfiles directory,
				# which is described by the `TEST_SRCDIR` and `TEST_WORKSPACE` variables.
				BENCHMARK="${TEST_SRCDIR:?TEST_SRCDIR must be set}/${TEST_WORKSPACE:?TEST_WORKSPACE must be set}/toolchain/driver/compile_benchmark"

				# Run the benchmark with the fastest size and a single iteration to make sure it
				# doesn't hit errors.
				exec "$BENCHMARK" \
				  --benchmark_min_time=1x \
				  --benchmark_filter='/256$'
			
--- a/toolchain/lex/BUILD
+++ b/toolchain/lex/BUILD
@@ -289,6 +289,7 @@ cc_binary(
 
				         ":tokenized_buffer",
			
 
				         "//common:benchmark_main",
			
 
				         "//common:check",
			
 
				+        "//testing/base:source_gen_lib",
			
 
				         "//toolchain/base:value_store",
			
 
				         "//toolchain/diagnostics:diagnostic_emitter",
			
 
				         "//toolchain/diagnostics:null_diagnostics",
			
--- a/toolchain/lex/tokenized_buffer_benchmark.cpp
+++ b/toolchain/lex/tokenized_buffer_benchmark.cpp
@@ -11,6 +11,7 @@
 
				 #include "common/check.h"
			
 
				 #include "llvm/ADT/Sequence.h"
			
 
				 #include "llvm/ADT/StringExtras.h"
			
 
				+#include "testing/base/source_gen.h"
			
 
				 #include "toolchain/base/value_store.h"
			
 
				 #include "toolchain/diagnostics/diagnostic_emitter.h"
			
 
				 #include "toolchain/diagnostics/null_diagnostics.h"
			
@@ -26,180 +27,14 @@ namespace {
 
				 // and 1% itself needs to not be too tiny. This makes 100,000 a great balance.
			
 
				 constexpr int NumTokens = 100'000;
			
 
				 
			
 
				-auto IdentifierStartChars() -> llvm::ArrayRef<char> {
			
 
				-  static llvm::SmallVector<char> chars = [] {
			
 
				-    llvm::SmallVector<char> chars;
			
 
				-    chars.push_back('_');
			
 
				-    for (char c : llvm::seq_inclusive('A', 'Z')) {
			
 
				-      chars.push_back(c);
			
 
				-    }
			
 
				-    for (char c : llvm::seq_inclusive('a', 'z')) {
			
 
				-      chars.push_back(c);
			
 
				-    }
			
 
				-    return chars;
			
 
				-  }();
			
 
				-  return chars;
			
 
				-}
			
 
				-
			
 
				-auto IdentifierChars() -> llvm::ArrayRef<char> {
			
 
				-  static llvm::SmallVector<char> chars = [] {
			
 
				-    llvm::ArrayRef<char> start_chars = IdentifierStartChars();
			
 
				-    llvm::SmallVector<char> chars(start_chars.begin(), start_chars.end());
			
 
				-    for (char c : llvm::seq_inclusive('0', '9')) {
			
 
				-      chars.push_back(c);
			
 
				-    }
			
 
				-    return chars;
			
 
				-  }();
			
 
				-  return chars;
			
 
				-}
			
 
				-
			
 
				-// Generates a random identifier string of the specified length using the
			
 
				-// provided RNG BitGen.
			
 
				-auto GenerateRandomIdentifier(absl::BitGen& gen, int length) -> std::string {
			
 
				-  llvm::ArrayRef<char> start_chars = IdentifierStartChars();
			
 
				-  llvm::ArrayRef<char> chars = IdentifierChars();
			
 
				-
			
 
				-  std::string id_result;
			
 
				-  llvm::raw_string_ostream os(id_result);
			
 
				-  llvm::StringRef id;
			
 
				-  do {
			
 
				-    // Erase any prior attempts to find an identifier.
			
 
				-    id_result.clear();
			
 
				-    os << start_chars[absl::Uniform<int>(gen, 0, start_chars.size())];
			
 
				-    for (int j : llvm::seq(0, length)) {
			
 
				-      static_cast<void>(j);
			
 
				-      os << chars[absl::Uniform<int>(gen, 0, chars.size())];
			
 
				-    }
			
 
				-    // Check if we ended up forming an integer type literal or a keyword, and
			
 
				-    // try again.
			
 
				-    id = llvm::StringRef(id_result);
			
 
				-  } while (
			
 
				-      llvm::any_of(TokenKind::KeywordTokens,
			
 
				-                   [id](auto token) { return id == token.fixed_spelling(); }) ||
			
 
				-      ((id.consume_front("i") || id.consume_front("u") ||
			
 
				-        id.consume_front("f")) &&
			
 
				-       llvm::all_of(id, [](const char c) { return llvm::isDigit(c); })));
			
 
				-  return id_result;
			
 
				-}
			
 
				-
			
 
				-// Get a static pool of random identifiers with the desired distribution.
			
 
				-template <int MinLength = 1, int MaxLength = 64, bool Uniform = false>
			
 
				-auto GetRandomIdentifiers() -> const std::array<std::string, NumTokens>& {
			
 
				-  static_assert(MinLength <= MaxLength);
			
 
				-  static_assert(
			
 
				-      Uniform || MaxLength <= 64,
			
 
				-      "Cannot produce a meaningful non-uniform distribution of lengths longer "
			
 
				-      "than 64 as those are exceedingly rare in our observed data sets.");
			
 
				-
			
 
				-  static const std::array<std::string, NumTokens> id_storage = [] {
			
 
				-    std::array<int, 64> id_length_counts;
			
 
				-    // For non-uniform distribution, we simulate a distribution roughly based on
			
 
				-    // the observed histogram of identifier lengths, but smoothed a bit and
			
 
				-    // reduced to small counts so that we cycle through all the lengths
			
 
				-    // reasonably quickly. We want sampling of even 10% of NumTokens from this
			
 
				-    // in a round-robin form to not be skewed overly much. This still inherently
			
 
				-    // compresses the long tail as we'd rather have coverage even though it
			
 
				-    // distorts the distribution a bit.
			
 
				-    //
			
 
				-    // The distribution here comes from a script that analyzes source code run
			
 
				-    // over a few directories of LLVM. The script renders a visual ascii-art
			
 
				-    // histogram along with the data for each bucket, and that output is
			
 
				-    // included in comments above each bucket size below to help visualize the
			
 
				-    // rough shape we're aiming for.
			
 
				-    //
			
 
				-    // 1 characters   [3976]  ███████████████████████████████▊
			
 
				-    id_length_counts[0] = 40;
			
 
				-    // 2 characters   [3724]  █████████████████████████████▊
			
 
				-    id_length_counts[1] = 40;
			
 
				-    // 3 characters   [4173]  █████████████████████████████████▍
			
 
				-    id_length_counts[2] = 40;
			
 
				-    // 4 characters   [5000]  ████████████████████████████████████████
			
 
				-    id_length_counts[3] = 50;
			
 
				-    // 5 characters   [1568]  ████████████▌
			
 
				-    id_length_counts[4] = 20;
			
 
				-    // 6 characters   [2226]  █████████████████▊
			
 
				-    id_length_counts[5] = 20;
			
 
				-    // 7 characters   [2380]  ███████████████████
			
 
				-    id_length_counts[6] = 20;
			
 
				-    // 8 characters   [1786]  ██████████████▎
			
 
				-    id_length_counts[7] = 18;
			
 
				-    // 9 characters   [1397]  ███████████▏
			
 
				-    id_length_counts[8] = 12;
			
 
				-    // 10 characters  [ 739]  █████▉
			
 
				-    id_length_counts[9] = 12;
			
 
				-    // 11 characters  [ 779]  ██████▎
			
 
				-    id_length_counts[10] = 12;
			
 
				-    // 12 characters  [1344]  ██████████▊
			
 
				-    id_length_counts[11] = 12;
			
 
				-    // 13 characters  [ 498]  ████
			
 
				-    id_length_counts[12] = 5;
			
 
				-    // 14 characters  [ 284]  ██▎
			
 
				-    id_length_counts[13] = 3;
			
 
				-    // 15 characters  [ 172]  █▍
			
 
				-    // 16 characters  [ 278]  ██▎
			
 
				-    // 17 characters  [ 191]  █▌
			
 
				-    // 18 characters  [ 207]  █▋
			
 
				-    for (int i : llvm::seq(14, 18)) {
			
 
				-      id_length_counts[i] = 2;
			
 
				-    }
			
 
				-    // 19 - 63 characters are all <100 but non-zero, and we map them to 1 for
			
 
				-    // coverage despite slightly over weighting the tail.
			
 
				-    for (int i : llvm::seq(18, 64)) {
			
 
				-      id_length_counts[i] = 1;
			
 
				-    }
			
 
				-
			
 
				-    // Used to track the different count buckets when in a non-uniform
			
 
				-    // distribution.
			
 
				-    int length_bucket_index = 0;
			
 
				-    int length_count = 0;
			
 
				-
			
 
				-    std::array<std::string, NumTokens> ids;
			
 
				-    absl::BitGen gen;
			
 
				-    for (auto [i, id] : llvm::enumerate(ids)) {
			
 
				-      if (Uniform) {
			
 
				-        // Rather than using randomness, for a uniform distribution rotate
			
 
				-        // lengths in round-robin to get a deterministic and exact size on every
			
 
				-        // run. We will then shuffle them at the end to produce a random
			
 
				-        // ordering.
			
 
				-        int length = MinLength + i % (1 + MaxLength - MinLength);
			
 
				-        id = GenerateRandomIdentifier(gen, length);
			
 
				-        continue;
			
 
				-      }
			
 
				-
			
 
				-      // For non-uniform distribution, walk through each each length bucket
			
 
				-      // until our count matches the desired distribution, and then move to the
			
 
				-      // next.
			
 
				-      id = GenerateRandomIdentifier(gen, length_bucket_index + 1);
			
 
				-
			
 
				-      if (length_count < id_length_counts[length_bucket_index]) {
			
 
				-        ++length_count;
			
 
				-      } else {
			
 
				-        length_bucket_index =
			
 
				-            (length_bucket_index + 1) % id_length_counts.size();
			
 
				-        length_count = 0;
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    return ids;
			
 
				-  }();
			
 
				-  return id_storage;
			
 
				-}
			
 
				-
			
 
				 // Compute a random sequence of just identifiers.
			
 
				-template <int MinLength = 1, int MaxLength = 64, bool Uniform = false>
			
 
				-auto RandomIdentifierSeq(llvm::StringRef separator = " ") -> std::string {
			
 
				-  // Get a static pool of identifiers with the desired distribution.
			
 
				-  const std::array<std::string, NumTokens>& ids =
			
 
				-      GetRandomIdentifiers<MinLength, MaxLength, Uniform>();
			
 
				-
			
 
				-  // Shuffle tokens so we get exactly one of each identifier but in a random
			
 
				-  // order.
			
 
				-  std::array<llvm::StringRef, NumTokens> tokens;
			
 
				-  for (int i : llvm::seq(NumTokens)) {
			
 
				-    tokens[i] = ids[i];
			
 
				-  }
			
 
				-  std::shuffle(tokens.begin(), tokens.end(), absl::BitGen());
			
 
				-  return llvm::join(tokens, separator);
			
 
				+static auto RandomIdentifierSeq(int min_length, int max_length, bool uniform,
			
 
				+                                llvm::StringRef separator = " ")
			
 
				+    -> std::string {
			
 
				+  auto& gen = Testing::SourceGen::Global();
			
 
				+  llvm::SmallVector<llvm::StringRef> ids =
			
 
				+      gen.GetShuffledIdentifiers(NumTokens, min_length, max_length, uniform);
			
 
				+  return llvm::join(ids, separator);
			
 
				 }
			
 
				 
			
 
				 auto GetSymbolTokenTable() -> llvm::ArrayRef<TokenKind> {
			
@@ -299,7 +134,6 @@ auto RandomSource(RandomSourceOptions options) -> std::string {
 
				   // Get static pools of symbols, keywords, and identifiers.
			
 
				   llvm::ArrayRef<TokenKind> symbols = GetSymbolTokenTable();
			
 
				   llvm::ArrayRef<TokenKind> keywords = TokenKind::KeywordTokens;
			
 
				-  const std::array<std::string, NumTokens>& ids = GetRandomIdentifiers();
			
 
				 
			
 
				   // Build a list of StringRefs from the different types with the desired
			
 
				   // distribution, then shuffle that list.
			
@@ -312,6 +146,8 @@ auto RandomSource(RandomSourceOptions options) -> std::string {
 
				       << "We require at least 500 identifiers as we need to collect a "
			
 
				          "reasonable number of samples to end up with a reasonable "
			
 
				          "distribution of lengths.";
			
 
				+  llvm::SmallVector<llvm::StringRef> ids =
			
 
				+      Testing::SourceGen::Global().GetIdentifiers(num_identifiers);
			
 
				 
			
 
				   for (int i : llvm::seq(num_symbols)) {
			
 
				     tokens[i] = symbols[i % symbols.size()].fixed_spelling();
			
@@ -454,7 +290,8 @@ BENCHMARK(BM_ValidKeywordsAsRawIdentifiers);
 
				 // This benchmark does a 50-50 split of r-prefixed and r#-prefixed identifiers
			
 
				 // to directly compare raw and non-raw performance.
			
 
				 void BM_RawIdentifierFocus(benchmark::State& state) {
			
 
				-  const std::array<std::string, NumTokens>& ids = GetRandomIdentifiers();
			
 
				+  llvm::SmallVector<llvm::StringRef> ids =
			
 
				+      Testing::SourceGen::Global().GetIdentifiers(NumTokens / 2);
			
 
				 
			
 
				   llvm::SmallVector<std::string> modified_ids;
			
 
				   // As we resize, start with the in-use prefix. Note that `r#` uses the first
			
@@ -490,7 +327,7 @@ BENCHMARK(BM_RawIdentifierFocus);
 
				 
			
 
				 template <int MinLength, int MaxLength, bool Uniform>
			
 
				 void BM_ValidIdentifiers(benchmark::State& state) {
			
 
				-  std::string source = RandomIdentifierSeq<MinLength, MaxLength, Uniform>();
			
 
				+  std::string source = RandomIdentifierSeq(MinLength, MaxLength, Uniform);
			
 
				 
			
 
				   LexerBenchHelper helper(source);
			
 
				   for (auto _ : state) {
			
@@ -525,7 +362,7 @@ BENCHMARK(BM_ValidIdentifiers<80, 80, /*Uniform=*/true>);
 
				 void BM_HorizontalWhitespace(benchmark::State& state) {
			
 
				   int num_spaces = state.range(0);
			
 
				   std::string separator(num_spaces, ' ');
			
 
				-  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(separator);
			
 
				+  std::string source = RandomIdentifierSeq(3, 5, /*uniform=*/true, separator);
			
 
				 
			
 
				   LexerBenchHelper helper(source);
			
 
				   for (auto _ : state) {
			
@@ -579,7 +416,8 @@ void BM_GroupingSymbols(benchmark::State& state) {
 
				   // It should still let us look for specific pain points. We do include some
			
 
				   // whitespace and keywords to make sure *some* other parts of the benchmark
			
 
				   // are also active and have some reasonable icache pressure.
			
 
				-  const std::array<std::string, NumTokens>& ids = GetRandomIdentifiers();
			
 
				+  llvm::SmallVector<llvm::StringRef> ids =
			
 
				+      Testing::SourceGen::Global().GetShuffledIdentifiers(NumTokens);
			
 
				   std::string source;
			
 
				   llvm::raw_string_ostream os(source);
			
 
				   int num_tokens_per_nest =
			
@@ -658,7 +496,7 @@ BENCHMARK(BM_GroupingSymbols)
 
				 void BM_BlankLines(benchmark::State& state) {
			
 
				   int num_blank_lines = state.range(0);
			
 
				   std::string separator(num_blank_lines, '\n');
			
 
				-  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(separator);
			
 
				+  std::string source = RandomIdentifierSeq(3, 5, /*uniform=*/true, separator);
			
 
				 
			
 
				   LexerBenchHelper helper(source);
			
 
				   for (auto _ : state) {
			
@@ -693,7 +531,7 @@ void BM_CommentLines(benchmark::State& state) {
 
				     os << std::string(comment_indent, ' ') << "//"
			
 
				        << std::string(comment_length, ' ') << "\n";
			
 
				   }
			
 
				-  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(separator);
			
 
				+  std::string source = RandomIdentifierSeq(3, 5, /*uniform=*/true, separator);
			
 
				 
			
 
				   LexerBenchHelper helper(source);
			
 
				   for (auto _ : state) {