
Introduce custom hash table data structures. (#3940)

The hash table design is heavily based on Abseil's ["Swiss
Tables"][swiss-tables]. It uses an array of bytes storing metadata about
each entry and an array of entries, where each entry is a key/value
pair. The metadata byte consists of 7 bits of the key's hash (distinct
from the bits used to index the table) and one bit indicating whether
the slot holds a special entry -- either empty or deleted.

[swiss-tables]: https://abseil.io/about/design/swisstables
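
As a rough, hedged illustration only (the actual byte values and names live in
`raw_hashtable_metadata_group.h` and may differ), one plausible encoding along
the lines described above, with the 7-bit tag presumably coming from something
like the tag half of `HashCode::ExtractIndexAndTag<7>()` in `common/hashing.h`:

```cpp
#include <cstdint>

// Sketch only: a set high bit marks a present entry whose low 7 bits carry a
// tag from the key's hash; otherwise the byte is one of the special markers.
inline constexpr uint8_t kEmptyByte = 0x00;
inline constexpr uint8_t kDeletedByte = 0x01;
inline constexpr uint8_t kPresentMask = 0x80;

constexpr auto MakePresentByte(uint32_t hash_tag) -> uint8_t {
  return kPresentMask | static_cast<uint8_t>(hash_tag & 0x7F);
}
constexpr auto IsPresent(uint8_t metadata) -> bool {
  return (metadata & kPresentMask) != 0;
}
```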

There is a wide range of optimizations and other nuanced aspects to this
hash table design and implementation. A good starting point for
understanding that context is `raw_hashtable.h`, which has an overview
of the design and references to the other files with relevant details.

---------

Co-authored-by: josh11b <15258583+josh11b@users.noreply.github.com>
Chandler Carruth, 1 year ago
commit 21a81bc59e

+ 4 - 0
.bazelrc

@@ -9,6 +9,10 @@ build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_ti
 build:clang-tidy --output_groups=report
 build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config
 
+# This warning seems to incorrectly fire in this build configuration, despite
+# not firing in our normal builds.
+build:clang-tidy --copt=-Wno-unknown-pragmas
+
 # Default to using a disk cache to minimize re-building LLVM and Clang which we
 # try to avoid updating too frequently to minimize rebuild cost. The location
 # here can be overridden in the user configuration where needed.

+ 1 - 0
.codespell_ignore

@@ -11,6 +11,7 @@ createor
 crossreference
 falsy
 forin
+groupt
 inout
 parameteras
 pullrequest

+ 175 - 0
common/BUILD

@@ -182,6 +182,14 @@ cc_binary(
     ],
 )
 
+cc_library(
+    name = "hashtable_key_context",
+    hdrs = ["hashtable_key_context.h"],
+    deps = [
+        ":hashing",
+    ],
+)
+
 cc_library(
     name = "indirect_value",
     hdrs = ["indirect_value.h"],
@@ -224,6 +232,53 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "map",
+    hdrs = ["map.h"],
+    deps = [
+        ":check",
+        ":hashtable_key_context",
+        ":raw_hashtable",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_test(
+    name = "map_test",
+    srcs = ["map_test.cpp"],
+    deps = [
+        ":map",
+        ":raw_hashtable_test_helpers",
+        "//testing/base:gtest_main",
+        "//testing/base:test_raw_ostream",
+        "@googletest//:gtest",
+    ],
+)
+
+cc_binary(
+    name = "map_benchmark",
+    testonly = 1,
+    srcs = ["map_benchmark.cpp"],
+    deps = [
+        ":map",
+        ":raw_hashtable_benchmark_helpers",
+        "@abseil-cpp//absl/container:flat_hash_map",
+        "@abseil-cpp//absl/random",
+        "@google_benchmark//:benchmark_main",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+sh_test(
+    name = "map_benchmark_test",
+    # The benchmark allocates a large amount of memory.
+    size = "enormous",
+    # We configure the test to run quickly.
+    timeout = "short",
+    srcs = ["map_benchmark_test.sh"],
+    data = [":map_benchmark"],
+)
+
 cc_library(
     name = "ostream",
     hdrs = ["ostream.h"],
@@ -232,6 +287,126 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "raw_hashtable",
+    srcs = ["raw_hashtable.cpp"],
+    hdrs = ["raw_hashtable.h"],
+    deps = [
+        ":check",
+        ":hashing",
+        ":hashtable_key_context",
+        ":raw_hashtable_metadata_group",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_library(
+    name = "raw_hashtable_metadata_group",
+    srcs = ["raw_hashtable_metadata_group.cpp"],
+    hdrs = ["raw_hashtable_metadata_group.h"],
+    deps = [
+        ":check",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_binary(
+    name = "raw_hashtable_metadata_group_benchmark",
+    testonly = 1,
+    srcs = ["raw_hashtable_metadata_group_benchmark.cpp"],
+    deps = [
+        ":raw_hashtable_metadata_group",
+        "@abseil-cpp//absl/random",
+        "@google_benchmark//:benchmark_main",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+sh_test(
+    name = "raw_hashtable_metadata_group_benchmark_test",
+    srcs = ["raw_hashtable_metadata_group_benchmark_test.sh"],
+    data = [":raw_hashtable_metadata_group_benchmark"],
+)
+
+cc_library(
+    name = "raw_hashtable_benchmark_helpers",
+    testonly = 1,
+    srcs = ["raw_hashtable_benchmark_helpers.cpp"],
+    hdrs = ["raw_hashtable_benchmark_helpers.h"],
+    copts = [
+        "-O2",  # Always optimize to make testing benchmarks faster.
+    ],
+    deps = [
+        ":check",
+        ":hashing",
+        ":raw_hashtable",
+        ":set",
+        "@abseil-cpp//absl/base:no_destructor",
+        "@abseil-cpp//absl/hash",
+        "@abseil-cpp//absl/random",
+        "@google_benchmark//:benchmark",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_library(
+    name = "raw_hashtable_test_helpers",
+    testonly = 1,
+    hdrs = ["raw_hashtable_test_helpers.h"],
+    deps = [
+        ":check",
+        ":hashing",
+        ":hashtable_key_context",
+        ":ostream",
+    ],
+)
+
+cc_library(
+    name = "set",
+    hdrs = ["set.h"],
+    deps = [
+        ":check",
+        ":hashtable_key_context",
+        ":raw_hashtable",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_test(
+    name = "set_test",
+    srcs = ["set_test.cpp"],
+    deps = [
+        ":raw_hashtable_test_helpers",
+        ":set",
+        "//testing/base:gtest_main",
+        "//testing/base:test_raw_ostream",
+        "@googletest//:gtest",
+    ],
+)
+
+cc_binary(
+    name = "set_benchmark",
+    testonly = 1,
+    srcs = ["set_benchmark.cpp"],
+    deps = [
+        ":raw_hashtable_benchmark_helpers",
+        ":set",
+        "@abseil-cpp//absl/container:flat_hash_set",
+        "@google_benchmark//:benchmark_main",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+sh_test(
+    name = "set_benchmark_test",
+    # The benchmark allocates a large amount of memory.
+    size = "enormous",
+    # We configure the test to run quickly.
+    timeout = "short",
+    srcs = ["set_benchmark_test.sh"],
+    data = [":set_benchmark"],
+)
+
 cc_library(
     name = "string_helpers",
     srcs = ["string_helpers.cpp"],

+ 2 - 2
common/hashing.h

@@ -573,9 +573,9 @@ constexpr auto HashCode::ExtractIndex() -> ssize_t { return value_; }
 template <int N>
 constexpr auto HashCode::ExtractIndexAndTag() -> std::pair<ssize_t, uint32_t> {
   static_assert(N >= 1);
-  static_assert(N <= 32);
+  static_assert(N < 32);
   return {static_cast<ssize_t>(value_ >> N),
-          static_cast<uint32_t>(value_ & ((1U << (N + 1)) - 1))};
+          static_cast<uint32_t>(value_ & ((1U << N) - 1))};
 }
 
 // Building with `-DCARBON_MCA_MARKERS` will enable `llvm-mca` annotations in
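
For context on the mask change above (a sketch, not part of the diff): the old
expression kept `N + 1` low bits in the tag rather than `N`, which is what the
new `hashing_test.cpp` expectations below verify.

```cpp
// Masking arithmetic for N = 2, chosen purely for illustration.
static_assert(((1U << (2 + 1)) - 1) == 0b111, "old mask kept N + 1 bits");
static_assert(((1U << 2) - 1) == 0b11, "new mask keeps exactly N bits");
```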

+ 6 - 0
common/hashing_test.cpp

@@ -40,6 +40,12 @@ TEST(HashingTest, HashCodeAPI) {
   EXPECT_THAT(a.ExtractIndex(), Ne(b.ExtractIndex()));
   EXPECT_THAT(a.ExtractIndex(), Ne(empty.ExtractIndex()));
 
+  // The tag shouldn't have bits set outside the range requested.
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<1>().second & ~0b1, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>().second & ~0b11, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<3>().second & ~0b111, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<4>().second & ~0b1111, Eq(0));
+
   // Note that the index produced with a tag may be different from the index
   // alone!
   EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>(),

+ 85 - 0
common/hashtable_key_context.h

@@ -0,0 +1,85 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
+#define CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
+
+#include "common/hashing.h"
+
+namespace Carbon {
+
+// Customizable context for keys in hashtables.
+//
+// This type or customizations matching its API are used with the data
+// structures in `map.h` and `set.h`. By providing a custom version of the
+// `KeyContext` type parameter to those data structures, users can provide
+// either stateless or stateful customization of the two core hashtable key
+// operations: hashing and comparison.
+//
+// The default for hashing uses Carbon's `hashing.h`. Customizations must still
+// return a `HashCode` as defined there, and it needs to have the same core
+// properties of hashes produced by the `hashing.h` infrastructure.
+//
+// The default for comparison is `operator==`. The `KeyEq` method is always
+// called with a key *stored in the hashtable* as the second or "RHS" parameter.
+// This is to allow simplifying the set of overloads needed for heterogeneous
+// contexts: only the first, LHS, parameter needs to support different lookup
+// key types.
+//
+// Custom KeyContext types should have the same API as the default type.
+// They can choose to use templates to support heterogeneous key types or not as
+// appropriate. The default context can also be used as a base class with only
+// one or the other APIs customized.
+//
+// An important consideration is how the key context is constructed. When the
+// key context can be default constructed, hashtable APIs trafficking in keys
+// will have overloads that provide a default constructed key context. When the
+// context is *not* default constructible, every API that accepts a key will
+// also require a context argument to be called, and that argument will be used
+// throughout that operation. The intent is to allow callers to provide stateful
+// contexts to each API where it would be needed, while managing that state
+// outside the hashtable. Often the needed state is trivially part of the
+// caller's existing state and needn't be stored separately.
+//
+// Example for a stateful, customized key context for interned strings:
+// ```cpp
+// class InternedStringIndexKeyContext {
+//  public:
+//   InternedStringIndexKeyContext(
+//       llvm::ArrayRef<llvm::StringRef> interned_strings)
+//       : interned_strings_(interned_strings) {}
+//
+//   auto HashKey(llvm::StringRef s, uint64_t seed) const -> HashCode {
+//     return HashValue(s, seed);
+//   }
+//   auto HashKey(int index_key, uint64_t seed) const -> HashCode {
+//     return HashKey(interned_strings_[index_key], seed);
+//   }
+//
+//   auto KeyEq(llvm::StringRef lhs, int rhs_index) const -> bool {
+//     return lhs == interned_strings_[rhs_index];
+//   }
+//   auto KeyEq(int lhs_index, int rhs_index) const -> bool {
+//     return KeyEq(interned_strings_[lhs_index], rhs_index);
+//   }
+//
+//  private:
+//   llvm::ArrayRef<llvm::StringRef> interned_strings_;
+// };
+// ```
+struct DefaultKeyContext {
+  template <typename KeyT>
+  auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode {
+    return HashValue(key, seed);
+  }
+
+  template <typename LHSKeyT, typename RHSKeyT>
+  auto KeyEq(const LHSKeyT& lhs_key, const RHSKeyT& rhs_key) const -> bool {
+    return lhs_key == rhs_key;
+  }
+};
+
+}  // namespace Carbon
+
+#endif  // CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
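
As a usage sketch (not part of the change, with hypothetical function and
variable names), a stateful, non-default-constructible context like the
`InternedStringIndexKeyContext` example above would be threaded through each
keyed call on the `Map` type added in `common/map.h` below:

```cpp
#include "common/map.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

// Assumes the `InternedStringIndexKeyContext` sketched in the comment above is
// defined and visible here.
auto CountString(llvm::ArrayRef<llvm::StringRef> interned_strings,
                 llvm::StringRef s) -> void {
  InternedStringIndexKeyContext context(interned_strings);

  // Keys are indices into `interned_strings`; hashing and comparison go
  // through the context. Because the context is not default constructible,
  // every keyed API call takes it explicitly.
  Carbon::Map<int, int, 0, InternedStringIndexKeyContext> counts;
  counts.Insert(/*lookup_key=*/0, /*new_v=*/1, context);

  // Heterogeneous lookup: a string lookup key is hashed and compared against
  // the stored index keys by the same context.
  if (auto result = counts.Lookup(s, context)) {
    ++result.value();
  }
}
```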

+ 558 - 0
common/map.h

@@ -0,0 +1,558 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_MAP_H_
+#define CARBON_COMMON_MAP_H_
+
+#include <algorithm>
+#include <concepts>
+#include <utility>
+
+#include "common/check.h"
+#include "common/hashtable_key_context.h"
+#include "common/raw_hashtable.h"
+#include "llvm/Support/Compiler.h"
+
+namespace Carbon {
+
+// Forward declarations to resolve cyclic references.
+template <typename KeyT, typename ValueT, typename KeyContextT>
+class MapView;
+template <typename KeyT, typename ValueT, typename KeyContextT>
+class MapBase;
+template <typename KeyT, typename ValueT, ssize_t SmallSize,
+          typename KeyContextT>
+class Map;
+
+// A read-only view type for a map from key to value.
+//
+// This view is a cheap-to-copy type that should be passed by value, but
+// provides view or read-only reference semantics to the underlying map data
+// structure.
+//
+// This should always be preferred to a `const`-ref parameter for the `MapBase`
+// or `Map` type as it provides more flexibility and a cleaner API.
+//
+// Note that while this type is a read-only view, that applies to the underlying
+// *map* data structure, not the individual entries stored within it. Those can
+// be mutated freely as long as both the hashes and equality of the keys are
+// preserved. If we applied a deep-`const` design here, it would prevent using
+// this type in many useful situations where the elements are mutated but the
+// associative container is not. A view of immutable data can always be obtained
+// by using `MapView<const T, const V>`, and we enable conversions to more-const
+// views. This mirrors the semantics of views like `std::span`.
+//
+// A specific `KeyContextT` type can optionally be provided to configure how
+// keys will be hashed and compared. The default is `DefaultKeyContext` which is
+// stateless and will hash using `Carbon::HashValue` and compare using
+// `operator==`. Every method accepting a lookup key or operating on the keys in
+// the table will also accept an instance of this type. For stateless context
+// types, including the default, an instance will be default constructed if not
+// provided to these methods. However, stateful contexts should be constructed
+// and passed in explicitly. The context type should be small and reasonable to
+// pass by value, often a wrapper or pointer to the relevant context needed for
+// hashing and comparing keys. For more details about the key context, see
+// `hashtable_key_context.h`.
+template <typename InputKeyT, typename InputValueT,
+          typename InputKeyContextT = DefaultKeyContext>
+class MapView
+    : RawHashtable::ViewImpl<InputKeyT, InputValueT, InputKeyContextT> {
+  using ImplT =
+      RawHashtable::ViewImpl<InputKeyT, InputValueT, InputKeyContextT>;
+  using EntryT = typename ImplT::EntryT;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using ValueT = typename ImplT::ValueT;
+  using KeyContextT = typename ImplT::KeyContextT;
+
+  // This type represents the result of lookup operations. It encodes whether
+  // the lookup was a success as well as accessors for the key and value.
+  class LookupKVResult {
+   public:
+    LookupKVResult() = default;
+    explicit LookupKVResult(EntryT* entry) : entry_(entry) {}
+
+    explicit operator bool() const { return entry_ != nullptr; }
+
+    auto key() const -> KeyT& { return entry_->key(); }
+    auto value() const -> ValueT& { return entry_->value(); }
+
+   private:
+    EntryT* entry_ = nullptr;
+  };
+
+  // Enable implicit conversions that add `const`-ness to either key or value
+  // type. This is always safe to do with a view. We use a template to avoid
+  // needing all 3 versions.
+  template <typename OtherKeyT, typename OtherValueT>
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  MapView(MapView<OtherKeyT, OtherValueT, KeyContextT> other_view)
+    requires(std::same_as<KeyT, OtherKeyT> ||
+             std::same_as<KeyT, const OtherKeyT>) &&
+            (std::same_as<ValueT, OtherValueT> ||
+             std::same_as<ValueT, const OtherValueT>)
+      : ImplT(other_view) {}
+
+  // Tests whether a key is present in the map.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool;
+
+  // Lookup a key in the map.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupKVResult;
+
+  // Lookup a key in the map and try to return a pointer to its value. Returns
+  // null on a missing key.
+  template <typename LookupKeyT>
+  auto operator[](LookupKeyT lookup_key) const
+      -> ValueT* requires(std::default_initializable<KeyContextT>);
+
+  // Run the provided callback for every key and value in the map.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&, ValueT&>);
+
+  // This routine is relatively inefficient and only intended for use in
+  // benchmarking or logging of performance anomalies. The count returned has
+  // no guarantees beyond being informative in benchmarks.
+  // It counts how many of the keys in the hashtable have required probing
+  // beyond their initial group of slots.
+  //
+  // TODO: Replace with a more general metrics routine that covers other
+  // important aspects such as load factor, and average probe *distance*.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t {
+    return ImplT::CountProbedKeys(key_context);
+  }
+
+ private:
+  template <typename MapKeyT, typename MapValueT, ssize_t MinSmallSize,
+            typename KeyContextT>
+  friend class Map;
+  friend class MapBase<KeyT, ValueT, KeyContextT>;
+  friend class MapView<const KeyT, ValueT, KeyContextT>;
+  friend class MapView<KeyT, const ValueT, KeyContextT>;
+  friend class MapView<const KeyT, const ValueT, KeyContextT>;
+
+  MapView() = default;
+  // NOLINTNEXTLINE(google-explicit-constructor): Implicit by design.
+  MapView(ImplT base) : ImplT(base) {}
+  MapView(ssize_t size, RawHashtable::Storage* storage)
+      : ImplT(size, storage) {}
+};
+
+// A base class for a `Map` type that remains mutable while type-erasing the
+// `SmallSize` (SSO) template parameter.
+//
+// A pointer or reference to this type is the preferred way to pass a mutable
+// handle to a `Map` type across API boundaries as it avoids encoding specific
+// SSO sizing information while providing a near-complete mutable API.
+template <typename InputKeyT, typename InputValueT,
+          typename InputKeyContextT = DefaultKeyContext>
+class MapBase : protected RawHashtable::BaseImpl<InputKeyT, InputValueT,
+                                                 InputKeyContextT> {
+ protected:
+  using ImplT =
+      RawHashtable::BaseImpl<InputKeyT, InputValueT, InputKeyContextT>;
+  using EntryT = typename ImplT::EntryT;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using ValueT = typename ImplT::ValueT;
+  using KeyContextT = typename ImplT::KeyContextT;
+  using ViewT = MapView<KeyT, ValueT, KeyContextT>;
+  using LookupKVResult = typename ViewT::LookupKVResult;
+
+  // The result type for insertion operations both indicates whether an insert
+  // was needed (as opposed to finding an existing element), and provides access
+  // to the element's key and value.
+  class InsertKVResult {
+   public:
+    InsertKVResult() = default;
+    explicit InsertKVResult(bool inserted, EntryT& entry)
+        : entry_(&entry), inserted_(inserted) {}
+
+    auto is_inserted() const -> bool { return inserted_; }
+
+    auto key() const -> KeyT& { return entry_->key(); }
+    auto value() const -> ValueT& { return entry_->value(); }
+
+   private:
+    EntryT* entry_;
+    bool inserted_;
+  };
+
+  // Implicitly convertible to the relevant view type.
+  //
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator ViewT() const { return this->view_impl(); }
+
+  // We can't chain the above conversion with the conversions on `ViewT` to add
+  // const, so explicitly support adding const to produce a view here.
+  template <typename OtherKeyT, typename OtherValueT>
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator MapView<OtherKeyT, OtherValueT, KeyContextT>() const
+    requires(std::same_as<KeyT, OtherKeyT> ||
+             std::same_as<const KeyT, OtherKeyT>) &&
+            (std::same_as<ValueT, OtherValueT> ||
+             std::same_as<const ValueT, OtherValueT>)
+  {
+    return ViewT(*this);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool {
+    return ViewT(*this).Contains(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupKVResult {
+    return ViewT(*this).Lookup(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto operator[](LookupKeyT lookup_key) const
+      -> ValueT* requires(std::default_initializable<KeyContextT>) {
+        return ViewT(*this)[lookup_key];
+      }
+
+  // Convenience forwarder to the view type.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&, ValueT&>)
+  {
+    return ViewT(*this).ForEach(callback);
+  }
+
+  // Convenience forwarder to the view type.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const
+      -> ssize_t {
+    return ViewT(*this).CountProbedKeys(key_context);
+  }
+
+  // Insert a key and value into the map. If the key is already present, the new
+  // value is discarded and the existing value preserved.
+  template <typename LookupKeyT>
+  auto Insert(LookupKeyT lookup_key, ValueT new_v,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult;
+
+  // Insert a key into the map and call the provided callback if necessary to
+  // produce a new value when no existing value is found.
+  //
+  // Example: `m.Insert(key, [] { return default_value; });`
+  //
+  // TODO: The `;` formatting below appears to be a bug in clang-format with
+  // concepts that should be filed upstream.
+  template <typename LookupKeyT, typename ValueCallbackT>
+  auto Insert(LookupKeyT lookup_key, ValueCallbackT value_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(
+        !std::same_as<ValueT, ValueCallbackT> &&
+        std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+  ;
+
+  // Lookup a key in the map and if missing insert it and call the provided
+  // callback to in-place construct both the key and value. The lookup key is
+  // passed through to the callback so it needn't be captured and can be kept in
+  // a register argument throughout.
+  //
+  // Example:
+  // ```cpp
+  //   m.Insert("widget", [](MyStringViewType lookup_key, void* key_storage,
+  //                         void* value_storage) {
+  //     new (key_storage) MyStringType(lookup_key);
+  //     new (value_storage) MyValueType(....);
+  //   });
+  // ```
+  template <typename LookupKeyT, typename InsertCallbackT>
+  auto Insert(LookupKeyT lookup_key, InsertCallbackT insert_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(!std::same_as<ValueT, InsertCallbackT> &&
+             std::invocable<InsertCallbackT, LookupKeyT, void*, void*>);
+
+  // Replace a key's value in a map if already present or insert it if not
+  // already present. The new value is always used.
+  template <typename LookupKeyT>
+  auto Update(LookupKeyT lookup_key, ValueT new_v,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult;
+
+  // Lookup or insert a key into the map, and set its value to the result of
+  // the `value_cb` callback. The callback is always run and its result is
+  // always used, whether the key was already in the map or not. Any existing
+  // value is replaced with the result.
+  //
+  // Example: `m.Update(key, [] { return new_value; });`
+  template <typename LookupKeyT, typename ValueCallbackT>
+  auto Update(LookupKeyT lookup_key, ValueCallbackT value_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(
+        !std::same_as<ValueT, ValueCallbackT> &&
+        std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+  ;
+
+  // Lookup or insert a key into the map. If not already present and the key is
+  // inserted, the `insert_cb` is used to construct the new key and value in
+  // place. When inserting, the lookup key is passed through to the callback so
+  // it needn't be captured and can be kept in a register argument throughout.
+  // If the key was already present, the `update_cb` is called to update the
+  // existing key and value as desired.
+  //
+  // Example of counting occurrences:
+  // ```cpp
+  //   m.Update(item, /*insert_cb=*/[](MyStringViewType lookup_key,
+  //                                   void* key_storage, void* value_storage) {
+  //                    new (key_storage) MyItem(lookup_key);
+  //                    new (value_storage) Count(1);
+  //                  },
+  //                  /*update_cb=*/[](MyItem& /*key*/, Count& count) {
+  //                    ++count;
+  //                  });
+  // ```
+  template <typename LookupKeyT, typename InsertCallbackT,
+            typename UpdateCallbackT>
+  auto Update(LookupKeyT lookup_key, InsertCallbackT insert_cb,
+              UpdateCallbackT update_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(!std::same_as<ValueT, InsertCallbackT> &&
+             std::invocable<InsertCallbackT, LookupKeyT, void*, void*> &&
+             std::invocable<UpdateCallbackT, KeyT&, ValueT&>);
+
+  // Erase a key from the map.
+  template <typename LookupKeyT>
+  auto Erase(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT())
+      -> bool;
+
+  // Clear all key/value pairs from the map but leave the underlying hashtable
+  // allocated and in place.
+  void Clear();
+
+ protected:
+  using ImplT::ImplT;
+};
+
+// A data structure mapping from key to value.
+//
+// This map also supports small size optimization (or "SSO"). The provided
+// `SmallSize` template parameter indicates the size of an embedded buffer for
+// storing maps small enough to fit. The default is zero, which always allocates
+// a heap buffer on construction. When non-zero, it must be a multiple of
+// `MaxGroupSize`, which is currently 16. The library will check that the size is
+// valid and provide an error at compile time if not. We don't automatically
+// select the next multiple or otherwise fit the size to the constraints to make
+// it clear in the code how much memory is used by the SSO buffer.
+//
+// This data structure optimizes heavily for small key types that are cheap to
+// move and even copy. Using large keys or keys that are expensive to copy may
+// create surprising performance bottlenecks. A `std::string` key should be fine
+// with generally small strings, but if some or many strings are large heap
+// allocations the performance of hashtable routines may be unacceptably bad and
+// another data structure or key design is likely preferable.
+//
+// Note that this type should typically not appear on API boundaries; either
+// `MapBase` or `MapView` should be used instead.
+template <typename InputKeyT, typename InputValueT, ssize_t SmallSize = 0,
+          typename InputKeyContextT = DefaultKeyContext>
+class Map : public RawHashtable::TableImpl<
+                MapBase<InputKeyT, InputValueT, InputKeyContextT>, SmallSize> {
+  using BaseT = MapBase<InputKeyT, InputValueT, InputKeyContextT>;
+  using ImplT = RawHashtable::TableImpl<BaseT, SmallSize>;
+
+ public:
+  using KeyT = typename BaseT::KeyT;
+  using ValueT = typename BaseT::ValueT;
+
+  Map() = default;
+  Map(const Map& arg) = default;
+  Map(Map&& arg) noexcept = default;
+
+  // Reset the entire state of the hashtable to as it was when constructed,
+  // throwing away any intervening allocations.
+  void Reset();
+};
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapView<InputKeyT, InputValueT, InputKeyContextT>::Contains(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> bool {
+  return this->LookupEntry(lookup_key, key_context) != nullptr;
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapView<InputKeyT, InputValueT, InputKeyContextT>::Lookup(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> LookupKVResult {
+  return LookupKVResult(this->LookupEntry(lookup_key, key_context));
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapView<InputKeyT, InputValueT, InputKeyContextT>::operator[](
+    LookupKeyT lookup_key) const
+    -> ValueT* requires(std::default_initializable<KeyContextT>) {
+      auto result = Lookup(lookup_key, KeyContextT());
+      return result ? &result.value() : nullptr;
+    }
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename CallbackT>
+void MapView<InputKeyT, InputValueT, InputKeyContextT>::ForEach(
+    CallbackT callback)
+  requires(std::invocable<CallbackT, KeyT&, ValueT&>)
+{
+  this->ForEachEntry(
+      [callback](EntryT& entry) { callback(entry.key(), entry.value()); },
+      [](auto...) {});
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Insert(
+    LookupKeyT lookup_key, ValueT new_v, KeyContextT key_context)
+    -> InsertKVResult {
+  return Insert(
+      lookup_key,
+      [&new_v](LookupKeyT lookup_key, void* key_storage, void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(std::move(new_v));
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename ValueCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Insert(
+    LookupKeyT lookup_key, ValueCallbackT value_cb, KeyContextT key_context)
+    -> InsertKVResult
+  requires(
+      !std::same_as<ValueT, ValueCallbackT> &&
+      std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+{
+  return Insert(
+      lookup_key,
+      [&value_cb](LookupKeyT lookup_key, void* key_storage,
+                  void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(value_cb());
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename InsertCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Insert(
+    LookupKeyT lookup_key, InsertCallbackT insert_cb, KeyContextT key_context)
+    -> InsertKVResult
+  requires(!std::same_as<ValueT, InsertCallbackT> &&
+           std::invocable<InsertCallbackT, LookupKeyT, void*, void*>)
+{
+  auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  CARBON_DCHECK(entry) << "Should always result in a valid index.";
+
+  if (LLVM_LIKELY(!inserted)) {
+    return InsertKVResult(false, *entry);
+  }
+
+  insert_cb(lookup_key, static_cast<void*>(&entry->key_storage),
+            static_cast<void*>(&entry->value_storage));
+  return InsertKVResult(true, *entry);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Update(
+    LookupKeyT lookup_key, ValueT new_v, KeyContextT key_context)
+    -> InsertKVResult {
+  return Update(
+      lookup_key,
+      [&new_v](LookupKeyT lookup_key, void* key_storage, void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(std::move(new_v));
+      },
+      [&new_v](KeyT& /*key*/, ValueT& value) {
+        value.~ValueT();
+        new (&value) ValueT(std::move(new_v));
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename ValueCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Update(
+    LookupKeyT lookup_key, ValueCallbackT value_cb, KeyContextT key_context)
+    -> InsertKVResult
+  requires(
+      !std::same_as<ValueT, ValueCallbackT> &&
+      std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+{
+  return Update(
+      lookup_key,
+      [&value_cb](LookupKeyT lookup_key, void* key_storage,
+                  void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(value_cb());
+      },
+      [&value_cb](KeyT& /*key*/, ValueT& value) {
+        value.~ValueT();
+        new (&value) ValueT(value_cb());
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename InsertCallbackT,
+          typename UpdateCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Update(
+    LookupKeyT lookup_key, InsertCallbackT insert_cb, UpdateCallbackT update_cb,
+    KeyContextT key_context) -> InsertKVResult
+  requires(!std::same_as<ValueT, InsertCallbackT> &&
+           std::invocable<InsertCallbackT, LookupKeyT, void*, void*> &&
+           std::invocable<UpdateCallbackT, KeyT&, ValueT&>)
+{
+  auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  CARBON_DCHECK(entry) << "Should always result in a valid index.";
+
+  if (LLVM_LIKELY(!inserted)) {
+    update_cb(entry->key(), entry->value());
+    return InsertKVResult(false, *entry);
+  }
+
+  insert_cb(lookup_key, static_cast<void*>(&entry->key_storage),
+            static_cast<void*>(&entry->value_storage));
+  return InsertKVResult(true, *entry);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapBase<InputKeyT, InputValueT, InputKeyContextT>::Erase(
+    LookupKeyT lookup_key, KeyContextT key_context) -> bool {
+  return this->EraseImpl(lookup_key, key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+void MapBase<InputKeyT, InputValueT, InputKeyContextT>::Clear() {
+  this->ClearImpl();
+}
+
+template <typename InputKeyT, typename InputValueT, ssize_t SmallSize,
+          typename InputKeyContextT>
+void Map<InputKeyT, InputValueT, SmallSize, InputKeyContextT>::Reset() {
+  this->ResetImpl();
+}
+
+}  // namespace Carbon
+
+#endif  // CARBON_COMMON_MAP_H_
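
A minimal sketch (not part of the change; function names are hypothetical) of
the API-boundary pattern the comments above describe: read-only access is
passed as a `MapView` by value, mutation as a `MapBase&`, and both erase the
concrete `Map`'s `SmallSize`:

```cpp
#include <new>

#include "common/map.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

// Read-only access crosses the API boundary as a `MapView` passed by value.
auto CountMissing(Carbon::MapView<llvm::StringRef, int> view,
                  llvm::ArrayRef<llvm::StringRef> names) -> int {
  int missing = 0;
  for (llvm::StringRef name : names) {
    missing += static_cast<int>(!view.Contains(name));
  }
  return missing;
}

// Mutation crosses the API boundary as a `MapBase&`, which hides the SSO size
// of the concrete `Map` while keeping the mutable API.
auto RecordName(Carbon::MapBase<llvm::StringRef, int>& map,
                llvm::StringRef name) -> void {
  map.Update(
      name,
      /*insert_cb=*/
      [](llvm::StringRef lookup_key, void* key_storage, void* value_storage) {
        new (key_storage) llvm::StringRef(lookup_key);
        new (value_storage) int(1);
      },
      /*update_cb=*/[](llvm::StringRef& /*key*/, int& count) { ++count; });
}

auto Example() -> bool {
  // The concrete `Map` with its SSO buffer lives at the call site; 16 is a
  // multiple of the required group size.
  Carbon::Map<llvm::StringRef, int, 16> names;
  RecordName(names, "widget");
  RecordName(names, "widget");
  // `names` converts implicitly to the view type for read-only use.
  return CountMissing(names, {"widget", "gadget"}) == 1;
}
```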

+ 480 - 0
common/map_benchmark.cpp

@@ -0,0 +1,480 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <benchmark/benchmark.h>
+
+#include <type_traits>
+
+#include "absl/container/flat_hash_map.h"
+#include "common/map.h"
+#include "common/raw_hashtable_benchmark_helpers.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace Carbon {
+namespace {
+
+using RawHashtable::CarbonHashDI;
+using RawHashtable::GetKeysAndHitKeys;
+using RawHashtable::GetKeysAndMissKeys;
+using RawHashtable::HitArgs;
+using RawHashtable::SizeArgs;
+using RawHashtable::ValueToBool;
+
+// Helpers to synthesize some value of one of the three types we use as value
+// types.
+template <typename T>
+auto MakeValue() -> T {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return "abc";
+  } else if constexpr (std::is_pointer_v<T>) {
+    static std::remove_pointer_t<T> x;
+    return &x;
+  } else {
+    return 42;
+  }
+}
+template <typename T>
+auto MakeValue2() -> T {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return "qux";
+  } else if constexpr (std::is_pointer_v<T>) {
+    static std::remove_pointer_t<T> y;
+    return &y;
+  } else {
+    return 7;
+  }
+}
+
+template <typename MapT>
+struct IsCarbonMapImpl : std::false_type {};
+template <typename KT, typename VT, int MinSmallSize>
+struct IsCarbonMapImpl<Map<KT, VT, MinSmallSize>> : std::true_type {};
+
+template <typename MapT>
+static constexpr bool IsCarbonMap = IsCarbonMapImpl<MapT>::value;
+
+// A wrapper around various map types that we specialize to implement a common
+// API used in the benchmarks for various different map data structures that
+// support different APIs. The primary template assumes a roughly
+// `std::unordered_map` API design, and types with a different API design are
+// supported through specializations.
+template <typename MapT>
+struct MapWrapperImpl {
+  using KeyT = typename MapT::key_type;
+  using ValueT = typename MapT::mapped_type;
+
+  MapT m;
+
+  auto BenchContains(KeyT k) -> bool { return m.find(k) != m.end(); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto it = m.find(k);
+    if (it == m.end()) {
+      return false;
+    }
+    return ValueToBool(it->second);
+  }
+
+  auto BenchInsert(KeyT k, ValueT v) -> bool {
+    auto result = m.insert({k, v});
+    return result.second;
+  }
+
+  auto BenchUpdate(KeyT k, ValueT v) -> bool {
+    auto result = m.insert({k, v});
+    result.first->second = v;
+    return result.second;
+  }
+
+  auto BenchErase(KeyT k) -> bool { return m.erase(k) != 0; }
+};
+
+// Explicit (partial) specialization for the Carbon map type that uses its
+// different API design.
+template <typename KT, typename VT, int MinSmallSize>
+struct MapWrapperImpl<Map<KT, VT, MinSmallSize>> {
+  using MapT = Map<KT, VT, MinSmallSize>;
+  using KeyT = KT;
+  using ValueT = VT;
+
+  MapT m;
+
+  auto BenchContains(KeyT k) -> bool { return m.Contains(k); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto result = m.Lookup(k);
+    if (!result) {
+      return false;
+    }
+    return ValueToBool(result.value());
+  }
+
+  auto BenchInsert(KeyT k, ValueT v) -> bool {
+    auto result = m.Insert(k, v);
+    return result.is_inserted();
+  }
+
+  auto BenchUpdate(KeyT k, ValueT v) -> bool {
+    auto result = m.Update(k, v);
+    return result.is_inserted();
+  }
+
+  auto BenchErase(KeyT k) -> bool { return m.Erase(k); }
+};
+
+// Provide a way to override the Carbon Map specific benchmark runs with another
+// hashtable implementation. When building, you can use one of these enum names
+// in a macro define such as `-DCARBON_MAP_BENCH_OVERRIDE=Name` in order to
+// trigger a specific override for the `Map` type benchmarks. This is used to
+// get before/after runs that compare the performance of Carbon's Map versus
+// other implementations.
+enum class MapOverride : uint8_t {
+  None,
+  Abseil,
+  LLVM,
+  LLVMAndCarbonHash,
+};
+#ifndef CARBON_MAP_BENCH_OVERRIDE
+#define CARBON_MAP_BENCH_OVERRIDE None
+#endif
+
+template <typename MapT, MapOverride Override>
+struct MapWrapperOverride : MapWrapperImpl<MapT> {};
+
+template <typename KeyT, typename ValueT, int MinSmallSize>
+struct MapWrapperOverride<Map<KeyT, ValueT, MinSmallSize>, MapOverride::Abseil>
+    : MapWrapperImpl<absl::flat_hash_map<KeyT, ValueT>> {};
+
+template <typename KeyT, typename ValueT, int MinSmallSize>
+struct MapWrapperOverride<Map<KeyT, ValueT, MinSmallSize>, MapOverride::LLVM>
+    : MapWrapperImpl<llvm::DenseMap<KeyT, ValueT>> {};
+
+template <typename KeyT, typename ValueT, int MinSmallSize>
+struct MapWrapperOverride<Map<KeyT, ValueT, MinSmallSize>,
+                          MapOverride::LLVMAndCarbonHash>
+    : MapWrapperImpl<llvm::DenseMap<KeyT, ValueT, CarbonHashDI<KeyT>>> {};
+
+template <typename MapT>
+using MapWrapper =
+    MapWrapperOverride<MapT, MapOverride::CARBON_MAP_BENCH_OVERRIDE>;
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT, VT)        \
+  BENCHMARK(NAME<Map<KT, VT>>)->Apply(APPLY);                 \
+  BENCHMARK(NAME<absl::flat_hash_map<KT, VT>>)->Apply(APPLY); \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT>>)->Apply(APPLY);      \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT, CarbonHashDI<KT>>>)->Apply(APPLY)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_ONE_OP(NAME, APPLY)                       \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int, int);             \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int*, int*);           \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int, llvm::StringRef); \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, llvm::StringRef, int)
+
+// Benchmark the minimal latency of checking if a key is contained within a map,
+// when it *is* definitely in that map. Because this is only really measuring
+// the *minimal* latency, it is more similar to a throughput benchmark.
+//
+// While this is structured to observe the latency of testing for presence of a
+// key, it is important to understand the reality of what this measures. Because
+// the boolean result testing for whether a key is in a map is fundamentally
+// provided not by accessing some data, but by branching on data to a control
+// flow path which sets the boolean to `true` or `false`, the result can be
+// speculatively provided based on predicting the conditional branch without
+// waiting for the results of the comparison to become available. And because
+// this is a small operation and we arrange for all the candidate keys to be
+// present, that branch *should* be predicted extremely well. The result is that
+// this measures the un-speculated latency of testing for presence which should
+// be small or zero. Which is why this is ultimately more similar to a
+// throughput benchmark.
+//
+// Because of these measurement oddities, the specific measurements here may not
+// be very interesting for predicting real-world performance in any way, but
+// they are useful for comparing how 'cheap' the operation is across changes to
+// the data structure or between similar data structures with similar
+// properties.
+template <typename MapT>
+static void BM_MapContainsHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      // We block optimizing `i` as that has proven both more effective at
+      // blocking the loop from being optimized away and avoiding disruption of
+      // the generated code that we're benchmarking.
+      benchmark::DoNotOptimize(i);
+
+      bool result = m.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      // We use the lookup success to step through keys, establishing a
+      // dependency between each lookup. This doesn't fully allow us to measure
+      // latency rather than throughput, as noted above.
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapContainsHit, HitArgs);
+
+// Similar to `BM_MapContainsHit`, while this is structured as a latency
+// benchmark, the critical path is expected to be well predicted and so it
+// should turn into something closer to a throughput benchmark.
+template <typename MapT>
+static void BM_MapContainsMiss(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] = GetKeysAndMissKeys<KT>(state.range(0));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = m.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(!result);
+      i += static_cast<ssize_t>(!result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapContainsMiss, SizeArgs);
+
+// This is a genuine latency benchmark. We lookup a key in the hashtable and use
+// the value associated with that key in the critical path of loading the next
+// iteration's key. We still ensure the keys are always present, and so we
+// generally expect the data structure branches to be well predicted. But we
+// vary the keys aggressively to avoid any prediction artifacts from repeatedly
+// examining the same key.
+//
+// This latency can be very helpful for understanding a range of data structure
+// behaviors:
+// - Many users of hashtables are directly dependent on the latency of this
+//   operation, and this micro-benchmark will reflect the expected latency for
+//   them.
+// - Showing how latency varies across different sizes of table and different
+//   fractions of the table being accessed (and thus needing space in the
+//   cache).
+//
+// However, it remains an ultimately synthetic and unrepresentative benchmark.
+// It should primarily be used to understand the relative cost of these
+// operations between versions of the data structure or between related data
+// structures.
+//
+// We vary both the number of entries in the table and the number of distinct
+// keys used when doing lookups. As the table becomes large, the latter dictates
+// the fraction of the table that will be accessed and thus the working set size
+// of the benchmark. Querying the same small number of keys in even a large
+// table doesn't actually encounter any cache pressure, so only a few of these
+// benchmarks will show any effects of the caching subsystem.
+template <typename MapT>
+static void BM_MapLookupHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = m.BenchLookup(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapLookupHit, HitArgs);
+
+// This is an update throughput benchmark in practice. While whether the key was
+// a hit is kept in the critical path, we only use keys that are hits and so
+// expect that to be fully predicted and speculated.
+//
+// However, we expect this fairly closely matches how user code interacts with
+// an update-style API. It will have some conditional testing (even if just an
+// assert) on whether the key was a hit and otherwise continue executing. As a
+// consequence the actual update is expected to not be in a meaningful critical
+// path.
+//
+// This still provides a basic way to measure the cost of this operation,
+// especially when comparing between implementations or across different hash
+// tables.
+template <typename MapT>
+static void BM_MapUpdateHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size; ++i) {
+      benchmark::DoNotOptimize(i);
+
+      bool inserted = m.BenchUpdate(lookup_keys[i], MakeValue2<VT>());
+      CARBON_DCHECK(!inserted);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapUpdateHit, HitArgs);
+
+// First erase and then insert the key. The code path will always be the same
+// here and so we expect this to largely be a throughput benchmark because of
+// branch prediction and speculative execution.
+//
+// We don't expect erase followed by insertion to be a common user code
+// sequence, but we don't have a good way of benchmarking either erase or insert
+// in isolation -- each would change the size of the table and thus the next
+// iteration's benchmark. And if we try to correct the table size outside of the
+// timed region, we end up trying to exclude too fine grained of a region from
+// timers to get good measurement data.
+//
+// Our solution is to benchmark both erase and insertion back to back. We can
+// then get a good profile of the code sequence of each, and at least measure
+// the sum cost of these reliably. Careful profiling can help attribute that
+// cost between erase and insert in order to understand which of the two
+// operations is contributing most to any performance artifacts observed.
+template <typename MapT>
+static void BM_MapEraseUpdateHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size; ++i) {
+      benchmark::DoNotOptimize(i);
+
+      m.BenchErase(lookup_keys[i]);
+      benchmark::ClobberMemory();
+
+      bool inserted = m.BenchUpdate(lookup_keys[i], MakeValue2<VT>());
+      CARBON_DCHECK(inserted);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapEraseUpdateHit, HitArgs);
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_OP_SEQ_SIZE(NAME, KT, VT)                  \
+  BENCHMARK(NAME<Map<KT, VT>>)->Apply(SizeArgs);                 \
+  BENCHMARK(NAME<absl::flat_hash_map<KT, VT>>)->Apply(SizeArgs); \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT>>)->Apply(SizeArgs);      \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT, CarbonHashDI<KT>>>)->Apply(SizeArgs)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_OP_SEQ(NAME)                       \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int, int);             \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int*, int*);           \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int, llvm::StringRef); \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, llvm::StringRef, int)
+
+// This is an interesting, somewhat specialized benchmark that measures the cost
+// of inserting a sequence of key/value pairs into a table with no collisions up
+// to some size and then inserting a colliding key and throwing away the table.
+//
+// This can give an idea of the cost of building up a map of a particular size,
+// but without actually using it, or of algorithms like cycle detection that
+// happen to need an associative container.
+//
+// It also covers both the insert-into-an-empty-slot code path that isn't
+// covered elsewhere, and the code path for growing a table to a larger size.
+//
+// Because this benchmark operates on whole maps, we also compute the number of
+// probed keys for Carbon's map as that is both a general reflection of the
+// efficacy of the underlying hash function, and a direct factor that drives the
+// cost of these operations.
+template <typename MapT>
+static void BM_MapInsertSeq(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  constexpr ssize_t LookupKeysSize = 1 << 8;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), LookupKeysSize);
+
+  // Note that we don't force batches that use all the lookup keys because
+  // there's no difference in cache usage by covering all the different lookup
+  // keys.
+  ssize_t i = 0;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(i);
+
+    MapWrapperT m;
+    for (auto k : keys) {
+      bool inserted = m.BenchInsert(k, MakeValue<VT>());
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // Now insert a final random repeated key.
+    bool inserted = m.BenchInsert(lookup_keys[i], MakeValue2<VT>());
+    CARBON_DCHECK(!inserted) << "Must already be in the map!";
+
+    // Rotate through the shuffled keys.
+    i = (i + static_cast<ssize_t>(!inserted)) & (LookupKeysSize - 1);
+  }
+
+  // It can be easier in some cases to think of this as a key-throughput rate of
+  // insertion rather than the latency of inserting N keys, so construct the
+  // rate counter as well.
+  state.counters["KeyRate"] = benchmark::Counter(
+      keys.size(), benchmark::Counter::kIsIterationInvariantRate);
+
+  // Report some extra statistics about the Carbon type.
+  if constexpr (IsCarbonMap<MapT>) {
+    // Re-build a map outside of the timing loop to look at the statistics
+    // rather than the timing.
+    MapT m;
+    for (auto k : keys) {
+      bool inserted = m.Insert(k, MakeValue<VT>()).is_inserted();
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // While this count is "iteration invariant" (it should be exactly the same
+    // for every iteration as the set of keys is the same), we don't use that
+    // because it will scale this by the number of iterations. We want to
+    // display the probe count of this benchmark *parameter*, not the probe
+    // count that resulted from the number of iterations. That means we use the
+    // normal counter API without flags.
+    state.counters["Probed"] = m.CountProbedKeys();
+
+    // Uncomment this call to print out statistics about the index-collisions
+    // among these keys for debugging:
+    //
+    // RawHashtable::DumpHashStatistics(keys);
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapInsertSeq, SizeArgs);
+
+}  // namespace
+}  // namespace Carbon

+ 12 - 0
common/map_benchmark_test.sh

@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/map_benchmark"
+
+exec "$BENCHMARK" \
+  --benchmark_counters_tabular=true \
+  --benchmark_min_time=1x \
+  --benchmark_filter='^[^/]*/[1-9][0-9]{0,3}(/[0-9]+)?$'

+ 627 - 0
common/map_test.cpp

@@ -0,0 +1,627 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/map.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <initializer_list>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "common/raw_hashtable_test_helpers.h"
+
+namespace Carbon::Testing {
+namespace {
+
+using RawHashtable::FixedHashKeyContext;
+using RawHashtable::IndexKeyContext;
+using RawHashtable::TestData;
+using RawHashtable::TestKeyContext;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAreArray;
+
+template <typename MapT, typename MatcherRangeT>
+void ExpectMapElementsAre(MapT&& m, MatcherRangeT element_matchers) {
+  // Now collect the elements into a container.
+  using KeyT = typename std::remove_reference<MapT>::type::KeyT;
+  using ValueT = typename std::remove_reference<MapT>::type::ValueT;
+  std::vector<std::pair<KeyT, ValueT>> map_entries;
+  m.ForEach([&map_entries](KeyT& k, ValueT& v) {
+    map_entries.push_back({k, v});
+  });
+
+  // Use the GoogleMock unordered container matcher to validate and show errors
+  // on wrong elements.
+  EXPECT_THAT(map_entries, UnorderedElementsAreArray(element_matchers));
+}
+
+// Allow directly using an initializer list.
+template <typename MapT, typename MatcherT>
+void ExpectMapElementsAre(MapT&& m,
+                          std::initializer_list<MatcherT> element_matchers) {
+  std::vector<MatcherT> element_matchers_storage = element_matchers;
+  ExpectMapElementsAre(m, element_matchers_storage);
+}
+
+template <typename ValueCB, typename RangeT, typename... RangeTs>
+auto MakeKeyValues(ValueCB value_cb, RangeT&& range, RangeTs&&... ranges) {
+  using KeyT = typename RangeT::value_type;
+  using ValueT = decltype(value_cb(std::declval<KeyT>()));
+  std::vector<std::pair<KeyT, ValueT>> elements;
+  auto add_range = [&](RangeT&& r) {
+    for (const auto&& e : r) {
+      elements.push_back({e, value_cb(e)});
+    }
+  };
+  add_range(std::forward<RangeT>(range));
+  (add_range(std::forward<RangeT>(ranges)), ...);
+
+  return elements;
+}
+
+template <typename MapT>
+class MapTest : public ::testing::Test {};
+
+using Types = ::testing::Types<
+    Map<int, int>, Map<int, int, 16>, Map<int, int, 64>,
+    Map<int, int, 0, TestKeyContext>, Map<int, int, 16, TestKeyContext>,
+    Map<int, int, 64, TestKeyContext>, Map<TestData, TestData>,
+    Map<TestData, TestData, 16>, Map<TestData, TestData, 0, TestKeyContext>,
+    Map<TestData, TestData, 16, TestKeyContext>>;
+TYPED_TEST_SUITE(MapTest, Types);
+
+TYPED_TEST(MapTest, Basic) {
+  TypeParam m;
+
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+  EXPECT_TRUE(m.Insert(1, 100).is_inserted());
+  ASSERT_TRUE(m.Contains(1));
+  auto result = m.Lookup(1);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  EXPECT_EQ(100, result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Reinsertion doesn't change the value.
+  auto i_result = m.Insert(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(100, i_result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Update does change the value.
+  i_result = m.Update(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+  EXPECT_EQ(101, *m[1]);
+
+  // Verify all the elements.
+  ExpectMapElementsAre(m, {Pair(1, 101)});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+
+    // Immediately do a basic check of all elements to pin down when an
+    // insertion corrupts the rest of the table.
+    ExpectMapElementsAre(
+        m,
+        MakeKeyValues([](int k) { return k * 100 + static_cast<int>(k == 1); },
+                      llvm::seq_inclusive(1, i)));
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(513));
+
+  // Verify all the elements.
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 512)));
+}
+
+TYPED_TEST(MapTest, FactoryAPI) {
+  TypeParam m;
+  EXPECT_TRUE(m.Insert(1, [] { return 100; }).is_inserted());
+  ASSERT_TRUE(m.Contains(1));
+  EXPECT_EQ(100, *m[1]);
+  // Reinsertion doesn't invoke the callback.
+  EXPECT_FALSE(m.Insert(1, []() -> int {
+                  llvm_unreachable("Should never be called!");
+                }).is_inserted());
+  // Update does invoke the callback.
+  auto i_result = m.Update(1, [] { return 101; });
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+  EXPECT_EQ(101, *m[1]);
+}
+
+TYPED_TEST(MapTest, Copy) {
+  using MapT = TypeParam;
+
+  MapT m;
+  // Make sure we exceed the small size for some of the map types, but not all
+  // of them, so we cover all the combinations of copying between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+
+  MapT other_m1 = m;
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24)));
+
+  // Add some more elements to the original.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+
+  // The first copy doesn't change.
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24)));
+
+  // A new copy does.
+  MapT other_m2 = m;
+  ExpectMapElementsAre(
+      other_m2, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 32)));
+}
+
+TYPED_TEST(MapTest, Move) {
+  using MapT = TypeParam;
+
+  MapT m;
+  // Make sure we exceed the small size for some of the map types, but not all
+  // of them, so we cover all the combinations of moving between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+
+  MapT other_m1 = std::move(m);
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24)));
+
+  // Add some more elements.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(other_m1.Insert(i, i * 100).is_inserted());
+  }
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 32)));
+}
+
+TYPED_TEST(MapTest, Conversions) {
+  using MapT = TypeParam;
+  using KeyT = MapT::KeyT;
+  using ValueT = MapT::ValueT;
+  using KeyContextT = MapT::KeyContextT;
+
+  MapT m;
+
+  ASSERT_TRUE(m.Insert(1, 101).is_inserted());
+  ASSERT_TRUE(m.Insert(2, 102).is_inserted());
+  ASSERT_TRUE(m.Insert(3, 103).is_inserted());
+  ASSERT_TRUE(m.Insert(4, 104).is_inserted());
+
+  MapView<KeyT, ValueT, KeyContextT> mv = m;
+  MapView<const KeyT, ValueT, KeyContextT> cmv = m;
+  MapView<KeyT, const ValueT, KeyContextT> cmv2 = m;
+  MapView<const KeyT, const ValueT, KeyContextT> cmv3 = m;
+  EXPECT_TRUE(mv.Contains(1));
+  EXPECT_EQ(101, *mv[1]);
+  EXPECT_TRUE(cmv.Contains(2));
+  EXPECT_EQ(102, *cmv[2]);
+  EXPECT_TRUE(cmv2.Contains(3));
+  EXPECT_EQ(103, *cmv2[3]);
+  EXPECT_TRUE(cmv3.Contains(4));
+  EXPECT_EQ(104, *cmv3[4]);
+}
+
+// This test is largely exercising the underlying `RawHashtable` implementation
+// with complex growth, erasure, and re-growth.
+TYPED_TEST(MapTest, ComplexOpSequence) {
+  // Use a small size as well to cover more growth scenarios.
+  TypeParam m;
+
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+  EXPECT_TRUE(m.Insert(1, 100).is_inserted());
+  ASSERT_TRUE(m.Contains(1));
+  auto result = m.Lookup(1);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  EXPECT_EQ(100, result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Reinsertion doesn't change the value.
+  auto i_result = m.Insert(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(100, i_result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Update does change the value.
+  i_result = m.Update(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+  EXPECT_EQ(101, *m[1]);
+
+  // Verify all the elements.
+  ExpectMapElementsAre(m, {Pair(1, 101)});
+
+  // Fill up the small buffer but don't overflow it.
+  for (int i : llvm::seq(2, 5)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(1, 5)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1), *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(5));
+
+  // Verify all the elements.
+  ExpectMapElementsAre(
+      m, {Pair(1, 101), Pair(2, 201), Pair(3, 301), Pair(4, 401)});
+
+  // Erase some entries from the small buffer.
+  EXPECT_FALSE(m.Erase(42));
+  EXPECT_TRUE(m.Erase(2));
+  EXPECT_EQ(101, *m[1]);
+  EXPECT_EQ(nullptr, m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(401, *m[4]);
+  EXPECT_TRUE(m.Erase(1));
+  EXPECT_EQ(nullptr, m[1]);
+  EXPECT_EQ(nullptr, m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(401, *m[4]);
+  EXPECT_TRUE(m.Erase(4));
+  EXPECT_EQ(nullptr, m[1]);
+  EXPECT_EQ(nullptr, m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(nullptr, m[4]);
+  // Fill them back in, but with a different order and going back to the
+  // original value.
+  EXPECT_TRUE(m.Insert(1, 100).is_inserted());
+  EXPECT_TRUE(m.Insert(2, 200).is_inserted());
+  EXPECT_TRUE(m.Insert(4, 400).is_inserted());
+  EXPECT_EQ(100, *m[1]);
+  EXPECT_EQ(200, *m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(400, *m[4]);
+  // Then update their values to match.
+  EXPECT_FALSE(m.Update(1, 101).is_inserted());
+  EXPECT_FALSE(m.Update(2, 201).is_inserted());
+  EXPECT_FALSE(m.Update(4, 401).is_inserted());
+
+  // Now fill up the first metadata group.
+  for (int i : llvm::seq(5, 14)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(1, 14)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + static_cast<int>(i < 5), *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i < 5), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 2, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(42));
+
+  // Verify all the elements by walking the entire map.
+  ExpectMapElementsAre(
+      m, {Pair(1, 102), Pair(2, 202), Pair(3, 302), Pair(4, 402), Pair(5, 502),
+          Pair(6, 602), Pair(7, 702), Pair(8, 802), Pair(9, 902),
+          Pair(10, 1002), Pair(11, 1102), Pair(12, 1202), Pair(13, 1302)});
+
+  // Now fill up several more groups.
+  for (int i : llvm::seq(14, 100)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(1, 100)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + 2 * static_cast<int>(i < 14), *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 2 * static_cast<int>(i < 14), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 3).is_inserted());
+    EXPECT_EQ(i * 100 + 3, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(420));
+
+  // Check walking the entire container.
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 3; }, llvm::seq(1, 100)));
+
+  // Clear back to empty.
+  m.Clear();
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+
+  // Refill but with both overlapping and different values.
+  for (int i : llvm::seq(50, 150)) {
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(50, 150)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_FALSE(m.Contains(420));
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(50, 150)));
+
+  EXPECT_FALSE(m.Erase(42));
+  EXPECT_TRUE(m.Contains(73));
+  EXPECT_TRUE(m.Erase(73));
+  EXPECT_FALSE(m.Contains(73));
+  for (int i : llvm::seq(102, 136)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Contains(i));
+    EXPECT_TRUE(m.Erase(i));
+    EXPECT_FALSE(m.Contains(i));
+  }
+  for (int i : llvm::seq(50, 150)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    if (i == 73 || (i >= 102 && i < 136)) {
+      continue;
+    }
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 2, *m[i]);
+  }
+  EXPECT_TRUE(m.Insert(73, 73 * 100 + 3).is_inserted());
+  EXPECT_EQ(73 * 100 + 3, *m[73]);
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 2 + (k == 73); },
+                       llvm::seq(50, 102), llvm::seq(136, 150)));
+
+  // Reset back to empty and small.
+  m.Reset();
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+
+  // Refill but with both overlapping and different values, now triggering
+  // growth too. Also, use update instead of insert.
+  for (int i : llvm::seq(75, 175)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Update(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(75, 175)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_FALSE(m.Contains(420));
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(75, 175)));
+
+  EXPECT_FALSE(m.Erase(42));
+  EXPECT_TRUE(m.Contains(93));
+  EXPECT_TRUE(m.Erase(93));
+  EXPECT_FALSE(m.Contains(93));
+  for (int i : llvm::seq(102, 136)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Contains(i));
+    EXPECT_TRUE(m.Erase(i));
+    EXPECT_FALSE(m.Contains(i));
+  }
+  for (int i : llvm::seq(75, 175)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    if (i == 93 || (i >= 102 && i < 136)) {
+      continue;
+    }
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 2, *m[i]);
+  }
+  EXPECT_TRUE(m.Insert(93, 93 * 100 + 3).is_inserted());
+  EXPECT_EQ(93 * 100 + 3, *m[93]);
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 2 + (k == 93); },
+                       llvm::seq(75, 102), llvm::seq(136, 175)));
+}
+
+template <typename MapT>
+class MapCollisionTest : public ::testing::Test {};
+
+using CollisionTypes = ::testing::Types<
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ true, /*FixTagBits*/ false, 0>>,
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ false, /*FixTagBits*/ true, 0>>,
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ true, /*FixTagBits*/ true, 0>>,
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ true, /*FixTagBits*/ true,
+                            ~static_cast<uint64_t>(0)>>>;
+TYPED_TEST_SUITE(MapCollisionTest, CollisionTypes);
+
+TYPED_TEST(MapCollisionTest, Basic) {
+  TypeParam m;
+
+  // Fill the map through a couple of growth steps, verifying at each step. Note
+  // that because this is a collision test, we synthesize actively harmful
+  // hashes in terms of collisions and so this test is essentially quadratic. We
+  // need to keep it relatively small.
+  for (int i : llvm::seq(1, 256)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+
+    // Immediately do a basic check of all elements to pin down when an
+    // insertion corrupts the rest of the table.
+    ExpectMapElementsAre(m, MakeKeyValues([](int k) { return k * 100; },
+                                          llvm::seq_inclusive(1, i)));
+  }
+  EXPECT_FALSE(m.Contains(257));
+
+  // Erase and re-fill from the back.
+  for (int i : llvm::seq(192, 256)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Erase(i));
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 192)));
+  for (int i : llvm::seq(192, 256)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted());
+  }
+  ExpectMapElementsAre(m,
+                       MakeKeyValues([](int k) { return k * 100 + (k >= 192); },
+                                     llvm::seq(1, 256)));
+
+  // Erase and re-fill from the front.
+  for (int i : llvm::seq(1, 64)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Erase(i));
+  }
+  ExpectMapElementsAre(m,
+                       MakeKeyValues([](int k) { return k * 100 + (k >= 192); },
+                                     llvm::seq(64, 256)));
+  for (int i : llvm::seq(1, 64)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted());
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + (k < 64) + (k >= 192); },
+                       llvm::seq(1, 256)));
+
+  // Erase and re-fill from the middle.
+  for (int i : llvm::seq(64, 192)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Erase(i));
+  }
+  ExpectMapElementsAre(m, MakeKeyValues([](int k) { return k * 100 + 1; },
+                                        llvm::seq(1, 64), llvm::seq(192, 256)));
+  for (int i : llvm::seq(64, 192)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted());
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 256)));
+
+  // Erase and re-fill from both the back and front.
+  for (auto s : {llvm::seq(192, 256), llvm::seq(1, 64)}) {
+    for (int i : s) {
+      SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+      EXPECT_TRUE(m.Erase(i));
+    }
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(64, 192)));
+  for (auto s : {llvm::seq(192, 256), llvm::seq(1, 64)}) {
+    for (int i : s) {
+      SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+      EXPECT_TRUE(m.Insert(i, i * 100 + 2).is_inserted());
+    }
+  }
+  ExpectMapElementsAre(
+      m,
+      MakeKeyValues([](int k) { return k * 100 + 1 + (k < 64) + (k >= 192); },
+                    llvm::seq(1, 256)));
+
+  // And update the middle elements in place.
+  for (int i : llvm::seq(64, 192)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 2; }, llvm::seq(1, 256)));
+}
+
+TEST(MapContextTest, Basic) {
+  llvm::SmallVector<TestData> keys;
+  for (int i : llvm::seq(0, 513)) {
+    keys.push_back(i * 100000);
+  }
+  IndexKeyContext<TestData> key_context(keys);
+  Map<ssize_t, int, 0, IndexKeyContext<TestData>> m;
+
+  EXPECT_FALSE(m.Contains(42, key_context));
+  EXPECT_TRUE(m.Insert(1, 100, key_context).is_inserted());
+  ASSERT_TRUE(m.Contains(1, key_context));
+  auto result = m.Lookup(TestData(100000), key_context);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  EXPECT_EQ(100, result.value());
+  // Reinsertion doesn't change the value. Also, double check a temporary
+  // context.
+  auto i_result = m.Insert(1, 101, IndexKeyContext<TestData>(keys));
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(100, i_result.value());
+  // Update does change the value.
+  i_result = m.Update(1, 101, key_context);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+
+  // Verify all the elements.
+  ExpectMapElementsAre(m, {Pair(1, 101)});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100, key_context).is_inserted());
+
+    // Immediately do a basic check of all elements to pin down when an
+    // insertion corrupts the rest of the table.
+    for (int j : llvm::seq(1, i)) {
+      SCOPED_TRACE(llvm::formatv("Assert key: {0}", j).str());
+      ASSERT_EQ(j * 100 + static_cast<int>(j == 1),
+                m.Lookup(j, key_context).value());
+      ASSERT_EQ(j * 100 + static_cast<int>(j == 1),
+                m.Lookup(TestData(j * 100000), key_context).value());
+    }
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1, key_context).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1),
+              m.Lookup(i, key_context).value());
+    EXPECT_FALSE(m.Update(i, i * 100 + 1, key_context).is_inserted());
+    EXPECT_EQ(i * 100 + 1, m.Lookup(i, key_context).value());
+  }
+  EXPECT_FALSE(m.Contains(0, key_context));
+  EXPECT_FALSE(m.Contains(512, key_context));
+
+  // Verify all the elements.
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 512)));
+}
+
+}  // namespace
+}  // namespace Carbon::Testing

+ 1 - 0
common/ostream.h

@@ -54,6 +54,7 @@ class Printable {
 
 // Returns the result of printing the value.
 template <typename T>
+  requires std::derived_from<T, Printable<T>>
 inline auto PrintToString(const T& val) -> std::string {
   std::string str;
   llvm::raw_string_ostream stream(str);

+ 11 - 0
common/raw_hashtable.cpp

@@ -0,0 +1,11 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/raw_hashtable.h"
+
+namespace Carbon::RawHashtable {
+
+volatile std::byte global_addr_seed{1};
+
+}  // namespace Carbon::RawHashtable

+ 1336 - 0
common/raw_hashtable.h

@@ -0,0 +1,1336 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_H_
+#define CARBON_COMMON_RAW_HASHTABLE_H_
+
+#include <algorithm>
+#include <concepts>
+#include <cstddef>
+#include <cstring>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include "common/check.h"
+#include "common/hashing.h"
+#include "common/hashtable_key_context.h"
+#include "common/raw_hashtable_metadata_group.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+
+// A namespace collecting a set of low-level utilities for building hashtable
+// data structures. These should only be used as implementation details of
+// higher-level data-structure APIs.
+//
+// The utilities here use the `hashtable_key_context.h` provided `KeyContext` to
+// support the necessary hashtable operations on keys: hashing and comparison.
+// This also serves as the customization point for hashtables built on this
+// infrastructure for those operations. See that header file for details.
+//
+// These utilities support hashtables following a *specific* API design pattern,
+// and using Small-Size Optimization, or "SSO", when desired. We expect there to
+// be three layers to any hashtable design:
+//
+// - A *view* type: a read-only view of the hashtable contents. This type should
+//   be a value type and is expected to be passed by-value in APIs. However, it
+//   will have `const`-reference semantics, much like a `std::string_view`. Note
+//   that the *entries* will continue to be mutable; it is only the *table*
+//   structure that is read-only.
+//
+// - A *base* type: a base class type of the actual hashtable, which allows
+//   almost all mutable operations but erases any specific SSO buffer size.
+//   Because this is a base of the actual hash table, it is designed to be
+//   passed as a non-`const` reference or pointer.
+//
+// - A *table* type: the actual hashtable which derives from the base type and
+//   adds any desired SSO storage buffer. Beyond the physical storage, it also
+//   allows resetting the table to its initial state and allocated size, as well
+//   as copying and moving the table.
+//
+// For complete examples of the API design, see `set.h` for a hashtable-based
+// set data structure, and `map.h` for a hashtable-based map data structure.
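+//
+// As a minimal sketch of how the three layers are intended to be used (the
+// names here follow `map.h`, and the SSO size of 16 is illustrative):
+//
+//   Map<int, int, 16> table;          // Owns a 16-entry SSO buffer.
+//   MapBase<int, int>& base = table;  // Mutable, SSO buffer size erased.
+//   MapView<int, int> view = table;   // Read-only, passed by value.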
+//
+// The hashtable design implemented here has several key invariants and design
+// elements that are essential to all three of the types above and the
+// functionality they provide.
+//
+// - The underlying hashtable uses [open addressing], a power-of-two table size,
+//   and quadratic probing rather than closed addressing and chaining.
+//
+//   [open addressing]: https://en.wikipedia.org/wiki/Open_addressing
+//
+// - Each _slot_ in the table corresponds to a key, a value, and one byte of
+//   metadata. Each _entry_ is a key and value. The key and value for an entry
+//   are stored together.
+//
+// - The allocated storage is organized into an array of metadata bytes followed
+//   by an array of entry storage.
+//
+// - The metadata byte corresponding to each entry marks whether that entry is
+//   empty, deleted, or present. When present, the byte's remaining 7 bits
+//   store a tag drawn from another 7 bits of the entry key's hash (see the
+//   sketch at the end of this overview).
+//
+// - The storage for an entry is an internal type that should not be exposed to
+//   users; only the underlying keys and values should be exposed.
+//
+// - The hash addressing and probing occurs over *groups* of slots rather than
+//   individual entries. When inserting a new entry, it can be added to the
+//   group it hashes to as long as it is not full, and can even replace a slot with
+//   a tombstone indicating a previously deleted entry. Only when the group is
+//   full will it look at the next group in the probe sequence. As a result,
+//   there may be entries in a group where a different group is the start of
+//   that entry's probe sequence. Also, when performing a lookup, every group in
+//   the probe sequence must be inspected for the lookup key until it is found
+//   or the group has an empty slot.
+//
+// - Groups are scanned rapidly using the one-byte metadata for each entry in
+//   the group and CPU instructions that allow comparing all of the metadata for
+//   a group in parallel. For more details on the metadata group encoding and
+//   scanning, see `raw_hashtable_metadata_group.h`.
+//
+// - `GroupSize` is a platform-specific relatively small power of two that fits
+//   in some hardware register. However, `MaxGroupSize` is provided as a
+//   portable max that is also a power of two. The table storage, whether
+//   provided by an SSO buffer or allocated, is required to be a multiple of
+//   `MaxGroupSize` to keep the requirement portable but sufficient for all
+//   platforms.
+//
+// - There is *always* an allocated table of some multiple of `MaxGroupSize`.
+//   This allows accesses to be branchless. When heap allocated, we proactively
+//   allocate a table of at least a minimum size. When there is a small-size
+//   optimization (SSO) buffer, that provides the initial allocation.
+//
+// - The table performs a minimal amount of bookkeeping that limits the APIs it
+//   can support:
+//    - `alloc_size` is the size of the table *allocated* (not *used*), and is
+//       always a power of 2 at least as big as `MinAllocatedSize`.
+//    - `storage` is a pointer to the storage for the `alloc_size` slots of the
+//       table, and never null.
+//    - `small_alloc_size` is the maximum `alloc_size` where the table is stored
+//       in the object itself instead of separately on the heap. In this case,
+//       `storage` points to `small_storage_`.
+//    - `growth_budget` is the number of entries that may be added before the
+//       table allocation is doubled. It is always
+//       `GrowthThresholdForAllocSize(alloc_size)` minus the number of
+//       non-empty (filled or deleted) slots. If it ever falls to 0, the table
+//       is grown to keep it greater than 0.
+//   There is also a "moved-from" state, in which `alloc_size` is 0 and
+//   `storage` is null; a moved-from table may only be reinitialized or
+//   destroyed. Since the table doesn't track the exact number of filled
+//   entries, it doesn't support a container-style `size` API.
+//
+// - There is no direct iterator support because of the complexity of embedding
+//   the group-based metadata scanning into an iterator model. Instead, there is
+//   just a for-each method that is passed a lambda to observe all entries. The
+//   order of this observation is also not guaranteed.
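+//
+// As a small illustration of the metadata encoding described above (the
+// helpers referenced here are `HashCode::ExtractIndexAndTag` from `hashing.h`
+// and the constants in `raw_hashtable_metadata_group.h`), a present slot's
+// metadata byte is formed from the key's hash roughly as:
+//
+//   HashCode hash = key_context.HashKey(key, ComputeSeed());
+//   auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+//   metadata[index] = tag | MetadataGroup::PresentMask;
+//
+// while empty and deleted slots use the two special non-present byte values.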
+namespace Carbon::RawHashtable {
+
+// If allocating storage, allocate a minimum of one cacheline of group metadata
+// or a minimum of one group, whichever is larger.
+constexpr ssize_t MinAllocatedSize = std::max<ssize_t>(64, MaxGroupSize);
+
+// An entry in the hashtable storage, holding a `KeyT` and a `ValueT` object.
+//
+// Allows manual construction, destruction, and access to these values so we can
+// create arrays of the entries prior to populating them with actual keys and
+// values.
+template <typename KeyT, typename ValueT>
+struct StorageEntry {
+  static constexpr bool IsTriviallyDestructible =
+      std::is_trivially_destructible_v<KeyT> &&
+      std::is_trivially_destructible_v<ValueT>;
+
+  static constexpr bool IsTriviallyRelocatable =
+      IsTriviallyDestructible && std::is_trivially_move_constructible_v<KeyT> &&
+      std::is_trivially_move_constructible_v<ValueT>;
+
+  auto key() const -> const KeyT& {
+    // Ensure we don't need more alignment than available. Inside a method body
+    // to apply to the complete type.
+    static_assert(
+        alignof(StorageEntry) <= MinAllocatedSize,
+        "The minimum allocated size turns into the alignment of our array of "
+        "storage entries as they follow the metadata byte array.");
+
+    return *std::launder(reinterpret_cast<const KeyT*>(&key_storage));
+  }
+  auto key() -> KeyT& {
+    return const_cast<KeyT&>(const_cast<const StorageEntry*>(this)->key());
+  }
+
+  auto value() const -> const ValueT& {
+    return *std::launder(reinterpret_cast<const ValueT*>(&value_storage));
+  }
+  auto value() -> ValueT& {
+    return const_cast<ValueT&>(const_cast<const StorageEntry*>(this)->value());
+  }
+
+  // We handle destruction and move manually as we only want to expose distinct
+  // `KeyT` and `ValueT` subobjects to user code that may need to do in-place
+  // construction. As a consequence, this struct only provides the storage and
+  // we have to manually manage the construction, move, and destruction of the
+  // objects.
+  auto Destroy() -> void {
+    static_assert(!IsTriviallyDestructible,
+                  "Should never instantiate when trivial!");
+    key().~KeyT();
+    value().~ValueT();
+  }
+
+  auto CopyFrom(const StorageEntry& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(entry.key());
+      new (&value_storage) ValueT(entry.value());
+    }
+  }
+
+  // Move from an expiring entry and destroy that entry's key and value.
+  // Optimizes to directly use `memcpy` when correct.
+  auto MoveFrom(StorageEntry&& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(std::move(entry.key()));
+      entry.key().~KeyT();
+      new (&value_storage) ValueT(std::move(entry.value()));
+      entry.value().~ValueT();
+    }
+  }
+
+  alignas(KeyT) std::byte key_storage[sizeof(KeyT)];
+  alignas(ValueT) std::byte value_storage[sizeof(ValueT)];
+};
+
+// A specialization of the storage entry for sets without a distinct value type.
+// Somewhat duplicative of the key-value version, but C++ specialization rules
+// make it hard to avoid the duplication.
+template <typename KeyT>
+struct StorageEntry<KeyT, void> {
+  static constexpr bool IsTriviallyDestructible =
+      std::is_trivially_destructible_v<KeyT>;
+
+  static constexpr bool IsTriviallyRelocatable =
+      IsTriviallyDestructible && std::is_trivially_move_constructible_v<KeyT>;
+
+  auto key() const -> const KeyT& {
+    // Ensure we don't need more alignment than available.
+    static_assert(
+        alignof(StorageEntry) <= MinAllocatedSize,
+        "The minimum allocated size turns into the alignment of our array of "
+        "storage entries as they follow the metadata byte array.");
+
+    return *std::launder(reinterpret_cast<const KeyT*>(&key_storage));
+  }
+  auto key() -> KeyT& {
+    return const_cast<KeyT&>(const_cast<const StorageEntry*>(this)->key());
+  }
+
+  auto Destroy() -> void {
+    static_assert(!IsTriviallyDestructible,
+                  "Should never instantiate when trivial!");
+    key().~KeyT();
+  }
+
+  auto CopyFrom(const StorageEntry& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(entry.key());
+    }
+  }
+
+  auto MoveFrom(StorageEntry&& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(std::move(entry.key()));
+      entry.key().~KeyT();
+    }
+  }
+
+  alignas(KeyT) std::byte key_storage[sizeof(KeyT)];
+};
+
+// A placeholder empty type used to model pointers to the allocated buffer of
+// storage.
+//
+// The allocated storage doesn't have a meaningful static layout -- it consists
+// of an array of metadata groups followed by an array of storage entries.
+// However, we want to be able to type pointers to this storage, and so we use
+// pointers to this placeholder type for that purpose.
+//
+// This is a complete, empty type so that it can be used as a base class of a
+// specific concrete storage type for compile-time sized storage.
+struct Storage {};
+
+// Forward declaration to support friending, see the definition below.
+template <typename KeyT, typename ValueT = void,
+          typename InputKeyContextT = DefaultKeyContext>
+class BaseImpl;
+
+// Implementation helper for defining a read-only view type for a hashtable.
+//
+// A specific user-facing hashtable view type should derive privately from this
+// type, and forward the implementation of its interface to functions in this
+// type.
+//
+// The methods available to user-facing hashtable types are `protected`, and
+// where they are expected to directly map to a public API, they are named with
+// an `Impl` suffix. The suffix ensures derived types don't simply `using` these
+// low-level APIs into their interface, but instead declare their own methods
+// and implement them by forwarding to these APIs. We don't want users to have
+// to read these implementation details to understand their container's API, so
+// none of these methods should be `using`-ed into the user-facing types.
+//
+// Some of the types are just convenience aliases that aren't important to
+// surface to readers as part of the user-facing type's API, so those are
+// reasonable to add via a `using`.
+//
+// Some methods are used by other parts of the raw hashtable implementation.
+// Those are kept `private` and where necessary the other components of the raw
+// hashtable implementation are friended to give access to them.
+template <typename InputKeyT, typename InputValueT = void,
+          typename InputKeyContextT = DefaultKeyContext>
+class ViewImpl {
+ protected:
+  using KeyT = InputKeyT;
+  using ValueT = InputValueT;
+  using KeyContextT = InputKeyContextT;
+  using EntryT = StorageEntry<KeyT, ValueT>;
+
+  friend class BaseImpl<KeyT, ValueT, KeyContextT>;
+
+  // Make more-`const` types friends to enable conversions that add `const`.
+  friend class ViewImpl<const KeyT, ValueT, KeyContextT>;
+  friend class ViewImpl<KeyT, const ValueT, KeyContextT>;
+  friend class ViewImpl<const KeyT, const ValueT, KeyContextT>;
+
+  ViewImpl() = default;
+
+  // Support adding `const` to either key or value type of some other view.
+  template <typename OtherKeyT, typename OtherValueT>
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  ViewImpl(ViewImpl<OtherKeyT, OtherValueT, KeyContextT> other_view)
+    requires(std::same_as<KeyT, OtherKeyT> ||
+             std::same_as<KeyT, const OtherKeyT>) &&
+                (std::same_as<ValueT, OtherValueT> ||
+                 std::same_as<ValueT, const OtherValueT>)
+      : alloc_size_(other_view.alloc_size_), storage_(other_view.storage_) {}
+
+  // Looks up an entry in the hashtable and returns its address or null if not
+  // present.
+  template <typename LookupKeyT>
+  auto LookupEntry(LookupKeyT lookup_key, KeyContextT key_context) const
+      -> EntryT*;
+
+  // Calls `entry_callback` for each entry in the hashtable. All the entries
+  // within a specific group are visited first, and then `group_callback` is
+  // called on the group itself. The `group_callback` is typically only used by
+  // the internals of the hashtable.
+  template <typename EntryCallbackT, typename GroupCallbackT>
+  auto ForEachEntry(EntryCallbackT entry_callback,
+                    GroupCallbackT group_callback) const -> void;
+
+  // Counts the number of keys in the hashtable that required probing beyond the
+  // initial group.
+  auto CountProbedKeys(KeyContextT key_context) const -> ssize_t;
+
+ private:
+  ViewImpl(ssize_t alloc_size, Storage* storage)
+      : alloc_size_(alloc_size), storage_(storage) {}
+
+  // Computes the offset from the metadata array to the entries array for a
+  // given size. This is trivial, but we use this routine to enforce invariants
+  // on the sizes.
+  static constexpr auto EntriesOffset(ssize_t alloc_size) -> ssize_t {
+    CARBON_DCHECK(llvm::isPowerOf2_64(alloc_size))
+        << "Size must be a power of two for a hashed buffer!";
+    // The size is always a power of two, and we prevent any too-small sizes,
+    // so the size being a power of two provides the needed alignment. As a
+    // result, the offset is exactly the size. We validate this here to catch
+    // alignment bugs early.
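+    // For example, with `alloc_size == 64` the metadata bytes occupy byte
+    // offsets [0, 64) and the entries array begins at byte offset 64.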
+    CARBON_DCHECK(static_cast<uint64_t>(alloc_size) ==
+                  llvm::alignTo<alignof(EntryT)>(alloc_size));
+    return alloc_size;
+  }
+
+  auto metadata() const -> uint8_t* {
+    return reinterpret_cast<uint8_t*>(storage_);
+  }
+  auto entries() const -> EntryT* {
+    return reinterpret_cast<EntryT*>(reinterpret_cast<std::byte*>(storage_) +
+                                     EntriesOffset(alloc_size_));
+  }
+
+  ssize_t alloc_size_;
+  Storage* storage_;
+};
+
+// Implementation helper for defining a read-write base type for a hashtable
+// that type-erases any SSO buffer.
+//
+// A specific user-facing hashtable base type should derive using *`protected`*
+// inheritance from this type, and forward the implementation of its interface
+// to functions in this type.
+//
+// Other than the use of `protected` inheritance, the patterns for this type,
+// and how to build user-facing hashtable base types from it, mirror those of
+// `ViewImpl`. See its documentation for more details.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+class BaseImpl {
+ protected:
+  using KeyT = InputKeyT;
+  using ValueT = InputValueT;
+  using KeyContextT = InputKeyContextT;
+  using ViewImplT = ViewImpl<KeyT, ValueT, KeyContextT>;
+  using EntryT = typename ViewImplT::EntryT;
+
+  BaseImpl(int small_alloc_size, Storage* small_storage)
+      : small_alloc_size_(small_alloc_size) {
+    CARBON_CHECK(small_alloc_size >= 0);
+    Construct(small_storage);
+  }
+  // Only used for copying and moving, and leaves storage uninitialized.
+  BaseImpl(ssize_t alloc_size, int growth_budget, int small_alloc_size)
+      : view_impl_(alloc_size, nullptr),
+        growth_budget_(growth_budget),
+        small_alloc_size_(small_alloc_size) {}
+  ~BaseImpl();
+
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator ViewImplT() const { return view_impl(); }
+
+  auto view_impl() const -> ViewImplT { return view_impl_; }
+
+  // Looks up the provided key in the hashtable. If found, returns a pointer to
+  // that entry and `false`.
+  //
+  // If not found, will locate an empty entry for inserting into, set the
+  // metadata for that entry, and return a pointer to the entry and `true`. When
+  // necessary, this will grow the hashtable to cause there to be sufficient
+  // empty entries.
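+  //
+  // A rough sketch of how a derived map type is expected to use this
+  // (mirroring the pattern in `map.h`; the value construction shown is
+  // hypothetical):
+  //
+  //   auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  //   if (inserted) {
+  //     new (&entry->key_storage) KeyT(lookup_key);
+  //     new (&entry->value_storage) ValueT(/*...*/);
+  //   }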
+  template <typename LookupKeyT>
+  auto InsertImpl(LookupKeyT lookup_key, KeyContextT key_context)
+      -> std::pair<EntryT*, bool>;
+
+  // Looks up the entry in the hashtable, and if found destroys the entry and
+  // returns `true`. If not found, returns `false`.
+  //
+  // Does not release any memory, just leaves a tombstone behind so this entry
+  // cannot be found and the slot can in theory be re-used.
+  template <typename LookupKeyT>
+  auto EraseImpl(LookupKeyT lookup_key, KeyContextT key_context) -> bool;
+
+  // Erases all entries in the hashtable but leaves the allocated storage.
+  auto ClearImpl() -> void;
+
+ private:
+  template <typename InputBaseT, ssize_t SmallSize>
+  friend class TableImpl;
+
+  static constexpr ssize_t Alignment = std::max<ssize_t>(
+      {alignof(MetadataGroup), alignof(StorageEntry<KeyT, ValueT>)});
+
+  // Implementation of inline small storage for the provided key type, value
+  // type, and small size. Specialized for a zero small size to be an empty
+  // struct.
+  template <ssize_t SmallSize>
+  struct SmallStorage : Storage {
+    alignas(Alignment) uint8_t metadata[SmallSize];
+    mutable StorageEntry<KeyT, ValueT> entries[SmallSize];
+  };
+  // Specialized storage with no inline buffer to avoid any extra alignment.
+  template <>
+  struct SmallStorage<0> {};
+
+  static constexpr auto AllocByteSize(ssize_t alloc_size) -> ssize_t {
+    return ViewImplT::EntriesOffset(alloc_size) + sizeof(EntryT) * alloc_size;
+  }
+  static auto Allocate(ssize_t alloc_size) -> Storage*;
+  static auto Deallocate(Storage* storage, ssize_t alloc_size) -> void;
+
+  auto growth_budget() const -> ssize_t { return growth_budget_; }
+  auto alloc_size() const -> ssize_t { return view_impl_.alloc_size_; }
+  auto alloc_size() -> ssize_t& { return view_impl_.alloc_size_; }
+  auto storage() const -> Storage* { return view_impl_.storage_; }
+  auto storage() -> Storage*& { return view_impl_.storage_; }
+  auto metadata() const -> uint8_t* { return view_impl_.metadata(); }
+  auto entries() const -> EntryT* { return view_impl_.entries(); }
+  auto small_alloc_size() const -> ssize_t {
+    return static_cast<unsigned>(small_alloc_size_);
+  }
+  auto is_small() const -> bool { return alloc_size() <= small_alloc_size(); }
+
+  auto Construct(Storage* small_storage) -> void;
+  auto Destroy() -> void;
+
+  template <typename LookupKeyT>
+  auto InsertIntoEmpty(LookupKeyT lookup_key, KeyContextT key_context)
+      -> EntryT*;
+
+  static auto ComputeNextAllocSize(ssize_t old_alloc_size) -> ssize_t;
+  static auto GrowthThresholdForAllocSize(ssize_t alloc_size) -> ssize_t;
+
+  template <typename LookupKeyT>
+  auto GrowAndInsert(LookupKeyT lookup_key, KeyContextT key_context) -> EntryT*;
+
+  ViewImplT view_impl_;
+  int growth_budget_;
+  int small_alloc_size_;
+};
+
+// Implementation helper for defining a hashtable type with an SSO buffer.
+//
+// A specific user-facing hashtable should derive privately from this
+// type, and forward the implementation of its interface to functions in this
+// type. It should provide the corresponding user-facing hashtable base type as
+// the `InputBaseT` type parameter (rather than a key/value pair), and this type
+// will in turn derive from that provided base type. This allows derived-to-base
+// conversion from the user-facing hashtable type to the user-facing hashtable
+// base type. And it does so keeping the inheritance linear. The resulting
+// linear inheritance hierarchy for a `Map<K, T>` type will look like:
+//
+//   Map<K, T>
+//    ↓
+//   TableImpl<MapBase<K, T>>
+//    ↓
+//   MapBase<K, T>
+//    ↓
+//   BaseImpl<K, T>
+//
+// Other than this inheritance technique, the patterns for this type, and how to
+// build user-facing hashtable types from it, mirror those of `ViewImpl`. See
+// its documentation for more details.
+template <typename InputBaseT, ssize_t SmallSize>
+class TableImpl : public InputBaseT {
+ protected:
+  using BaseT = InputBaseT;
+
+  TableImpl() : BaseT(SmallSize, small_storage()) {}
+  TableImpl(const TableImpl& arg);
+  TableImpl(TableImpl&& arg) noexcept;
+
+  // Resets the hashtable to its initial state, clearing all entries and
+  // releasing all memory. If the hashtable had an SSO buffer, that is restored
+  // as the storage. Otherwise, a minimum sized table storage is allocated.
+  auto ResetImpl() -> void;
+
+ private:
+  using KeyT = BaseT::KeyT;
+  using ValueT = BaseT::ValueT;
+  using EntryT = BaseT::EntryT;
+  using SmallStorage = BaseT::template SmallStorage<SmallSize>;
+
+  auto small_storage() const -> Storage*;
+
+  [[no_unique_address]] mutable SmallStorage small_storage_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Only implementation details below this point.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Computes a seed that provides a small amount of entropy from ASLR where
+// available with minimal cost. The priority is speed, and this computes the
+// entropy in a way that doesn't require loading from memory, merely accessing
+// entropy already available without accessing memory.
+inline auto ComputeSeed() -> uint64_t {
+  // A global variable whose address is used as a seed. This allows ASLR to
+  // introduce some variation in hashtable ordering when enabled via the code
+  // model for globals.
+  extern volatile std::byte global_addr_seed;
+
+  return reinterpret_cast<uint64_t>(&global_addr_seed);
+}
+
+inline auto ComputeProbeMaskFromSize(ssize_t size) -> size_t {
+  CARBON_DCHECK(llvm::isPowerOf2_64(size))
+      << "Size must be a power of two for a hashed buffer!";
+  // Since `size` is a power of two, we can make sure the probes are less
+  // than `size` by making the mask `size - 1`. We also mask off the low
+  // bits so the probes are a multiple of the size of the groups of entries.
+  return (size - 1) & ~GroupMask;
+}
+
+// This class handles building a sequence of probe indices from a given
+// starting point, including both the quadratic growth and masking the index
+// to stay within the bucket array size. The starting point doesn't need to be
+// clamped to the size ahead of time (or even be positive); we do that
+// internally.
+//
+// For reference on quadratic probing:
+// https://en.wikipedia.org/wiki/Quadratic_probing
+//
+// We compute the quadratic probe index incrementally, but we can also compute
+// it mathematically and will check that the incremental result matches our
+// mathematical expectation. We use the quadratic probing formula of:
+//
+//   p(start, step) = (start + (step + step^2) / 2) (mod size / GroupSize)
+//
+// However, we compute it incrementally and scale all the variables by the group
+// size so it can be used as an index without an additional multiplication.
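+//
+// As a concrete (illustrative) example, with `GroupSize == 16` and a table of
+// `size == 64`, the mask is 48 and a probe starting at group index 32 visits:
+//
+//   step 0: 32
+//   step 1: (32 + 16) & 48 == 48
+//   step 2: (48 + 32) & 48 == 16
+//   step 3: (16 + 48) & 48 == 0
+//
+// touching each of the four groups exactly once before wrapping.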
+class ProbeSequence {
+ public:
+  ProbeSequence(ssize_t start, ssize_t size) {
+    mask_ = ComputeProbeMaskFromSize(size);
+    p_ = start & mask_;
+#ifndef NDEBUG
+    start_ = start & mask_;
+    size_ = size;
+#endif
+  }
+
+  void Next() {
+    step_ += GroupSize;
+    p_ = (p_ + step_) & mask_;
+#ifndef NDEBUG
+    // Verify against the quadratic formula we expect to be following by scaling
+    // everything down by `GroupSize`.
+    CARBON_DCHECK(
+        (p_ / GroupSize) ==
+        ((start_ / GroupSize +
+          (step_ / GroupSize + (step_ / GroupSize) * (step_ / GroupSize)) / 2) %
+         (size_ / GroupSize)))
+        << "Index in probe sequence does not match the expected formula.";
+    CARBON_DCHECK(step_ < size_) << "We necessarily visit all groups, so we "
+                                    "can't have more probe steps than groups.";
+#endif
+  }
+
+  auto index() const -> ssize_t { return p_; }
+
+ private:
+  ssize_t step_ = 0;
+  size_t mask_;
+  ssize_t p_;
+#ifndef NDEBUG
+  ssize_t start_;
+  ssize_t size_;
+#endif
+};
+
+// TODO: Evaluate keeping this outlined to see if macro benchmarks observe the
+// same perf hit as micro benchmarks.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::LookupEntry(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> EntryT* {
+  // Prefetch with a "low" temporal locality as we're primarily expecting a
+  // brief use of the storage and then to return to application code.
+  __builtin_prefetch(storage_, /*read*/ 0, /*low-locality*/ 1);
+
+  ssize_t local_size = alloc_size_;
+  CARBON_DCHECK(local_size > 0);
+
+  uint8_t* local_metadata = metadata();
+  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
+  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+
+  EntryT* local_entries = entries();
+
+  // Walk through groups of entries using a quadratic probe starting from
+  // `hash_index`.
+  ProbeSequence s(hash_index, local_size);
+  do {
+    ssize_t group_index = s.index();
+
+    // For each group, match the tag against the metadata to extract the
+    // potentially matching entries within the group.
+    MetadataGroup g = MetadataGroup::Load(local_metadata, group_index);
+    auto metadata_matched_range = g.Match(tag);
+    if (LLVM_LIKELY(metadata_matched_range)) {
+      // If any entries in this group potentially match based on their metadata,
+      // walk each candidate and compare its key to see if we have definitively
+      // found a match.
+      EntryT* group_entries = &local_entries[group_index];
+      auto byte_it = metadata_matched_range.begin();
+      auto byte_end = metadata_matched_range.end();
+      do {
+        EntryT* entry = byte_it.index_ptr(group_entries);
+        if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) {
+          __builtin_assume(entry != nullptr);
+          return entry;
+        }
+        ++byte_it;
+      } while (LLVM_UNLIKELY(byte_it != byte_end));
+    }
+
+    // We failed to find a matching entry in this bucket, so check if there are
+    // empty slots as that indicates we're done probing -- no later probed index
+    // could have a match.
+    auto empty_byte_matched_range = g.MatchEmpty();
+    if (LLVM_LIKELY(empty_byte_matched_range)) {
+      return nullptr;
+    }
+
+    s.Next();
+
+    // We use a weird construct of an "unlikely" condition of `true`. The goal
+    // is to get the compiler to not prioritize the back edge of the loop for
+    // code layout, and in at least some tests this seems to be an effective
+    // construct for achieving this.
+  } while (LLVM_UNLIKELY(true));
+}
+
+// Note that we force inlining here because we expect to be called with lambdas
+// that will in turn be inlined to form the loop body. We don't want function
+// boundaries within the loop for performance, and the degree of simplification
+// from inlining these callbacks may be difficult for the compiler to recognize
+// automatically.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename EntryCallbackT, typename GroupCallbackT>
+[[clang::always_inline]] auto
+ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::ForEachEntry(
+    EntryCallbackT entry_callback, GroupCallbackT group_callback) const
+    -> void {
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+
+  ssize_t local_size = alloc_size_;
+  for (ssize_t group_index = 0; group_index < local_size;
+       group_index += GroupSize) {
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+    auto present_matched_range = g.MatchPresent();
+    if (!present_matched_range) {
+      continue;
+    }
+    for (ssize_t byte_index : present_matched_range) {
+      entry_callback(local_entries[group_index + byte_index]);
+    }
+
+    group_callback(&local_metadata[group_index]);
+  }
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::CountProbedKeys(
+    KeyContextT key_context) const -> ssize_t {
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+  ssize_t local_size = alloc_size_;
+  ssize_t count = 0;
+  for (ssize_t group_index = 0; group_index < local_size;
+       group_index += GroupSize) {
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+    auto present_matched_range = g.MatchPresent();
+    for (ssize_t byte_index : present_matched_range) {
+      ssize_t index = group_index + byte_index;
+      HashCode hash =
+          key_context.HashKey(local_entries[index].key(), ComputeSeed());
+      ssize_t hash_index = hash.ExtractIndexAndTag<7>().first &
+                           ComputeProbeMaskFromSize(local_size);
+      count += static_cast<ssize_t>(hash_index != group_index);
+    }
+  }
+  return count;
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::~BaseImpl() {
+  Destroy();
+}
+
+// TODO: Evaluate whether it is worth forcing this out-of-line given the
+// reasonable ABI boundary it forms and large volume of code necessary to
+// implement it.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::InsertImpl(
+    LookupKeyT lookup_key, KeyContextT key_context)
+    -> std::pair<EntryT*, bool> {
+  CARBON_DCHECK(alloc_size() > 0);
+
+  uint8_t* local_metadata = metadata();
+
+  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
+  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+
+  // We re-purpose the empty control byte to signal no insert is needed to the
+  // caller. This is guaranteed to not be a control byte we're inserting.
+  // constexpr uint8_t NoInsertNeeded = Group::Empty;
+
+  ssize_t group_with_deleted_index;
+  MetadataGroup::MatchIndex deleted_match = {};
+
+  EntryT* local_entries = entries();
+
+  auto return_insert_at_index = [&](ssize_t index) -> std::pair<EntryT*, bool> {
+    // We'll need to insert at this index so set the control group byte to the
+    // proper value.
+    local_metadata[index] = tag | MetadataGroup::PresentMask;
+    return {&local_entries[index], true};
+  };
+
+  for (ProbeSequence s(hash_index, alloc_size());; s.Next()) {
+    ssize_t group_index = s.index();
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+
+    auto control_byte_matched_range = g.Match(tag);
+    if (control_byte_matched_range) {
+      EntryT* group_entries = &local_entries[group_index];
+      auto byte_it = control_byte_matched_range.begin();
+      auto byte_end = control_byte_matched_range.end();
+      do {
+        EntryT* entry = byte_it.index_ptr(group_entries);
+        if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) {
+          return {entry, false};
+        }
+        ++byte_it;
+      } while (LLVM_UNLIKELY(byte_it != byte_end));
+    }
+
+    // Track the first group with a deleted entry that we could insert over.
+    if (!deleted_match) {
+      deleted_match = g.MatchDeleted();
+      group_with_deleted_index = group_index;
+    }
+
+    // We failed to find a matching entry in this bucket, so check if there are
+    // no empty slots. In that case, we'll continue probing.
+    auto empty_match = g.MatchEmpty();
+    if (!empty_match) {
+      continue;
+    }
+    // Ok, we've finished probing without finding anything and need to insert
+    // instead.
+
+    // If we found a deleted slot, we don't need to continue the probe sequence
+    // in order to insert, so just bail. We want to ensure building up a table
+    // is fast, so we de-prioritize this case a bit. In practice this doesn't
+    // have too much of an effect.
+    if (LLVM_UNLIKELY(deleted_match)) {
+      return return_insert_at_index(group_with_deleted_index +
+                                    deleted_match.index());
+    }
+
+    // Otherwise, we're going to insert into an empty slot, which consumes
+    // growth budget. Check that we have the budget for that before we compute
+    // the exact index of the empty slot. Without growth budget we'll have to
+    // completely rehash anyway, so we can just bail here.
+    if (LLVM_UNLIKELY(growth_budget_ == 0)) {
+      return {GrowAndInsert(lookup_key, key_context), true};
+    }
+
+    --growth_budget_;
+    CARBON_DCHECK(growth_budget() >= 0)
+        << "Growth budget shouldn't have gone negative!";
+    return return_insert_at_index(group_index + empty_match.index());
+  }
+
+  CARBON_FATAL() << "We should never finish probing without finding the entry "
+                    "or an empty slot.";
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::EraseImpl(
+    LookupKeyT lookup_key, KeyContextT key_context) -> bool {
+  EntryT* entry = view_impl_.LookupEntry(lookup_key, key_context);
+  if (!entry) {
+    return false;
+  }
+
+  // If there are empty slots in this group then nothing will probe past this
+  // group looking for an entry so we can simply set this slot to empty as
+  // well. However, if every slot in this group is full, it might be part of
+  // a long probe chain that we can't disrupt. In that case we mark the slot's
+  // metadata as deleted to keep probes continuing past it.
+  //
+  // If we mark the slot as empty, we'll also need to increase the growth
+  // budget.
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+  ssize_t index = entry - local_entries;
+  ssize_t group_index = index & ~GroupMask;
+  auto g = MetadataGroup::Load(local_metadata, group_index);
+  auto empty_matched_range = g.MatchEmpty();
+  if (empty_matched_range) {
+    local_metadata[index] = MetadataGroup::Empty;
+    ++growth_budget_;
+  } else {
+    local_metadata[index] = MetadataGroup::Deleted;
+  }
+
+  if constexpr (!EntryT::IsTriviallyDestructible) {
+    entry->Destroy();
+  }
+
+  return true;
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::ClearImpl() -> void {
+  view_impl_.ForEachEntry(
+      [](EntryT& entry) {
+        if constexpr (!EntryT::IsTriviallyDestructible) {
+          entry.Destroy();
+        }
+      },
+      [](uint8_t* metadata_group) {
+        // Clear the group.
+        std::memset(metadata_group, 0, GroupSize);
+      });
+  growth_budget_ = GrowthThresholdForAllocSize(alloc_size());
+}
+
+// Allocates the appropriate memory layout for a table of the given
+// `alloc_size`, with space both for the metadata array and entries.
+//
+// The returned pointer *must* be deallocated by calling the below `Deallocate`
+// function with the same `alloc_size` as used here.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Allocate(
+    ssize_t alloc_size) -> Storage* {
+  return reinterpret_cast<Storage*>(__builtin_operator_new(
+      AllocByteSize(alloc_size), static_cast<std::align_val_t>(Alignment),
+      std::nothrow_t()));
+}
+
+// Deallocates a table's storage that was allocated with the `Allocate`
+// function.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Deallocate(
+    Storage* storage, ssize_t alloc_size) -> void {
+  ssize_t allocated_size = AllocByteSize(alloc_size);
+  // We don't need the size, but make sure it always compiles.
+  static_cast<void>(allocated_size);
+  __builtin_operator_delete(storage,
+#if __cpp_sized_deallocation
+                            allocated_size,
+#endif
+                            static_cast<std::align_val_t>(Alignment));
+}
+
+// Construct a table using the provided small storage if `small_alloc_size_` is
+// non-zero. If `small_alloc_size_` is zero, then `small_storage` won't be used
+// and can be null. Regardless, after this the storage pointer is non-null and
+// the size is non-zero so that we can directly begin inserting or querying the
+// table.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Construct(
+    Storage* small_storage) -> void {
+  if (small_alloc_size_ > 0) {
+    alloc_size() = small_alloc_size_;
+    storage() = small_storage;
+  } else {
+    // Directly allocate the initial buffer so that the hashtable is never in
+    // an empty state.
+    alloc_size() = MinAllocatedSize;
+    storage() = Allocate(MinAllocatedSize);
+  }
+  std::memset(metadata(), 0, alloc_size());
+  growth_budget_ = GrowthThresholdForAllocSize(alloc_size());
+}
+
+// Destroy the current table, releasing any memory used.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Destroy() -> void {
+  // Check for a moved-from state and don't do anything. Only a moved-from table
+  // has a zero size.
+  if (alloc_size() == 0) {
+    return;
+  }
+
+  // Destroy all the entries.
+  if constexpr (!EntryT::IsTriviallyDestructible) {
+    view_impl_.ForEachEntry([](EntryT& entry) { entry.Destroy(); },
+                            [](auto...) {});
+  }
+
+  // If small, nothing to deallocate.
+  if (is_small()) {
+    return;
+  }
+
+  // Just deallocate the storage without updating anything when destroying the
+  // object.
+  Deallocate(storage(), alloc_size());
+}
+
+// Optimized routine to insert a key into a table when that key *definitely*
+// isn't present in the table and the table *definitely* has a viable empty slot
+// (and growth space) to insert into before any deleted slots. When both of
+// these are true, typically just after growth, we can dramatically simplify the
+// insert position search.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::noinline]] auto
+BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::InsertIntoEmpty(
+    LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* {
+  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
+  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+
+  for (ProbeSequence s(hash_index, alloc_size());; s.Next()) {
+    ssize_t group_index = s.index();
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+
+    if (auto empty_match = g.MatchEmpty()) {
+      ssize_t index = group_index + empty_match.index();
+      local_metadata[index] = tag | MetadataGroup::PresentMask;
+      return &local_entries[index];
+    }
+
+    // Otherwise we continue probing.
+  }
+}
+
+// Apply our doubling growth strategy and (re-)check invariants around table
+// size.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::ComputeNextAllocSize(
+    ssize_t old_alloc_size) -> ssize_t {
+  CARBON_DCHECK(llvm::isPowerOf2_64(old_alloc_size))
+      << "Expected a power of two!";
+  ssize_t new_alloc_size;
+  bool overflow = __builtin_mul_overflow(old_alloc_size, 2, &new_alloc_size);
+  CARBON_CHECK(!overflow) << "Computing the new size overflowed `ssize_t`!";
+  return new_alloc_size;
+}
+
+// Compute the growth threshold for a given size.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT,
+              InputKeyContextT>::GrowthThresholdForAllocSize(ssize_t alloc_size)
+    -> ssize_t {
+  // We use a 7/8ths load factor to trigger growth.
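+  // As a sanity-check example, an `alloc_size` of 64 yields a growth threshold
+  // of 64 - 64 / 8 = 56 entries.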
+  return alloc_size - alloc_size / 8;
+}
+
+// Grow the hashtable to create space and then insert into it. Returns the
+// selected insertion entry. Never returns null. In addition to growing and
+// selecting the insertion entry, this routine updates the metadata array so
+// that this function can be directly called and the result returned from
+// `InsertImpl`.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::noinline]] auto
+BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::GrowAndInsert(
+    LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* {
+  // We collect the probed elements in a small vector for re-insertion. It is
+  // tempting to reuse the already allocated storage, but doing so appears to
+  // be a (very slight) performance regression. These are relatively rare and
+  // storing them into the existing storage creates stores to the same regions
+  // of memory we're reading. Moreover, it requires moving both the key and the
+  // value twice, and doing the `memcpy` widening for relocatable types before
+  // the group walk rather than after it. In practice, between the statistical
+  // rarity of probed entries and the large inline buffer used here on the
+  // stack, temporary additional storage handles this most efficiently.
+  llvm::SmallVector<ssize_t, 128> probed_indices;
+
+  // We grow into freshly allocated storage so that both the new and old tables
+  // remain fully usable until all the entries are moved over. However, we
+  // directly manipulate the internals to short-circuit many aspects of the
+  // growth.
+  ssize_t old_size = alloc_size();
+  CARBON_DCHECK(old_size > 0);
+  CARBON_DCHECK(growth_budget_ == 0);
+
+  bool old_small = is_small();
+  Storage* old_storage = storage();
+  uint8_t* old_metadata = metadata();
+  EntryT* old_entries = entries();
+
+#ifndef NDEBUG
+  // Count how many of the old table slots will end up being empty after we
+  // grow the table. This includes both the currently empty slots and the
+  // deleted slots, because we clear deleted slots to empty and re-insert
+  // everything that required probing.
+  ssize_t debug_empty_count =
+      llvm::count(llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Empty);
+  ssize_t debug_deleted_count = llvm::count(
+      llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Deleted);
+  CARBON_DCHECK(debug_empty_count >=
+                (old_size - GrowthThresholdForAllocSize(old_size)))
+      << "debug_empty_count: " << debug_empty_count
+      << ", debug_deleted_count: " << debug_deleted_count
+      << ", size: " << old_size;
+#endif
+
+  // Compute the new size and allocate fresh storage for it.
+  ssize_t new_size = ComputeNextAllocSize(old_size);
+  alloc_size() = new_size;
+  storage() = Allocate(new_size);
+  growth_budget_ = GrowthThresholdForAllocSize(new_size);
+
+  // Now extract the new components of the table.
+  uint8_t* new_metadata = metadata();
+  EntryT* new_entries = entries();
+
+  // We always double the size when we grow. This allows an important
+  // optimization -- we're adding exactly one more high bit to the hash-computed
+  // index for each entry. This in turn means we can classify every entry in the
+  // table into three cases:
+  //
+  // 1) The new high bit is zero, the entry is at the same index in the new
+  //    table as the old.
+  //
+  // 2) The new high bit is one, the entry is at the old index plus the old
+  //    size.
+  //
+  // 3) The entry's current index doesn't match the initial hash index because
+  //    it required some amount of probing to find an empty slot.
+  //
+  // The design of the hash table tries to minimize how many entries fall into
+  // case (3), so we expect the vast majority of entries to be in (1) or (2).
+  // This lets us model growth notionally as duplicating the hash table,
+  // clearing out the empty slots, and inserting any probed elements.
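+  //
+  // As a concrete (illustrative) example with an old size of 64 growing to
+  // 128: an entry hashing to the group at index 16 lands either at group
+  // index 16 (case 1) or group index 80 (case 2) in the new table, depending
+  // on the newly used hash bit, while an entry that had to probe past its
+  // hash group into a later group falls into case (3) and is re-inserted
+  // below.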
+
+  ssize_t count = 0;
+  for (ssize_t group_index = 0; group_index < old_size;
+       group_index += GroupSize) {
+    auto low_g = MetadataGroup::Load(old_metadata, group_index);
+    // Make sure to match present elements first to enable pipelining with
+    // clearing.
+    auto present_matched_range = low_g.MatchPresent();
+    low_g.ClearDeleted();
+    MetadataGroup high_g;
+    if constexpr (MetadataGroup::FastByteClear) {
+      // When we have a fast byte clear, we can update the metadata for the
+      // growth in-register and store at the end.
+      high_g = low_g;
+    } else {
+      // If we don't have a fast byte clear, we can store the metadata group
+      // eagerly here and overwrite bytes with a byte store below instead of
+      // clearing the byte in-register.
+      low_g.Store(new_metadata, group_index);
+      low_g.Store(new_metadata, group_index | old_size);
+    }
+    for (ssize_t byte_index : present_matched_range) {
+      ++count;
+      ssize_t old_index = group_index + byte_index;
+      if constexpr (!MetadataGroup::FastByteClear) {
+        CARBON_DCHECK(new_metadata[old_index] == old_metadata[old_index]);
+        CARBON_DCHECK(new_metadata[old_index | old_size] ==
+                      old_metadata[old_index]);
+      }
+      HashCode hash =
+          key_context.HashKey(old_entries[old_index].key(), ComputeSeed());
+      ssize_t old_hash_index = hash.ExtractIndexAndTag<7>().first &
+                               ComputeProbeMaskFromSize(old_size);
+      if (LLVM_UNLIKELY(old_hash_index != group_index)) {
+        probed_indices.push_back(old_index);
+        if constexpr (MetadataGroup::FastByteClear) {
+          low_g.ClearByte(byte_index);
+          high_g.ClearByte(byte_index);
+        } else {
+          new_metadata[old_index] = MetadataGroup::Empty;
+          new_metadata[old_index | old_size] = MetadataGroup::Empty;
+        }
+        continue;
+      }
+      ssize_t new_index = hash.ExtractIndexAndTag<7>().first &
+                          ComputeProbeMaskFromSize(new_size);
+      CARBON_DCHECK(new_index == old_hash_index ||
+                    new_index == (old_hash_index | old_size));
+      // Toggle the newly added bit of the index to get to the other possible
+      // target index.
+      if constexpr (MetadataGroup::FastByteClear) {
+        (new_index == old_hash_index ? high_g : low_g).ClearByte(byte_index);
+        new_index += byte_index;
+      } else {
+        new_index += byte_index;
+        new_metadata[new_index ^ old_size] = MetadataGroup::Empty;
+      }
+
+      // If we need to explicitly move (and destroy) the key or value, do so
+      // here where we already know its target.
+      if constexpr (!EntryT::IsTriviallyRelocatable) {
+        new_entries[new_index].MoveFrom(std::move(old_entries[old_index]));
+      }
+    }
+    if constexpr (MetadataGroup::FastByteClear) {
+      low_g.Store(new_metadata, group_index);
+      high_g.Store(new_metadata, (group_index | old_size));
+    }
+  }
+  CARBON_DCHECK((count - static_cast<ssize_t>(probed_indices.size())) ==
+                (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                                        MetadataGroup::Empty)));
+#ifndef NDEBUG
+  CARBON_DCHECK((debug_empty_count + debug_deleted_count) ==
+                (old_size - count));
+  CARBON_DCHECK(llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                            MetadataGroup::Empty) ==
+                debug_empty_count + debug_deleted_count +
+                    static_cast<ssize_t>(probed_indices.size()) + old_size);
+#endif
+
+  // If the keys or values are trivially relocatable, we do a bulk memcpy of
+  // them into place. This will copy them into both possible locations, which is
+  // fine. One will be empty and clobbered if reused or ignored. The other will
+  // be the one used. This might seem to require that making two copies of an
+  // entry is valid, but no such requirement exists: the result is exactly the
+  // same storage as if we had copied into the wrong location first and then
+  // again into the correct one. Only one copy is ever live and only one is
+  // destroyed.
+  if constexpr (EntryT::IsTriviallyRelocatable) {
+    memcpy(new_entries, old_entries, old_size * sizeof(EntryT));
+    memcpy(new_entries + old_size, old_entries, old_size * sizeof(EntryT));
+  }
+
+  // We then need to do a normal insertion for anything that was probed before
+  // growth, but we know we'll find an empty slot, so leverage that.
+  for (ssize_t old_index : probed_indices) {
+    EntryT* new_entry =
+        InsertIntoEmpty(old_entries[old_index].key(), key_context);
+    new_entry->MoveFrom(std::move(old_entries[old_index]));
+  }
+  CARBON_DCHECK(count ==
+                (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                                        MetadataGroup::Empty)));
+  growth_budget_ -= count;
+  CARBON_DCHECK(growth_budget_ ==
+                (GrowthThresholdForAllocSize(new_size) -
+                 (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                                         MetadataGroup::Empty))));
+  CARBON_DCHECK(growth_budget_ > 0)
+      << "Must still have a growth budget after rehash!";
+
+  if (!old_small) {
+    // Old isn't a small buffer, so we need to deallocate it.
+    Deallocate(old_storage, old_size);
+  }
+
+  // And lastly insert the lookup_key into the newly grown table and return the
+  // selected entry for the caller to fill in.
+  --growth_budget_;
+  return InsertIntoEmpty(lookup_key, key_context);
+}
+
+template <typename InputBaseT, ssize_t SmallSize>
+TableImpl<InputBaseT, SmallSize>::TableImpl(const TableImpl& arg)
+    : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) {
+  CARBON_DCHECK(arg.small_alloc_size_ == SmallSize);
+
+  ssize_t local_size = arg.alloc_size();
+
+  if (SmallSize > 0 && arg.is_small()) {
+    CARBON_DCHECK(local_size == SmallSize);
+    this->storage() = small_storage();
+  } else {
+    this->storage() = BaseT::Allocate(local_size);
+  }
+
+  // Preserve which slot every entry is in, including tombstones in the
+  // metadata, in order to copy into the new table's storage without rehashing
+  // all of the keys. This is especially important as we don't have an easy way
+  // to access the key context needed for rehashing here.
+  uint8_t* local_metadata = this->metadata();
+  EntryT* local_entries = this->entries();
+  const uint8_t* local_arg_metadata = arg.metadata();
+  const EntryT* local_arg_entries = arg.entries();
+  memcpy(local_metadata, local_arg_metadata, local_size);
+
+  for (ssize_t group_index = 0; group_index < local_size;
+       group_index += GroupSize) {
+    auto g = MetadataGroup::Load(local_arg_metadata, group_index);
+    for (ssize_t byte_index : g.MatchPresent()) {
+      local_entries[group_index + byte_index].CopyFrom(
+          local_arg_entries[group_index + byte_index]);
+    }
+  }
+}
+
+// Puts the incoming table into a moved-from state that can be destroyed or
+// re-initialized but must not be used otherwise.
+template <typename InputBaseT, ssize_t SmallSize>
+TableImpl<InputBaseT, SmallSize>::TableImpl(TableImpl&& arg) noexcept
+    : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) {
+  CARBON_DCHECK(arg.small_alloc_size_ == SmallSize);
+
+  ssize_t local_size = arg.alloc_size();
+
+  if (SmallSize > 0 && arg.is_small()) {
+    CARBON_DCHECK(local_size == SmallSize);
+    this->storage() = small_storage();
+
+    // For small tables, we have to move the entries individually, as we can't
+    // take ownership of the incoming table's inline buffer. We do this
+    // preserving their slots and even tombstones to avoid rehashing.
+    uint8_t* local_metadata = this->metadata();
+    EntryT* local_entries = this->entries();
+    uint8_t* local_arg_metadata = arg.metadata();
+    EntryT* local_arg_entries = arg.entries();
+    memcpy(local_metadata, local_arg_metadata, local_size);
+    if (EntryT::IsTriviallyRelocatable) {
+      memcpy(local_entries, local_arg_entries, SmallSize * sizeof(EntryT));
+    } else {
+      for (ssize_t group_index = 0; group_index < local_size;
+           group_index += GroupSize) {
+        auto g = MetadataGroup::Load(local_arg_metadata, group_index);
+        for (ssize_t byte_index : g.MatchPresent()) {
+          local_entries[group_index + byte_index].MoveFrom(
+              std::move(local_arg_entries[group_index + byte_index]));
+        }
+      }
+    }
+  } else {
+    // Just point to the allocated storage.
+    this->storage() = arg.storage();
+  }
+
+  // Finally, put the incoming table into a moved-from state.
+  arg.alloc_size() = 0;
+  // Replace the pointer with null to ease debugging.
+  arg.storage() = nullptr;
+}
+
+// Reset a table to its original state, including releasing any allocated
+// memory.
+template <typename InputBaseT, ssize_t SmallSize>
+auto TableImpl<InputBaseT, SmallSize>::ResetImpl() -> void {
+  this->Destroy();
+
+  // Re-initialize the whole thing.
+  CARBON_DCHECK(this->small_alloc_size() == SmallSize);
+  this->Construct(small_storage());
+}
+
+template <typename InputBaseT, ssize_t SmallSize>
+auto TableImpl<InputBaseT, SmallSize>::small_storage() const -> Storage* {
+  if constexpr (SmallSize > 0) {
+    // Do a bunch of validation of the small size to establish our invariants
+    // when we know we have a non-zero small size.
+    static_assert(llvm::isPowerOf2_64(SmallSize),
+                  "SmallSize must be a power of two for a hashed buffer!");
+    static_assert(
+        SmallSize >= MaxGroupSize,
+        "We require all small sizes to multiples of the largest group "
+        "size supported to ensure it can be used portably.  ");
+    static_assert(
+        (SmallSize % MaxGroupSize) == 0,
+        "Small size must be a multiple of the max group size supported "
+        "so that we can allocate a whole number of groups.");
+    // Implied by the asserts on the max group size above.
+    static_assert(SmallSize >= GroupSize);
+    static_assert((SmallSize % GroupSize) == 0);
+
+    static_assert(SmallSize >= alignof(StorageEntry<KeyT, ValueT>),
+                  "Requested a small size that would require padding between "
+                  "metadata bytes and correctly aligned key and value types. "
+                  "Either a larger small size or a zero small size and heap "
+                  "allocation are required for this key and value type.");
+
+    static_assert(offsetof(SmallStorage, entries) == SmallSize,
+                  "Offset to entries in small size storage doesn't match "
+                  "computed offset!");
+
+    return &small_storage_;
+  } else {
+    static_assert(
+        sizeof(TableImpl) == sizeof(BaseT),
+        "Empty small storage caused a size difference and wasted space!");
+
+    return nullptr;
+  }
+}
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_H_

+ 383 - 0
common/raw_hashtable_benchmark_helpers.cpp

@@ -0,0 +1,383 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/raw_hashtable_benchmark_helpers.h"
+
+#include <cstddef>
+#include <forward_list>
+
+namespace Carbon::RawHashtable {
+
+// A local shuffle implementation built on Abseil to improve performance in
+// debug builds.
+template <typename T>
+static auto Shuffle(llvm::MutableArrayRef<T> data, absl::BitGen& gen) {
+  for (ssize_t i : llvm::seq<ssize_t>(0, data.size() - 1)) {
+    ssize_t j = absl::Uniform<ssize_t>(gen, 0, data.size() - i);
+    if (j != 0) {
+      std::swap(data[i], data[i + j]);
+    }
+  }
+}
+
+constexpr ssize_t NumChars = 64;
+static_assert(llvm::isPowerOf2_64(NumChars));
+
+// For benchmarking, we use short strings in a fixed distribution with common
+// characters. Real-world strings aren't uniform across ASCII or Unicode, etc.
+// And for *micro*-benchmarking we want to focus on the map overhead with short,
+// fast keys.
+static auto MakeChars() -> llvm::OwningArrayRef<char> {
+  llvm::OwningArrayRef<char> characters(NumChars);
+
+  // Start with `-` and `_`, and then add `a` - `z`, `A` - `Z`, and `0` - `9`.
+  characters[0] = '-';
+  characters[1] = '_';
+  ssize_t i = 2;
+  for (auto range :
+       {llvm::seq_inclusive('a', 'z'), llvm::seq_inclusive('A', 'Z'),
+        llvm::seq_inclusive('0', '9')}) {
+    for (char c : range) {
+      characters[i] = c;
+      ++i;
+    }
+  }
+  CARBON_CHECK(i == NumChars) << "Expected exactly " << NumChars
+                              << " characters, got " << i << " instead!";
+  return characters;
+}
+
+constexpr ssize_t NumFourCharStrs = NumChars * NumChars * NumChars * NumChars;
+static_assert(llvm::isPowerOf2_64(NumFourCharStrs));
+
+// Compute every 4-character string in a shuffled array. This is a little
+// memory intensive -- 64 MiB -- but ends up being much cheaper by letting us
+// reliably select a unique 4-character sequence to avoid collisions.
+static auto MakeFourCharStrs(llvm::ArrayRef<char> characters, absl::BitGen& gen)
+    -> llvm::OwningArrayRef<std::array<char, 4>> {
+  constexpr ssize_t NumCharsMask = NumChars - 1;
+  constexpr ssize_t NumCharsShift = llvm::CTLog2<NumChars>();
+  llvm::OwningArrayRef<std::array<char, 4>> four_char_strs(NumFourCharStrs);
+  for (auto [i, str] : llvm::enumerate(four_char_strs)) {
+    str[0] = characters[i & NumCharsMask];
+    i >>= NumCharsShift;
+    str[1] = characters[i & NumCharsMask];
+    i >>= NumCharsShift;
+    str[2] = characters[i & NumCharsMask];
+    i >>= NumCharsShift;
+    CARBON_CHECK((i & ~NumCharsMask) == 0);
+    str[3] = characters[i];
+  }
+  Shuffle(four_char_strs, gen);
+  return four_char_strs;
+}
+
+constexpr ssize_t NumRandomChars = static_cast<ssize_t>(64) * 1024;
+
+// Create a pool of random characters to sample from rather than computing this
+// for every string, which is very slow in debug builds. We also pad this pool
+// with the max length so we can pull the full length from the end to simplify
+// the logic when wrapping around the pool.
+static auto MakeRandomChars(llvm::ArrayRef<char> characters, int max_length,
+                            absl::BitGen& gen) -> llvm::OwningArrayRef<char> {
+  llvm::OwningArrayRef<char> random_chars(NumRandomChars + max_length);
+  for (char& c : random_chars) {
+    c = characters[absl::Uniform<ssize_t>(gen, 0, NumChars)];
+  }
+  return random_chars;
+}
+
+// Make a small vector of pointers into a single allocation of raw strings. The
+// allocated memory is expected to leak and must be transitively referenced by a
+// global. Each string has size `length` (which must be >= 4), and there are
+// `key_count` keys in the result. Each key is filled from `random_chars` up to
+// its last 4 characters. The last four characters of each string will be
+// taken sequentially from `four_char_strs` from some random start position to
+// ensure no duplicate keys are produced.
+static auto MakeRawStrKeys(ssize_t length, ssize_t key_count,
+                           llvm::ArrayRef<std::array<char, 4>> four_char_strs,
+                           llvm::ArrayRef<char> random_chars, absl::BitGen& gen)
+    -> llvm::SmallVector<const char*> {
+  llvm::SmallVector<const char*> raw_keys;
+  CARBON_CHECK(length >= 4);
+  ssize_t prefix_length = length - 4;
+
+  // Select a random start for indexing our four character strings.
+  ssize_t four_char_index = absl::Uniform<ssize_t>(gen, 0, NumFourCharStrs);
+
+  // Select a random start for the prefix random characters.
+  ssize_t random_chars_index = absl::Uniform<ssize_t>(gen, 0, NumRandomChars);
+
+  // Do a single memory allocation for all the keys of this length to
+  // avoid an excessive number of small and fragmented allocations. This
+  // memory is intentionally leaked as the keys are global and will
+  // themselves point into it.
+  char* key_text = new char[key_count * length];
+
+  // Reserve all the key space since we know how many we'll need.
+  raw_keys.reserve(key_count);
+  for ([[gnu::unused]] ssize_t i : llvm::seq<ssize_t>(0, key_count)) {
+    memcpy(key_text, random_chars.data() + random_chars_index, prefix_length);
+    random_chars_index += prefix_length;
+    random_chars_index &= NumRandomChars - 1;
+    // Set the last four characters with this entry in the shuffled
+    // sequence.
+    memcpy(key_text + prefix_length, four_char_strs[four_char_index].data(), 4);
+    // Step through the shuffled sequence. We start at a random position,
+    // so we need to wrap around the end.
+    ++four_char_index;
+    four_char_index &= NumFourCharStrs - 1;
+
+    // And finally save the start pointer as one of our raw keys.
+    raw_keys.push_back(key_text);
+    key_text += length;
+  }
+  return raw_keys;
+}
+
+// Build up a large collection of random and unique string keys. This is
+// actually a relatively expensive operation due to needing to build all the
+// random string text. As a consequence, the initializer of this global is
+// somewhat performance tuned to ensure benchmarks don't take an excessive
+// amount of time to run or use an excessive amount of memory.
+static absl::NoDestructor<llvm::OwningArrayRef<llvm::StringRef>> raw_str_keys{
+    [] {
+      llvm::OwningArrayRef<llvm::StringRef> keys(MaxNumKeys);
+      absl::BitGen gen;
+
+      std::array length_buckets = {
+          4, 4, 4, 4, 5, 5, 5, 5, 7, 7, 10, 10, 15, 25, 40, 80,
+      };
+      static_assert((MaxNumKeys % length_buckets.size()) == 0);
+      CARBON_CHECK(llvm::is_sorted(length_buckets));
+
+      // For each distinct length bucket, we build a vector of raw keys.
+      std::forward_list<llvm::SmallVector<const char*>> raw_keys_storage;
+      // And a parallel array to the length buckets with the raw keys of that
+      // length.
+      std::array<llvm::SmallVector<const char*>*, length_buckets.size()>
+          raw_keys_buckets;
+
+      llvm::OwningArrayRef<char> characters = MakeChars();
+      llvm::OwningArrayRef<std::array<char, 4>> four_char_strs =
+          MakeFourCharStrs(characters, gen);
+      llvm::OwningArrayRef<char> random_chars = MakeRandomChars(
+          characters, /*max_length=*/length_buckets.back(), gen);
+
+      ssize_t prev_length = -1;
+      for (auto [length_index, length] : llvm::enumerate(length_buckets)) {
+        // We can detect repetitions in length as they are sorted.
+        if (length == prev_length) {
+          raw_keys_buckets[length_index] = raw_keys_buckets[length_index - 1];
+          continue;
+        }
+        prev_length = length;
+
+        // We want to compute all the keys of this length that we'll need.
+        ssize_t key_count = (MaxNumKeys / length_buckets.size()) *
+                            llvm::count(length_buckets, length);
+
+        raw_keys_buckets[length_index] =
+            &raw_keys_storage.emplace_front(MakeRawStrKeys(
+                length, key_count, four_char_strs, random_chars, gen));
+      }
+
+      // Now build the actual key array from our intermediate storage by
+      // round-robin extracting from the length buckets.
+      for (auto [index, key] : llvm::enumerate(keys)) {
+        ssize_t bucket = index % length_buckets.size();
+        ssize_t length = length_buckets[bucket];
+        // We pop a raw key from the list of them associated with this bucket.
+        const char* raw_key = raw_keys_buckets[bucket]->pop_back_val();
+        // And build our key from that.
+        key = llvm::StringRef(raw_key, length);
+      }
+      // Check that in fact we popped every raw key into our main keys.
+      for (const auto& raw_keys : raw_keys_storage) {
+        CARBON_CHECK(raw_keys.empty());
+      }
+      return keys;
+    }()};
+
+static absl::NoDestructor<llvm::OwningArrayRef<int*>> raw_ptr_keys{[] {
+  llvm::OwningArrayRef<int*> keys(MaxNumKeys);
+  for (auto [index, key] : llvm::enumerate(keys)) {
+    // We leak these pointers -- this is a static initializer executed once.
+    key = new int(static_cast<int>(index));
+  }
+  return keys;
+}()};
+
+static absl::NoDestructor<llvm::OwningArrayRef<int>> raw_int_keys{[] {
+  llvm::OwningArrayRef<int> keys(MaxNumKeys);
+  for (auto [index, key] : llvm::enumerate(keys)) {
+    key = index + 1;
+  }
+  return keys;
+}()};
+
+namespace {
+
+// Allow generically dispatching over the specific key types we build and
+// support.
+template <typename T>
+auto GetRawKeys() -> llvm::ArrayRef<T> {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return *raw_str_keys;
+  } else if constexpr (std::is_pointer_v<T>) {
+    return *raw_ptr_keys;
+  } else {
+    return *raw_int_keys;
+  }
+}
+
+template <typename T>
+static absl::NoDestructor<
+    std::map<std::pair<ssize_t, ssize_t>, llvm::OwningArrayRef<T>>>
+    lookup_keys_storage;
+
+// Given a particular table keys size and lookup keys size, provide an array ref
+// to a shuffled set of lookup keys.
+//
+// Because different table sizes pull from different sub-ranges of our raw keys,
+// we need to compute a distinct set of random keys in the table to use for
+// lookups depending on the table size. And we also want to have an even
+// distribution of key *sizes* throughout the lookup keys, and so we can't
+// compute a single lookup keys array of the maximum size. Instead we need to
+// compute a distinct special set of lookup keys for each pair of table and
+// lookup size, and then shuffle that specific set into a random sequence that
+// is returned. This function memoizes this sequence for each pair of sizes.
+template <typename T>
+auto GetShuffledLookupKeys(ssize_t table_keys_size, ssize_t lookup_keys_size)
+    -> llvm::ArrayRef<T> {
+  // The raw keys aren't shuffled and round-robin through the sizes. We want to
+  // keep the total size of lookup keys used exactly the same across runs. So
+  // for a given size we always take the leading sequence from the raw keys for
+  // that size, duplicating as needed to get the desired lookup sequence size,
+  // and then shuffle the keys in that sequence to end up with a random sequence
+  // of keys. We store each of these shuffled sequences in a map to avoid
+  // repeatedly computing them.
+  llvm::OwningArrayRef<T>& lookup_keys =
+      (*lookup_keys_storage<T>)[{table_keys_size, lookup_keys_size}];
+  if (lookup_keys.empty()) {
+    lookup_keys = llvm::OwningArrayRef<T>(lookup_keys_size);
+    auto raw_keys = GetRawKeys<T>();
+    for (auto [index, key] : llvm::enumerate(lookup_keys)) {
+      key = raw_keys[index % table_keys_size];
+    }
+    absl::BitGen gen;
+    Shuffle(lookup_keys, gen);
+  }
+  CARBON_CHECK(static_cast<ssize_t>(lookup_keys.size()) == lookup_keys_size);
+
+  return lookup_keys;
+}
+
+}  // namespace
+
+template <typename T>
+auto GetKeysAndMissKeys(ssize_t table_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>> {
+  CARBON_CHECK(table_keys_size <= MaxNumKeys);
+  // The raw keys aren't shuffled and round-robin through the sizes. Take the
+  // tail of this sequence and shuffle it to form a random set of miss keys with
+  // a consistent total size.
+  static absl::NoDestructor<llvm::OwningArrayRef<T>> miss_keys{[] {
+    llvm::OwningArrayRef<T> keys;
+    keys = GetRawKeys<T>().take_back(NumOtherKeys);
+    CARBON_CHECK(keys.size() == NumOtherKeys);
+    absl::BitGen gen;
+    Shuffle(keys, gen);
+    return keys;
+  }()};
+
+  return {GetRawKeys<T>().slice(0, table_keys_size), *miss_keys};
+}
+template auto GetKeysAndMissKeys<int>(ssize_t size)
+    -> std::pair<llvm::ArrayRef<int>, llvm::ArrayRef<int>>;
+template auto GetKeysAndMissKeys<int*>(ssize_t size)
+    -> std::pair<llvm::ArrayRef<int*>, llvm::ArrayRef<int*>>;
+template auto GetKeysAndMissKeys<llvm::StringRef>(ssize_t size)
+    -> std::pair<llvm::ArrayRef<llvm::StringRef>,
+                 llvm::ArrayRef<llvm::StringRef>>;
+
+template <typename T>
+auto GetKeysAndHitKeys(ssize_t table_keys_size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>> {
+  CARBON_CHECK(table_keys_size <= MaxNumKeys);
+  CARBON_CHECK(lookup_keys_size <= MaxNumKeys);
+  return {GetRawKeys<T>().slice(0, table_keys_size),
+          GetShuffledLookupKeys<T>(table_keys_size, lookup_keys_size)};
+}
+template auto GetKeysAndHitKeys<int>(ssize_t size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<int>, llvm::ArrayRef<int>>;
+template auto GetKeysAndHitKeys<int*>(ssize_t size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<int*>, llvm::ArrayRef<int*>>;
+template auto GetKeysAndHitKeys<llvm::StringRef>(ssize_t size,
+                                                 ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<llvm::StringRef>,
+                 llvm::ArrayRef<llvm::StringRef>>;
+
+template <typename T>
+auto DumpHashStatistics(llvm::ArrayRef<T> keys) -> void {
+  if (keys.size() < GroupSize) {
+    return;
+  }
+
+  // The hash table load factor is 7/8ths, so we want to add 1/7th of our
+  // current size, subtract one, and pick the next power of two to get the power
+  // of two where 7/8ths is greater than or equal to the incoming key size.
+  ssize_t expected_size =
+      llvm::NextPowerOf2(keys.size() + (keys.size() / 7) - 1);
+
+  constexpr int GroupShift = llvm::CTLog2<GroupSize>();
+
+  size_t mask = ComputeProbeMaskFromSize(expected_size);
+  uint64_t salt = ComputeSeed();
+  auto get_hash_index = [mask, salt](auto x) -> ssize_t {
+    auto [hash_index, _] = HashValue(x, salt).template ExtractIndexAndTag<7>();
+    return (hash_index & mask) >> GroupShift;
+  };
+
+  std::vector<std::vector<int>> grouped_key_indices(expected_size >>
+                                                    GroupShift);
+  for (auto [i, k] : llvm::enumerate(keys)) {
+    ssize_t hash_index = get_hash_index(k);
+    CARBON_CHECK(hash_index < (expected_size >> GroupShift)) << hash_index;
+    grouped_key_indices[hash_index].push_back(i);
+  }
+  ssize_t max_group_index =
+      std::max_element(grouped_key_indices.begin(), grouped_key_indices.end(),
+                       [](const auto& lhs, const auto& rhs) {
+                         return lhs.size() < rhs.size();
+                       }) -
+      grouped_key_indices.begin();
+
+  // If the max number of collisions on the index is less than or equal to the
+  // group size, there shouldn't be any necessary probing (outside of deletion)
+  // and so this isn't interesting; skip printing.
+  if (grouped_key_indices[max_group_index].size() <= GroupSize) {
+    return;
+  }
+
+  llvm::errs() << "keys: " << keys.size()
+               << "  groups: " << grouped_key_indices.size() << "\n"
+               << "max group index: " << llvm::formatv("{0x8}", max_group_index)
+               << "  collisions: "
+               << grouped_key_indices[max_group_index].size() << "\n";
+
+  for (auto i : llvm::ArrayRef(grouped_key_indices[max_group_index])
+                    .take_front(2 * GroupSize)) {
+    auto k = keys[i];
+    auto hash = static_cast<uint64_t>(HashValue(k, salt));
+    llvm::errs() << "  key: " << k
+                 << "  salt: " << llvm::formatv("{0:x16}", salt)
+                 << "  hash: " << llvm::formatv("{0:x16}", hash) << "\n";
+  }
+}
+template auto DumpHashStatistics(llvm::ArrayRef<int> keys) -> void;
+template auto DumpHashStatistics(llvm::ArrayRef<int*> keys) -> void;
+template auto DumpHashStatistics(llvm::ArrayRef<llvm::StringRef> keys) -> void;
+
+}  // namespace Carbon::RawHashtable

+ 208 - 0
common/raw_hashtable_benchmark_helpers.h

@@ -0,0 +1,208 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_
+#define CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_
+
+#include <benchmark/benchmark.h>
+#include <sys/types.h>
+
+#include <limits>
+#include <map>
+#include <vector>
+
+#include "absl/base/no_destructor.h"
+#include "absl/random/random.h"
+#include "common/check.h"
+#include "common/hashing.h"
+#include "common/raw_hashtable.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace Carbon::RawHashtable {
+
+// We want to support benchmarking with 16M keys plus up to 256 "other" keys
+// (for misses). The large number of keys helps check for performance hiccups
+// with especially large tables and when missing all levels of cache.
+inline constexpr ssize_t NumOtherKeys = 1 << 8;
+inline constexpr ssize_t MaxNumKeys = (1 << 24) + NumOtherKeys;
+
+// Get an array of main keys with the given `size`, which must be less than
+// 2^24. Also get a miss keys array of `NumOtherKeys` which has no collisions
+// with the main keys.
+//
+// For a given size, this will return the same arrays. This uses unsynchronized
+// global state, and so is thread hostile and must not be called before main.
+template <typename T>
+auto GetKeysAndMissKeys(ssize_t table_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>>;
+
+// Get an array of main keys with the given `size`, which must be less than
+// 2^24. Also get a hit keys array of `lookup_keys_size`, all of which occur
+// in the main keys array. If the lookup size is larger than the main size, the
+// lookup sequence will contain duplicates.
+//
+// For a given size, this will return the same arrays. This uses unsynchronized
+// global state, and so is thread hostile and must not be called before main.
+template <typename T>
+auto GetKeysAndHitKeys(ssize_t table_keys_size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>>;
+
+// Dump statistics about hashing the given keys.
+template <typename T>
+auto DumpHashStatistics(llvm::ArrayRef<T> keys) -> void;
+
+// Convert values used in hashtable benchmarking to a bool. This is used to form
+// dependencies between values stored in the hashtable between benchmark
+// iterations.
+template <typename T>
+auto ValueToBool(T value) -> bool {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return value.size() > 0;
+  } else if constexpr (std::is_pointer_v<T>) {
+    return value != nullptr;
+  } else {
+    // We want our keys to include `0` for integers, so use the largest value.
+    return value != std::numeric_limits<T>::max();
+  }
+}
+
+inline auto SizeArgs(benchmark::internal::Benchmark* b) -> void {
+  // Benchmarks for "miss" operations only have one parameter -- the size of the
+  // table. These benchmarks use a fixed `NumOtherKeys` set of extra keys for
+  // each miss operation.
+  b->DenseRange(1, 4, 1);
+  b->Arg(8);
+  b->Arg(16);
+  b->Arg(32);
+
+  // For sizes >= 64 we first use the power of two which will have a low load
+  // factor, and then target exactly at our max load factor.
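+  // For example, 256 is benchmarked both as-is and as 256 - 256 / 8 = 224,
+  // which sits exactly at the 7/8ths load factor threshold.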
+  auto large_sizes = {64, 1 << 8, 1 << 12, 1 << 16, 1 << 20, 1 << 24};
+  for (auto s : large_sizes) {
+    b->Arg(s);
+  }
+  for (auto s : large_sizes) {
+    b->Arg(s - (s / 8));
+  }
+}
+
+inline auto HitArgs(benchmark::internal::Benchmark* b) -> void {
+  // There are two parameters for benchmarks of "hit" operations. The first is
+  // the size of the hashtable itself. The second is the size of a buffer of
+  // random keys actually in the hashtable to use for the operations.
+  //
+  // For small sizes, we use a fixed `NumOtherKeys` lookup key count. This is
+  // enough to avoid patterns of queries training the branch predictor just from
+  // the keys themselves, while small enough to avoid significant L1 cache
+  // pressure.
+  b->ArgsProduct({benchmark::CreateDenseRange(1, 4, 1), {NumOtherKeys}});
+  b->Args({8, NumOtherKeys});
+  b->Args({16, NumOtherKeys});
+  b->Args({32, NumOtherKeys});
+
+  // For sizes >= 64 we first use the power of two which will have a low load
+  // factor, and then target exactly at our max load factor. Start the sizes
+  // list off with the powers of two, and the append a version of each power of
+  // two adjusted down to the load factor. We'll then build the benchmarks from
+  // these below.
+  std::vector<ssize_t> large_sizes = {64,      1 << 8,  1 << 12,
+                                      1 << 16, 1 << 20, 1 << 24};
+  for (auto i : llvm::seq<int>(0, large_sizes.size())) {
+    ssize_t s = large_sizes[i];
+    large_sizes.push_back(s - (s / 8));
+  }
+
+  for (auto s : large_sizes) {
+    b->Args({s, NumOtherKeys});
+
+    // Once the sizes are more than 4x the `NumOtherKeys` minimum lookup buffer
+    // size, also include 25% and 50% lookup buffer sizes which will
+    // increasingly exhaust the ability to keep matching entries in the cache.
+    if (s >= NumOtherKeys) {
+      b->Args({s, s / 4});
+      b->Args({s, s / 2});
+    }
+  }
+}
+
+// Provide some viable Dense{Map,Set}Info implementations for the key types
+// using Carbon's hashing framework. These let us benchmark the data structure
+// alone rather than the combination of data structure and hashing routine.
+//
+// We only provide these for benchmarking -- they are *not* necessarily suitable
+// for broader use. The Carbon hashing infrastructure has only been evaluated in
+// the context of its specific hashtable design.
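+//
+// As an illustrative usage sketch, a benchmark might instantiate
+// `llvm::DenseMap<int, int, CarbonHashDI<int>>` or
+// `llvm::DenseSet<int*, CarbonHashDI<int*>>` to compare against the Carbon
+// containers while holding the hash function constant.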
+template <typename T>
+struct CarbonHashDI;
+
+template <>
+struct CarbonHashDI<int> {
+  static auto getEmptyKey() -> int { return -1; }
+  static auto getTombstoneKey() -> int { return -2; }
+  static auto getHashValue(const int val) -> unsigned {
+    return static_cast<uint64_t>(HashValue(val));
+  }
+  static auto isEqual(const int lhs, const int rhs) -> bool {
+    return lhs == rhs;
+  }
+};
+
+template <typename T>
+struct CarbonHashDI<T*> {
+  static constexpr uintptr_t Log2MaxAlign = 12;
+
+  static auto getEmptyKey() -> T* {
+    auto val = static_cast<uintptr_t>(-1);
+    val <<= Log2MaxAlign;
+    // NOLINTNEXTLINE(performance-no-int-to-ptr): This is required by the API.
+    return reinterpret_cast<T*>(val);
+  }
+
+  static auto getTombstoneKey() -> T* {
+    auto val = static_cast<uintptr_t>(-2);
+    val <<= Log2MaxAlign;
+    // NOLINTNEXTLINE(performance-no-int-to-ptr): This is required by the API.
+    return reinterpret_cast<T*>(val);
+  }
+
+  static auto getHashValue(const T* ptr_val) -> unsigned {
+    return static_cast<uint64_t>(HashValue(ptr_val));
+  }
+
+  static auto isEqual(const T* lhs, const T* rhs) -> bool { return lhs == rhs; }
+};
+
+template <>
+struct CarbonHashDI<llvm::StringRef> {
+  static auto getEmptyKey() -> llvm::StringRef {
+    return llvm::StringRef(
+        // NOLINTNEXTLINE(performance-no-int-to-ptr): Required by the API.
+        reinterpret_cast<const char*>(~static_cast<uintptr_t>(0)), 0);
+  }
+
+  static auto getTombstoneKey() -> llvm::StringRef {
+    return llvm::StringRef(
+        // NOLINTNEXTLINE(performance-no-int-to-ptr): Required by the API.
+        reinterpret_cast<const char*>(~static_cast<uintptr_t>(1)), 0);
+  }
+  static auto getHashValue(llvm::StringRef val) -> unsigned {
+    return static_cast<uint64_t>(HashValue(val));
+  }
+  static auto isEqual(llvm::StringRef lhs, llvm::StringRef rhs) -> bool {
+    if (rhs.data() == getEmptyKey().data()) {
+      return lhs.data() == getEmptyKey().data();
+    }
+    if (rhs.data() == getTombstoneKey().data()) {
+      return lhs.data() == getTombstoneKey().data();
+    }
+    return lhs == rhs;
+  }
+};
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_

+ 20 - 0
common/raw_hashtable_metadata_group.cpp

@@ -0,0 +1,20 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/raw_hashtable_metadata_group.h"
+
+#include "llvm/ADT/StringExtras.h"
+
+namespace Carbon::RawHashtable {
+
+auto MetadataGroup::Print(llvm::raw_ostream& out) const -> void {
+  out << "[";
+  llvm::ListSeparator sep;
+  for (uint8_t byte : metadata_bytes) {
+    out << sep << llvm::formatv("{0:x2}", byte);
+  }
+  out << "]";
+}
+
+}  // namespace Carbon::RawHashtable

+ 1093 - 0
common/raw_hashtable_metadata_group.h

@@ -0,0 +1,1093 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
+#define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
+
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+
+#include "common/check.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
+
+// Detect whether we can use SIMD accelerated implementations of the control
+// groups, and include the relevant platform specific APIs for the SIMD
+// implementations.
+//
+// Reference documentation for the SIMD APIs used here:
+// - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
+// - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
+#if defined(__SSSE3__)
+#include <x86intrin.h>
+#define CARBON_X86_SIMD_SUPPORT 1
+#elif defined(__ARM_NEON)
+#include <arm_neon.h>
+#define CARBON_NEON_SIMD_SUPPORT 1
+#endif
+
+// This namespace collects low-level utilities for implementing hashtable
+// data structures. This file only provides one of them:
+//
+// - Primitives to manage "groups" of hashtable entries that have densely packed
+//   control bytes we can scan rapidly as a group, often using SIMD facilities
+//   to process the entire group at once.
+namespace Carbon::RawHashtable {
+
+// We define a constant max group size. The particular group size used in
+// practice may vary, but we want to have some upper bound used to ensure
+// memory allocation is done consistently across different architectures.
+constexpr ssize_t MaxGroupSize = 16;
+
+// This takes a collection of bits representing the results of looking for a
+// particular tag in this metadata group and determines the first position with
+// a match. The position is represented by either the least significant set bit
+// or the least significant non-zero byte, depending on `ByteEncoding`. When
+// represented with a non-zero byte, that byte must have at least its most
+// significant bit set, but may have other bits set to any value. Bits more
+// significant than the match may have any value provided there is at least one
+// match. Zero matches must be represented by a zero input.
+//
+// Some bits of the underlying value may be known-zero, which can optimize
+// various operations. These can be represented as a `ZeroMask`.
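+//
+// As an illustrative example: with the bit encoding, match bits of 0b0100
+// yield an index of 2; with the byte encoding, match bits of
+// 0x0000'0000'0080'0000 (the third-lowest byte has its high bit set) also
+// yield an index of 2.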
+template <typename BitsInputT, bool ByteEncoding, BitsInputT ZeroMask = 0>
+class BitIndex
+    : public Printable<BitIndex<BitsInputT, ByteEncoding, ZeroMask>> {
+ public:
+  using BitsT = BitsInputT;
+
+  BitIndex() = default;
+  explicit BitIndex(BitsT bits) : bits_(bits) {}
+
+  friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool {
+    if (lhs.empty() || rhs.empty()) {
+      return lhs.empty() == rhs.empty();
+    }
+    // For non-empty bit indices, compare the indices directly to ignore other
+    // (extraneous) parts of the incoming bits.
+    return lhs.index() == rhs.index();
+  }
+
+  auto Print(llvm::raw_ostream& out) const -> void {
+    out << llvm::formatv("{0:x}", bits_);
+  }
+
+  explicit operator bool() const { return !empty(); }
+
+  // Returns true when there are no matches for the tag.
+  auto empty() const -> bool {
+    CARBON_DCHECK((bits_ & ZeroMask) == 0) << "Unexpected non-zero bits!";
+    __builtin_assume((bits_ & ZeroMask) == 0);
+    return bits_ == 0;
+  }
+
+  // Returns the index of the first matched tag.
+  auto index() -> ssize_t {
+    CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
+    __builtin_assume(bits_ != 0);
+    ssize_t index = unscaled_index();
+
+    if constexpr (ByteEncoding) {
+      // Shift to scale out of the byte encoding.
+      index >>= ByteEncodingShift;
+    }
+
+    return index;
+  }
+
+  // Optimized tool to index a pointer `p` by `index()`.
+  template <typename T>
+  auto index_ptr(T* pointer) -> T* {
+    CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
+    __builtin_assume(bits_ != 0);
+    if constexpr (!ByteEncoding) {
+      return &pointer[unscaled_index()];
+    }
+
+    ssize_t index = unscaled_index();
+
+    // Scale the index as we counted zero *bits* and not zero *bytes*.
+    // However, we can fold that scale with the size of `T` when it is a power
+    // of two or divisible by 8.
+    CARBON_DCHECK(
+        (index & ((static_cast<size_t>(1) << ByteEncodingShift) - 1)) == 0);
+    if constexpr (sizeof(T) % 8 == 0) {
+      constexpr size_t FoldedScale = sizeof(T) / 8;
+      index *= FoldedScale;
+      return reinterpret_cast<T*>(
+          &reinterpret_cast<std::byte*>(pointer)[index]);
+    } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) {
+      constexpr size_t ScaleShift = llvm::CTLog2<sizeof(T)>();
+      static_assert(ScaleShift <= ByteEncodingShift,
+                    "Scaling by >=8 should be handled above!");
+      constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift;
+      index >>= FoldedShift;
+      return reinterpret_cast<T*>(
+          &reinterpret_cast<std::byte*>(pointer)[index]);
+    }
+
+    // Nothing we can fold here.
+    return &pointer[index >> ByteEncodingShift];
+  }
+
+ private:
+  // When using a byte encoding, we'll need to shift any index by this amount.
+  static constexpr size_t ByteEncodingShift = 3;
+
+  auto unscaled_index() -> ssize_t {
+    if constexpr (!ByteEncoding) {
+      // Note the cast to `size_t` to force zero extending the result.
+      return static_cast<size_t>(llvm::countr_zero(bits_));
+    } else {
+      // The index is encoded in the high bit of each byte. We compute the index
+      // by counting the number of low zero bytes there are before the first
+      // byte with its high bit set. Rather than shifting the high bit to be the
+      // low bit and counting the trailing (least significant) zero bits
+      // directly, we instead byte-reverse the bits and count the *leading*
+      // (most significant) zero bits. While this may be a wash on CPUs with
+      // direct support for counting the trailing zero bits, AArch64 only
+      // supports counting the leading zero bits and requires a bit-reverse to
+      // count the trailing zero bits. Doing the byte-reverse approach
+      // essentially combines moving the high bit into the low bit and the
+      // reverse necessary for counting the zero bits. While this only removes
+      // one instruction, it is an instruction in the critical path of the
+      // hottest part of table lookup, and that critical path dependency height
+      // is few enough instructions that removing even one significantly impacts
+      // latency.
+      //
+      // We also cast to `size_t` to clearly zero-extend the result.
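+      //
+      // Illustrative example: match bits of 0x0000'0000'0080'0000 byte-swap
+      // to 0x0000'8000'0000'0000, which has 16 leading zero bits; `index()`
+      // then shifts right by `ByteEncodingShift` (3) to recover byte index 2.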
+      return static_cast<size_t>(llvm::countl_zero(llvm::byteswap(bits_)));
+    }
+  }
+
+  BitsT bits_ = 0;
+};
+
+// This is like `BitIndex`, but allows iterating through all of the matches.
+//
+// A key requirement for efficient iteration is that all of the matches are
+// represented with a single bit and there are no other bits set. For example,
+// with byte-encoded bit indices, exactly the high bit and no other bit of each
+// matching byte must be set. This is a stricter constraint than what `BitIndex`
+// alone would impose on any one of the matches.
+template <typename BitIndexT>
+class BitIndexRange : public Printable<BitIndexRange<BitIndexT>> {
+ public:
+  using BitsT = BitIndexT::BitsT;
+
+  class Iterator
+      : public llvm::iterator_facade_base<Iterator, std::forward_iterator_tag,
+                                          ssize_t, ssize_t> {
+   public:
+    Iterator() = default;
+    explicit Iterator(BitsT bits) : bits_(bits) {}
+
+    auto operator==(const Iterator& rhs) const -> bool {
+      return bits_ == rhs.bits_;
+    }
+
+    auto operator*() -> ssize_t& {
+      CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
+      __builtin_assume(bits_ != 0);
+      index_ = BitIndexT(bits_).index();
+      // Note that we store the index in a member so we can return a reference
+      // to it here as required to be a forward iterator.
+      return index_;
+    }
+
+    template <typename T>
+    auto index_ptr(T* pointer) -> T* {
+      return BitIndexT(bits_).index_ptr(pointer);
+    }
+
+    auto operator++() -> Iterator& {
+      CARBON_DCHECK(bits_ != 0) << "Must not increment past the end!";
+      __builtin_assume(bits_ != 0);
+      // Clears the least significant set bit, effectively stepping to the next
+      // match.
+      bits_ &= (bits_ - 1);
+      return *this;
+    }
+
+   private:
+    ssize_t index_;
+    BitsT bits_ = 0;
+  };
+
+  BitIndexRange() = default;
+  explicit BitIndexRange(BitsT bits) : bits_(bits) {}
+
+  explicit operator bool() const { return !empty(); }
+  auto empty() const -> bool { return BitIndexT(bits_).empty(); }
+
+  auto begin() const -> Iterator { return Iterator(bits_); }
+  auto end() const -> Iterator { return Iterator(); }
+
+  friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool {
+    return lhs.bits_ == rhs.bits_;
+  }
+
+  auto Print(llvm::raw_ostream& out) const -> void {
+    out << llvm::formatv("{0:x}", bits_);
+  }
+
+  explicit operator BitsT() const { return bits_; }
+  explicit operator BitIndexT() const { return BitIndexT(bits_); }
+
+ private:
+  BitsT bits_ = 0;
+};
+
+// A group of metadata bytes that can be manipulated together.
+//
+// The metadata bytes used in Carbon's hashtable implementation are designed to
+// support being manipulated as groups, either using architecture-specific SIMD
+// code sequences or using portable SIMD-in-an-integer-register code sequences.
+// These operations are unusually performance sensitive, sometimes in
+// surprising ways.
+// optimize the particular usages in Carbon's hashtable and should not be
+// expected to be reusable in any other context.
+//
+// Throughout the functions operating on this type we try to use patterns with a
+// fallback portable implementation which can be directly used in the absence of
+// a SIMD implementation, but is also used (with the same code) to check that
+// any SIMD implementation produces the same result as the portable one. These
+// patterns help minimize un-compiled or un-tested paths through either portable
+// or SIMD code, regardless of which path is actually *used* on a particular
+// platform. To illustrate a common version of this pattern, we might have code
+// like:
+//
+// ```cpp
+// auto MetadataGroup::Operation(...) -> ... {
+//   ... portable_result;
+//   ... simd_result;
+//   if constexpr (!UseSIMD || DebugSIMD) {
+//     portable_result = PortableOperation(...);
+//   }
+//   if constexpr (UseSIMD || DebugSIMD) {
+//     simd_result = SIMDOperation(...);
+//     CARBON_DCHECK(simd_result == portable_result) << ...;
+//   }
+//   return UseSIMD ? simd_result : portable_result;
+// }
+// ```
+class MetadataGroup : public Printable<MetadataGroup> {
+ public:
+  static constexpr ssize_t Size =
+#if CARBON_X86_SIMD_SUPPORT
+      16;
+#else
+      8;
+#endif
+  static_assert(Size >= 8);
+  static_assert(Size % 8 == 0);
+  static_assert(Size <= MaxGroupSize);
+  static_assert(MaxGroupSize % Size == 0);
+  static_assert(llvm::isPowerOf2_64(Size),
+                "The group size must be a constant power of two so dividing by "
+                "it is a simple shift.");
+  static constexpr ssize_t Mask = Size - 1;
+
+  // Each control byte can have special values. All special values have the
+  // most significant bit cleared to distinguish them from the seven hash bits
+  // stored when the control byte represents a full bucket.
+  //
+  // Otherwise, their values are chosen primarily to provide efficient SIMD
+  // implementations of the common operations on an entire control group.
+  static constexpr uint8_t Empty = 0;
+  static constexpr uint8_t Deleted = 1;
+
+  static constexpr uint8_t PresentMask = 0b1000'0000;
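+  // A present byte therefore has the form `0b1ttt'tttt`, where the `t` bits
+  // are the 7 tag bits taken from the key's hash.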
+
+  // Some architectures make it much more efficient to build the match indices
+  // in a byte-encoded form rather than a bit-encoded form. This encoding
+  // changes verification and other aspects of our algorithms.
+  static constexpr bool ByteEncoding =
+#if CARBON_X86_SIMD_SUPPORT
+      false;
+#else
+      true;
+#endif
+  static_assert(!ByteEncoding || Size == 8,
+                "We can only support byte encoding with a group size of 8.");
+
+  // We need to indicate to users of the metadata group when they can
+  // efficiently hold a group value in a "register" (local variable) across
+  // clearing of individual bytes in the group. If the entire group can fit in
+  // an integer
+  // register, this works well and clients of the group should work to use the
+  // already-loaded value when clearing bytes. But when we have a larger group
+  // size, clearing the byte will typically require storing a byte to memory and
+  // re-loading the group. The usage patterns that need to clear bytes can in
+  // those cases avoid clearing a loaded group, and clear the byte directly in
+  // the larger metadata array.
+  static constexpr bool FastByteClear = Size == 8;
+
+  using MatchIndex =
+      BitIndex<std::conditional_t<ByteEncoding, uint64_t, uint32_t>,
+               ByteEncoding,
+               /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>;
+  using MatchRange = BitIndexRange<MatchIndex>;
+
+  union {
+    uint8_t metadata_bytes[Size];
+    uint64_t metadata_ints[Size / 8];
+#if CARBON_NEON_SIMD_SUPPORT
+    uint8x8_t metadata_vec = {};
+    static_assert(sizeof(metadata_vec) == Size);
+#elif CARBON_X86_SIMD_SUPPORT
+    __m128i metadata_vec = {};
+    static_assert(sizeof(metadata_vec) == Size);
+#endif
+  };
+
+  auto Print(llvm::raw_ostream& out) const -> void;
+
+  friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool {
+    return CompareEqual(lhs, rhs);
+  }
+
+  // The main API for this class. This API will switch between a portable and
+  // SIMD implementation based on what is most efficient, but in debug builds
+  // will cross check that the implementations do not diverge.
+
+  // Load and return a group of metadata bytes out of the main metadata array at
+  // a particular `index`. The index must be a multiple of `GroupSize`. This
+  // will arrange for the load to place the group into the correct structure for
+  // efficient register-based processing.
+  static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
+
+  // Store this metadata group into the main metadata array at the provided
+  // `index`. The index must be a multiple of `GroupSize`.
+  auto Store(uint8_t* metadata, ssize_t index) const -> void;
+
+  // Clear a byte of this group's metadata at the provided `byte_index` to the
+  // empty value. Note that this must only be called when `FastByteClear` is
+  // true -- in all other cases users of this class should arrange to clear
+  // individual bytes in the underlying array rather than using the group API.
+  auto ClearByte(ssize_t byte_index) -> void;
+
+  // Clear all of this group's metadata bytes that indicate a deleted slot to
+  // the empty value.
+  auto ClearDeleted() -> void;
+
+  // Find all of the bytes of metadata in this group that are present and whose
+  // low 7 bits match the provided `tag`. The `tag` byte must have a clear high
+  // bit, only 7 bits of tag are used. Note that this means the provided tag is
+  // *not* the actual present metadata byte -- this function is responsible for
+  // mapping the tag into that form as it can do so more efficiently in some
+  // cases. A range over all of the byte indices which matched is returned.
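+  //
+  // A hypothetical sketch of how a caller might consume the returned range in
+  // a probe loop (assuming an `entries` array stored in parallel with the
+  // metadata; the names here are illustrative rather than part of this API):
+  //
+  // ```cpp
+  // MetadataGroup g = MetadataGroup::Load(metadata, group_index);
+  // for (ssize_t byte_index : g.Match(tag)) {
+  //   if (key_eq(entries[group_index + byte_index].key, lookup_key)) {
+  //     // Found the matching entry.
+  //   }
+  // }
+  // ```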
+  auto Match(uint8_t tag) const -> MatchRange;
+
+  // Find all of the present bytes of metadata in this group. A range over all
+  // of the byte indices which are present is returned.
+  auto MatchPresent() const -> MatchRange;
+
+  // Find the first byte of the metadata group that is empty and return that
+  // index. There is no required order or position for which of the bytes of
+  // metadata is considered "first"; any model that makes it efficient to
+  // produce the matching index will do. Must return an empty match index if no
+  // bytes match the empty metadata.
+  auto MatchEmpty() const -> MatchIndex;
+
+  // Find the first byte of the metadata group that is deleted and return that
+  // index. There is no required order or position for which of the bytes of
+  // metadata is considered "first"; any model that makes it efficient to
+  // produce the matching index will do. Must return an empty match index if no
+  // bytes match the deleted metadata.
+  auto MatchDeleted() const -> MatchIndex;
+
+ private:
+  // Two classes only defined in the benchmark code are allowed to directly call
+  // the portable and SIMD implementations for benchmarking purposes.
+  friend class BenchmarkPortableMetadataGroup;
+  friend class BenchmarkSIMDMetadataGroup;
+
+  // Whether to use a SIMD implementation. Even when we *support* a SIMD
+  // implementation, we do not always have to use it in the event that it is
+  // less efficient than the portable version.
+  static constexpr bool UseSIMD =
+#if CARBON_X86_SIMD_SUPPORT
+      true;
+#else
+      false;
+#endif
+
+  // All SIMD variants that we have an implementation for should be enabled for
+  // debugging. This lets us maintain a SIMD implementation even if it is not
+  // used due to performance reasons, and easily re-enable it if the performance
+  // changes.
+  static constexpr bool DebugSIMD =
+#if !defined(NDEBUG) && (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT)
+      true;
+#else
+      false;
+#endif
+
+  // Masks with the most and least significant bits of every byte set.
+  static constexpr uint64_t MSBs = 0x8080'8080'8080'8080ULL;
+  static constexpr uint64_t LSBs = 0x0101'0101'0101'0101ULL;
+
+  using MatchBitsT = MatchIndex::BitsT;
+
+  static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;
+
+  // Functions for validating that the returned matches agree with what is
+  // predicted by the `byte_match` function. These either `CHECK`-fail or
+  // return true. To pass validation, the `*_bits` argument must have `0x80`
+  // for those bytes where `byte_match` returns true, and `0` for the rest.
+
+  // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only
+  // promise to return accurate information up to the first match.
+  auto VerifyIndexBits(
+      MatchBitsT index_bits,
+      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;
+  // `VerifyRangeBits` is for functions that return `MatchRange`, and so it
+  // validates all the bytes of `range_bits`.
+  auto VerifyRangeBits(
+      MatchBitsT range_bits,
+      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;
+
+  // Portable implementations of each operation. These are used on platforms
+  // without SIMD support or where the portable implementation is faster than
+  // SIMD. They are heavily optimized even though they are not SIMD because we
+  // expect there to be platforms where the portable implementation can
+  // outperform SIMD. Their behavior and semantics exactly match the
+  // documentation for the un-prefixed functions.
+  //
+  // In debug builds, these also directly verify their results to help establish
+  // baseline functionality.
+  static auto PortableLoad(const uint8_t* metadata, ssize_t index)
+      -> MetadataGroup;
+  auto PortableStore(uint8_t* metadata, ssize_t index) const -> void;
+
+  auto PortableClearDeleted() -> void;
+
+  auto PortableMatch(uint8_t tag) const -> MatchRange;
+  auto PortableMatchPresent() const -> MatchRange;
+
+  auto PortableMatchEmpty() const -> MatchIndex;
+  auto PortableMatchDeleted() const -> MatchIndex;
+
+  static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs)
+      -> bool;
+
+  // SIMD implementations of each operation. We minimize platform-specific APIs
+  // to reduce the scope of errors that can only be discovered building on one
+  // platform, so the bodies of these contain the platform-specific code. Their
+  // behavior and semantics exactly match the documentation for the un-prefixed
+  // functions.
+  //
+  // These routines don't directly verify their results as we can build simpler
+  // debug checks by comparing them against the verified portable results.
+  static auto SIMDLoad(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
+  auto SIMDStore(uint8_t* metadata, ssize_t index) const -> void;
+
+  auto SIMDClearDeleted() -> void;
+
+  auto SIMDMatch(uint8_t tag) const -> MatchRange;
+  auto SIMDMatchPresent() const -> MatchRange;
+
+  auto SIMDMatchEmpty() const -> MatchIndex;
+  auto SIMDMatchDeleted() const -> MatchIndex;
+
+  static auto SIMDCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;
+
+#if CARBON_X86_SIMD_SUPPORT
+  // A common routine for x86 SIMD matching that can be used for matching
+  // present, empty, and deleted bytes with equal efficiency.
+  auto X86SIMDMatch(uint8_t match_byte) const -> MatchRange;
+#endif
+};
+
+// Promote the size and mask to top-level constants as we'll need to operate on
+// the grouped structure outside of the metadata bytes.
+inline constexpr ssize_t GroupSize = MetadataGroup::Size;
+inline constexpr ssize_t GroupMask = MetadataGroup::Mask;
+
+inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index)
+    -> MetadataGroup {
+  MetadataGroup portable_g;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_g = PortableLoad(metadata, index);
+    if constexpr (!UseSIMD) {
+      return portable_g;
+    }
+  }
+  MetadataGroup g = SIMDLoad(metadata, index);
+  CARBON_DCHECK(g == portable_g);
+  return g;
+}
+
+inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const
+    -> void {
+  if constexpr (!UseSIMD) {
+    std::memcpy(metadata + index, &metadata_bytes, Size);
+  } else {
+    SIMDStore(metadata, index);
+  }
+  CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size));
+}
+
+inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void {
+  CARBON_DCHECK(FastByteClear) << "Only use byte clearing when fast!";
+  CARBON_DCHECK(Size == 8)
+      << "The clear implementation assumes an 8-byte group.";
+
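+  // For example, `byte_index` 2 computes the mask `~0x0000'0000'00ff'0000` and
+  // so clears bits 16 through 23, i.e. the third byte of the group.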
+  metadata_ints[0] &= ~(static_cast<uint64_t>(0xff) << (byte_index * 8));
+}
+
+inline auto MetadataGroup::ClearDeleted() -> void {
+  MetadataGroup portable_g = *this;
+  MetadataGroup simd_g = *this;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_g.PortableClearDeleted();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_g.SIMDClearDeleted();
+    CARBON_DCHECK(simd_g == portable_g)
+        << "SIMD cleared group '" << simd_g
+        << "' doesn't match portable cleared group '" << portable_g << "'";
+  }
+  *this = UseSIMD ? simd_g : portable_g;
+}
+
+inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange {
+  // The caller should provide the 7-bit tag derived from the hash and must not
+  // set the present bit on it; this layer manages tagging the high bit of a
+  // present byte.
+  CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag);
+
+  MatchRange portable_result;
+  MatchRange simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatch(tag);
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatch(tag);
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::MatchPresent() const -> MatchRange {
+  MatchRange portable_result;
+  MatchRange simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatchPresent();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatchPresent();
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::MatchEmpty() const -> MatchIndex {
+  MatchIndex portable_result;
+  MatchIndex simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatchEmpty();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatchEmpty();
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::MatchDeleted() const -> MatchIndex {
+  MatchIndex portable_result;
+  MatchIndex simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatchDeleted();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatchDeleted();
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs)
+    -> bool {
+  bool portable_result;
+  bool simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableCompareEqual(lhs, rhs);
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDCompareEqual(lhs, rhs);
+    CARBON_DCHECK(simd_result == portable_result);
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::VerifyIndexBits(
+    MatchBitsT index_bits,
+    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
+  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
+    if constexpr (!ByteEncoding) {
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK(((index_bits >> byte_index) & 1) == 1)
+            << "Bit not set at matching byte index: " << byte_index;
+        // Only the first match is needed, so stop scanning once found.
+        break;
+      }
+
+      CARBON_CHECK(((index_bits >> byte_index) & 1) == 0)
+          << "Bit set at non-matching byte index: " << byte_index;
+    } else {
+      // `index_bits` is byte-encoded rather than bit encoded, so extract a
+      // byte.
+      uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF;
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK((index_byte & 0x80) == 0x80)
+            << "Should have the high bit set for a matching byte, found: "
+            << llvm::formatv("{0:x}", index_byte);
+        // Only the first match is needed so stop scanning once found.
+        break;
+      }
+
+      CARBON_CHECK(index_byte == 0)
+          << "Should have no bits set for an unmatched byte, found: "
+          << llvm::formatv("{0:x}", index_byte);
+    }
+  }
+  return true;
+}
+
+inline auto MetadataGroup::VerifyRangeBits(
+    MatchBitsT range_bits,
+    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
+  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
+    if constexpr (!ByteEncoding) {
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK(((range_bits >> byte_index) & 1) == 1)
+            << "Bit not set at matching byte index: " << byte_index;
+      } else {
+        CARBON_CHECK(((range_bits >> byte_index) & 1) == 0)
+            << "Bit set at non-matching byte index: " << byte_index;
+      }
+    } else {
+      // `range_bits` is byte-encoded rather than bit encoded, so extract a
+      // byte.
+      uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF;
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK(range_byte == 0x80)
+            << "Should just have the high bit set for a matching byte, found: "
+            << llvm::formatv("{0:x}", range_byte);
+      } else {
+        CARBON_CHECK(range_byte == 0)
+            << "Should have no bits set for an unmatched byte, found: "
+            << llvm::formatv("{0:x}", range_byte);
+      }
+    }
+  }
+  return true;
+}
+
+inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index)
+    -> MetadataGroup {
+  MetadataGroup g;
+  static_assert(sizeof(g) == Size);
+  std::memcpy(&g.metadata_bytes, metadata + index, Size);
+  return g;
+}
+
+inline auto MetadataGroup::PortableStore(uint8_t* metadata, ssize_t index) const
+    -> void {
+  std::memcpy(metadata + index, &metadata_bytes, Size);
+}
+
+inline auto MetadataGroup::PortableClearDeleted() -> void {
+  for (uint64_t& metadata_int : metadata_ints) {
+    // Deleted bytes have only the least significant bits set, so to clear them
+    // we only need to clear the least significant bit. And empty bytes already
+    // have a clear least significant bit, so the only least significant bits we
+    // need to preserve are those of present bytes. The most significant bit of
+    // every present byte is set, so we take the most significant bit of each
+    // byte, shift it into the least significant bit position, and bit-or it
+    // with the complement of `LSBs`. This will have ones for every bit but the
+    // least significant bits, and ones for the least significant bits of every
+    // present byte.
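+    // For example, per byte: a present byte such as 0xaa produces a mask byte
+    // of 0xff and is preserved; a deleted byte 0x01 produces a mask byte of
+    // 0xfe and becomes 0x00 (empty); an empty byte 0x00 is already 0 and stays
+    // unchanged.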
+    metadata_int &= (~LSBs | metadata_int >> 7);
+  }
+}
+
+inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange {
+  // The caller should provide the 7-bit tag derived from the hash and must not
+  // set the present bit on it; this layer manages tagging the high bit of a
+  // present byte.
+  CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag);
+
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t match_bits = 0;
+    uint32_t bit = 1;
+    uint8_t present_byte = tag | PresentMask;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] == present_byte) {
+        match_bits |= bit;
+      }
+      bit <<= 1;
+    }
+    return MatchRange(match_bits);
+  }
+
+  // This algorithm only works for matching *present* bytes. We leverage the
+  // set high bit in the present case as part of the algorithm. The whole
+  // algorithm has a critical path height of 4 operations, and does 6
+  // operations total on AArch64. The operation dependency graph is:
+  //
+  //          group | MSBs        LSBs * match_byte + MSBs
+  //                 \                /
+  //                 match_bits ^ broadcast
+  //                            |
+  //   group & MSBs        MSBs - match_bits
+  //          \                /
+  //        group_MSBs & match_bits
+  //
+  // This diagram and the operation count are specific to AArch64 where we have
+  // a fused *integer* multiply-add operation.
+  //
+  // While it is superficially similar to the "find zero bytes in a word" bit
+  // math trick, it is different because this is designed to have no false
+  // positives and perfectly produce 0x80 for matching bytes and 0x00 for
+  // non-matching bytes. This is doable because we constrain ourselves to only
+  // handle present matches, which only require testing 7 bits and have a
+  // particular layout.
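+  //
+  // As a concrete per-byte illustration, take tag 0x2a, so a matching present
+  // byte is 0xaa:
+  // - Matching present byte 0xaa: (0xaa | 0x80) ^ (0x2a + 0x80) == 0x00, then
+  //   0x80 - 0x00 == 0x80, and 0x80 & (0xaa & 0x80) == 0x80.
+  // - Non-matching present byte 0xb3: 0xb3 ^ 0xaa == 0x19, then
+  //   0x80 - 0x19 == 0x67, and 0x67 & 0x80 == 0x00.
+  // - Empty byte 0x00: (0x00 | 0x80) ^ 0xaa == 0x2a, then 0x80 - 0x2a == 0x56,
+  //   and the final mask with (0x00 & 0x80) clears it regardless.
+  // The byte-wise subtraction cannot borrow across bytes because every byte of
+  // `match_bits` has its high bit clear after the xor.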
+
+  // Set the high bit of every byte to `1`. Any matching byte is a present byte
+  // and so always has this bit set as well, which means the xor below, in
+  // addition to zeroing the low 7 bits of any byte that matches the tag, also
+  // clears the high bit of every byte.
+  uint64_t match_bits = metadata_ints[0] | MSBs;
+  // Broadcast the match byte to all bytes, and mask in the present bits in the
+  // MSBs of each byte. We structure this as a multiply and an add because we
+  // know that the add cannot carry, and this way it can be lowered using
+  // combined multiply-add instructions if available.
+  uint64_t broadcast = LSBs * tag + MSBs;
+  CARBON_DCHECK(broadcast == (LSBs * tag | MSBs))
+      << "Unexpected carry from addition!";
+
+  // Xor the broadcast byte pattern. This makes bytes with matches become 0, and
+  // clears the high-bits of non-matches. Note that if we are looking for a tag
+  // with the same value as `Empty` or `Deleted`, those bytes will be zero as
+  // well.
+  match_bits = match_bits ^ broadcast;
+  // Subtract each byte of `match_bits` from `0x80` bytes. After this, the high
+  // bit will be set only for those bytes that were zero.
+  match_bits = MSBs - match_bits;
+  // Zero everything but the high bits, and also zero the high bits of any bytes
+  // for "not present" slots in the original group. This avoids false positives
+  // for `Empty` and `Deleted` bytes in the metadata.
+  match_bits &= (metadata_ints[0] & MSBs);
+
+  // At this point, `match_bits` has the high bit set for bytes where the
+  // original group byte equals `tag` plus the high bit.
+  CARBON_DCHECK(VerifyRangeBits(
+      match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); }));
+  return MatchRange(match_bits);
+}
+
+inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange {
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t match_bits = 0;
+    uint32_t bit = 1;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] & PresentMask) {
+        match_bits |= bit;
+      }
+      bit <<= 1;
+    }
+    return MatchRange(match_bits);
+  }
+
+  // Want to keep the high bit of each byte, which indicates whether that byte
+  // represents a present slot.
+  uint64_t match_bits = metadata_ints[0] & MSBs;
+
+  CARBON_DCHECK(VerifyRangeBits(
+      match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; }));
+  return MatchRange(match_bits);
+}
+
+inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex {
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t bit = 1;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] == Empty) {
+        return MatchIndex(bit);
+      }
+      bit <<= 1;
+    }
+    return MatchIndex(0);
+  }
+
+  // This sets the high bit of every byte in `match_bits` unless the
+  // corresponding metadata byte is 0. We take advantage of the fact that
+  // the metadata bytes are non-zero only if they are either:
+  // - present: in which case the high bit of the byte will already be set; or
+  // - deleted: in which case the byte will be 1, and shifting it left by 7 will
+  //   cause the high bit to be set.
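+  // Per byte, only the high bit matters after the mask below: present bytes
+  // (for example 0xaa) already have it set, a deleted byte 0x01 gets it set
+  // from its shifted low bit, and only an empty byte 0x00 leaves it clear.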
+  uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7);
+  // This inverts the high bits of the bytes, and clears the remaining bits.
+  match_bits = ~match_bits & MSBs;
+
+  // The high bits of the bytes of `match_bits` are set if the corresponding
+  // metadata byte is `Empty`.
+  CARBON_DCHECK(
+      VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; }));
+  return MatchIndex(match_bits);
+}
+
+inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex {
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t bit = 1;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] == Deleted) {
+        return MatchIndex(bit);
+      }
+      bit <<= 1;
+    }
+    return MatchIndex(0);
+  }
+
+  // This sets the high bit of every byte in `match_bits` unless the
+  // corresponding metadata byte is 1. We take advantage of the fact that the
+  // metadata bytes are not 1 only if they are either:
+  // - present: in which case the high bit of the byte will already be set; or
+  // - empty: in which case the byte will be 0, so inverting it and shifting
+  //   left by 7 will set the high bit.
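+  // Per byte, only the high bit matters after the mask below: present bytes
+  // already have it set, an empty byte 0x00 gets it set from its inverted and
+  // shifted low bit, and only a deleted byte 0x01 leaves it clear.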
+  uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7);
+  // This inverts the high bits of the bytes, and clears the remaining bits.
+  match_bits = ~match_bits & MSBs;
+
+  // The high bits of the bytes of `match_bits` are set if the corresponding
+  // metadata byte is `Deleted`.
+  CARBON_DCHECK(VerifyIndexBits(match_bits,
+                                [](uint8_t byte) { return byte == Deleted; }));
+  return MatchIndex(match_bits);
+}
+
+inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs,
+                                                MetadataGroup rhs) -> bool {
+  return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes);
+}
+
+inline auto MetadataGroup::SIMDLoad(const uint8_t* metadata, ssize_t index)
+    -> MetadataGroup {
+  MetadataGroup g;
+#if CARBON_NEON_SIMD_SUPPORT
+  g.metadata_vec = vld1_u8(metadata + index);
+#elif CARBON_X86_SIMD_SUPPORT
+  g.metadata_vec =
+      _mm_load_si128(reinterpret_cast<const __m128i*>(metadata + index));
+#else
+  static_assert(!UseSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(metadata);
+  static_cast<void>(index);
+#endif
+  return g;
+}
+
+inline auto MetadataGroup::SIMDStore(uint8_t* metadata, ssize_t index) const
+    -> void {
+#if CARBON_NEON_SIMD_SUPPORT
+  vst1_u8(metadata + index, metadata_vec);
+#elif CARBON_X86_SIMD_SUPPORT
+  _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec);
+#else
+  static_assert(!UseSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(metadata);
+  static_cast<void>(index);
+#endif
+}
+
+inline auto MetadataGroup::SIMDClearDeleted() -> void {
+#if CARBON_NEON_SIMD_SUPPORT
+  // There is no good Neon operation to implement this, so do it using integer
+  // code. This is reasonably fast, but unfortunate because it forces the group
+  // out of a SIMD register and into a general purpose register, which can have
+  // high latency.
+  metadata_ints[0] &= (~LSBs | metadata_ints[0] >> 7);
+#elif CARBON_X86_SIMD_SUPPORT
+  // For each byte, use `metadata_vec` if the byte's high bit is set (indicating
+  // it is present), otherwise (it is empty or deleted) replace it with zero
+  // (representing empty).
+  metadata_vec =
+      _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec);
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+}
+
+inline auto MetadataGroup::SIMDMatch(uint8_t tag) const -> MatchRange {
+  MatchRange result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Broadcast byte we want to match to every byte in the vector.
+  auto match_byte_vec = vdup_n_u8(tag | PresentMask);
+  // Result bytes have all bits set for the bytes that match, so we have to
+  // clear everything but MSBs next.
+  auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec);
+  uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchRange(match_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  result = X86SIMDMatch(tag | PresentMask);
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(tag);
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDMatchPresent() const -> MatchRange {
+  MatchRange result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Just extract the metadata directly.
+  uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchRange(match_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  // We arranged the byte vector so that present bytes have the high bit set,
+  // which this instruction extracts.
+  result = MatchRange(_mm_movemask_epi8(metadata_vec));
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDMatchEmpty() const -> MatchIndex {
+  MatchIndex result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Compare all bytes with zero, as that is the empty byte value. Result will
+  // have all bits set for any input zero byte, so we zero all but the high bits
+  // below.
+  auto cmp_vec = vceqz_u8(metadata_vec);
+  uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchIndex(metadata_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  // Even though we only need the first match rather than all matches, we don't
+  // have a more efficient way to compute this on x86 and so we reuse the
+  // general match infrastructure that computes all matches in a bit-encoding.
+  // We then convert it into a `MatchIndex` that just finds the first one.
+  result = static_cast<MatchIndex>(X86SIMDMatch(Empty));
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDMatchDeleted() const -> MatchIndex {
+  MatchIndex result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Broadcast the `Deleted` byte across the vector and compare the bytes of
+  // that with the metadata vector. The result will have all bits set for any
+  // input zero byte, so we zero all but the high bits below.
+  auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted));
+  uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchIndex(match_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  // Even though we only need the first match rather than all matches, we don't
+  // have a more efficient way to compute this on x86 and so we reuse the
+  // general match infrastructure that computes all matches in a bit-encoding.
+  // We then convert it into a `MatchIndex` that just finds the first one.
+  result = static_cast<MatchIndex>(X86SIMDMatch(Deleted));
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDCompareEqual(MetadataGroup lhs,
+                                            MetadataGroup rhs) -> bool {
+#if CARBON_NEON_SIMD_SUPPORT
+  return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] ==
+         static_cast<uint64_t>(-1LL);
+#elif CARBON_X86_SIMD_SUPPORT
+  // Different x86 SIMD extensions provide different comparison functionality.
+#if __SSE4_2__
+  // With SSE 4.2, we can directly test and branch in the SIMD domain on whether
+  // the two metadata vectors are equal.
+  return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec),
+                         _mm_set1_epi8(0xff)) == 1;
+#else
+  // With older versions of SSE we have to extract the result of the comparison,
+  // much like we do when matching. That will have the usual bitmask
+  // representing equal bytes, and test for that exact bitmask in scalar code.
+  return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec,
+                                          rhs.metadata_vec)) == 0x0000'ffffU;
+#endif
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(lhs);
+  static_cast<void>(rhs);
+  return false;
+#endif
+}
+
+#if CARBON_X86_SIMD_SUPPORT
+inline auto MetadataGroup::X86SIMDMatch(uint8_t match_byte) const
+    -> MatchRange {
+  // Broadcast the byte we're matching against to all bytes in a vector, and
+  // compare those bytes with the metadata vector bytes.
+  auto match_byte_vec = _mm_set1_epi8(match_byte);
+  auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec);
+  // Extract the result of each byte-wise comparison into the low bits of an
+  // integer.
+  uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec);
+  return MatchRange(match_bits);
+}
+#endif
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_

+ 328 - 0
common/raw_hashtable_metadata_group_benchmark.cpp

@@ -0,0 +1,328 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <benchmark/benchmark.h>
+
+#include <algorithm>
+
+#include "absl/random/random.h"
+#include "common/raw_hashtable_metadata_group.h"
+
+namespace Carbon::RawHashtable {
+
+// If we have any SIMD support, create dedicated benchmark utilities for the
+// portable and SIMD implementation so we can directly benchmark both.
+#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
+// Override the core API with explicit use of the portable API.
+class BenchmarkPortableMetadataGroup : public MetadataGroup {
+ public:
+  explicit BenchmarkPortableMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}
+
+  static auto Load(uint8_t* metadata, ssize_t index)
+      -> BenchmarkPortableMetadataGroup {
+    return BenchmarkPortableMetadataGroup(PortableLoad(metadata, index));
+  }
+  auto Store(uint8_t* metadata, ssize_t index) const -> void {
+    PortableStore(metadata, index);
+  }
+
+  auto ClearDeleted() -> void { PortableClearDeleted(); }
+
+  auto Match(uint8_t present_byte) const -> MatchRange {
+    return PortableMatch(present_byte);
+  }
+  auto MatchPresent() const -> MatchRange { return PortableMatchPresent(); }
+
+  auto MatchEmpty() const -> MatchIndex { return PortableMatchEmpty(); }
+  auto MatchDeleted() const -> MatchIndex { return PortableMatchDeleted(); }
+};
+
+// Override the core API with explicit use of the SIMD API.
+class BenchmarkSIMDMetadataGroup : public MetadataGroup {
+ public:
+  explicit BenchmarkSIMDMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}
+
+  static auto Load(uint8_t* metadata, ssize_t index)
+      -> BenchmarkSIMDMetadataGroup {
+    return BenchmarkSIMDMetadataGroup(SIMDLoad(metadata, index));
+  }
+  auto Store(uint8_t* metadata, ssize_t index) const -> void {
+    SIMDStore(metadata, index);
+  }
+
+  auto ClearDeleted() -> void { SIMDClearDeleted(); }
+
+  auto Match(uint8_t present_byte) const -> MatchRange {
+    return SIMDMatch(present_byte);
+  }
+  auto MatchPresent() const -> MatchRange { return SIMDMatchPresent(); }
+
+  auto MatchEmpty() const -> MatchIndex { return SIMDMatchEmpty(); }
+  auto MatchDeleted() const -> MatchIndex { return SIMDMatchDeleted(); }
+};
+#endif
+
+namespace {
+
+// The number of metadata groups we use when benchmarking a particular scenario
+// of matching within a group.
+constexpr ssize_t BenchSize = 256;
+
+#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
+using PortableGroup = BenchmarkPortableMetadataGroup;
+using SIMDGroup = BenchmarkSIMDMetadataGroup;
+#endif
+
+struct BenchMetadata {
+  // The metadata for benchmarking, arranged in `BenchSize` groups, each one
+  // `GroupSize` in length. As a consequence, the size of this array will always
+  // be `BenchSize * GroupSize`.
+  llvm::MutableArrayRef<uint8_t> metadata;
+
+  // For benchmarking random matches in the metadata, each byte here is the tag
+  // that should be matched against the corresponding group of the metadata.
+  // Because this array parallels the *groups* of the metadata array, its size
+  // will be `BenchSize`. For other kinds, this is empty.
+  llvm::ArrayRef<uint8_t> bytes;
+};
+
+enum class BenchKind : uint8_t {
+  Random,
+  Empty,
+  Deleted,
+};
+
+// This routine should only be called once per `BenchKind` as the initializer of
+// a global variable below. It returns an `ArrayRef` pointing into
+// function-local static storage that provides our benchmark metadata.
+//
+// The returned array will have exactly `GroupSize` elements, each of type
+// `BenchMetadata`. For the `BenchMetadata` at index `i`, there will be `i+1`
+// matches of that kind within each group of the metadata. This lets us
+// benchmark each of the possible match-counts for a group.
+template <BenchKind Kind = BenchKind::Random>
+static auto BuildBenchMetadata() -> llvm::ArrayRef<BenchMetadata> {
+  // We build `GroupSize` elements of `BenchMetadata` below, and so we need
+  // `GroupSize` copies of each of these arrays to serve as inputs to it.
+  //
+  // The first storage is of `BenchSize` groups of metadata.
+  static uint8_t metadata_storage[GroupSize][BenchSize * GroupSize];
+  // When `Kind` is `Random`, each group above will have a *different* byte that
+  // matches in that group. This array stores those bytes for the benchmark to
+  // match against the group.
+  static uint8_t bytes_storage[GroupSize][BenchSize];
+
+  // The backing storage for the returned `ArrayRef`.
+  static BenchMetadata bm_storage[GroupSize];
+
+  absl::BitGen gen;
+  for (auto [bm_index, bm] : llvm::enumerate(bm_storage)) {
+    int match_count = bm_index + 1;
+
+    for (ssize_t g_index : llvm::seq<ssize_t>(0, BenchSize)) {
+      // Start by filling the group with random bytes.
+      auto group_bytes = llvm::MutableArrayRef(
+          &metadata_storage[bm_index][g_index * GroupSize], GroupSize);
+      for (uint8_t& b : group_bytes) {
+        b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
+      }
+
+      // Now we need up to `match_count` random indices into the group where
+      // we'll put a matching byte.
+      std::array<ssize_t, GroupSize> group_indices;
+      std::iota(group_indices.begin(), group_indices.end(), 0);
+      std::shuffle(group_indices.begin(), group_indices.end(), gen);
+
+      // Now set the byte at the first match index to the desired value.
+      ssize_t match_index = *group_indices.begin();
+      uint8_t& match_b = group_bytes[match_index];
+      switch (Kind) {
+        case BenchKind::Random: {
+          // Already a random value, but we need to ensure it isn't one that
+          // repeats elsewhere in the group.
+          while (llvm::count(group_bytes, match_b) > 1) {
+            match_b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
+          }
+          // Store this as the byte to search for in this group, but without the
+          // present bit to simulate where we start when using a 7-bit tag
+          // from a hash.
+          bytes_storage[bm_index][g_index] =
+              match_b & ~MetadataGroup::PresentMask;
+          break;
+        }
+        case BenchKind::Empty: {
+          match_b = MetadataGroup::Empty;
+          break;
+        }
+        case BenchKind::Deleted: {
+          match_b = MetadataGroup::Deleted;
+          break;
+        }
+      }
+
+      // Replicate the match byte in each of the other matching indices.
+      for (ssize_t m_index : llvm::ArrayRef(group_indices)
+                                 .drop_front()
+                                 .take_front(match_count - 1)) {
+        group_bytes[m_index] = match_b;
+      }
+    }
+
+    // Now that the storage is set up, record these in our struct.
+    bm.metadata = metadata_storage[bm_index];
+    if constexpr (Kind == BenchKind::Random) {
+      bm.bytes = bytes_storage[bm_index];
+    }
+  }
+  return bm_storage;
+}
+
+template <BenchKind Kind>
+// NOLINTNEXTLINE(google-readability-casting): False positive clang-tidy bug.
+const auto bench_metadata = BuildBenchMetadata<Kind>();
+
+// Benchmark that simulates the dynamic execution pattern when we match exactly
+// one entry in the group, typically then using the index of the matching byte
+// to index into an element of a group of entries. But notably, the *first*
+// match is sufficient, and we never have to find the *next* match within the
+// group.
+template <BenchKind Kind, typename GroupT = MetadataGroup>
+static void BM_LoadMatch(benchmark::State& s) {
+  BenchMetadata bm = bench_metadata<Kind>[0];
+
+  // We want to make the index used by the next iteration of the benchmark have
+  // a data dependency on the result of matching. A match produces an index into
+  // the group of metadata. To consume this match in a way that is
+  // representative of how it will be used in a hashtable (indexing into an
+  // array of entries), while establishing that dependence, we keep a
+  // group-sized array of the value `1` in memory that we can index into to
+  // increment to the next step of the loop. We do have to hide the contents of
+  // the loop from the optimizer by clobbering the memory.
+  ssize_t all_ones[GroupSize];
+  for (ssize_t& n : all_ones) {
+    n = 1;
+  }
+  benchmark::ClobberMemory();
+
+  // We don't want the optimizer to peel iterations off of this loop, so hide
+  // the starting index.
+  ssize_t i = 0;
+  benchmark::DoNotOptimize(i);
+
+  // This loop looks *really* attractive for the compiler to unroll. However,
+  // that can easily overlap some of the memory operations and generally makes
+  // it harder to analyze the exact operation sequence we care about.
+#pragma clang loop unroll(disable)
+  for (auto _ : s) {
+    auto g = GroupT::Load(bm.metadata.data(), i * GroupSize);
+    typename GroupT::MatchIndex matches;
+    if constexpr (Kind == BenchKind::Empty) {
+      matches = g.MatchEmpty();
+    } else if constexpr (Kind == BenchKind::Deleted) {
+      matches = g.MatchDeleted();
+    } else {
+      static_assert(Kind == BenchKind::Random);
+      matches = static_cast<MetadataGroup::MatchIndex>(g.Match(bm.bytes[i]));
+    }
+    // Despite not being a DCHECK, this is fine for benchmarking. In an actual
+    // hashtable, we expect a test of whether the match is empty prior to using
+    // it to index an array, and that test is expected to be strongly predicted.
+    // That exactly matches how the `CARBON_CHECK` macro works, and so this
+    // serves as both a good correctness test and replication of hashtable usage
+    // of a match.
+    CARBON_CHECK(matches);
+
+    // Now do the data-dependent increment by indexing our "all ones" array. The
+    // index into `all_ones` is analogous to the index into a group of hashtable
+    // entries.
+    i = (i + all_ones[matches.index()]) & (BenchSize - 1);
+  }
+}
+BENCHMARK(BM_LoadMatch<BenchKind::Random>);
+BENCHMARK(BM_LoadMatch<BenchKind::Empty>);
+BENCHMARK(BM_LoadMatch<BenchKind::Deleted>);
+#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
+BENCHMARK(BM_LoadMatch<BenchKind::Random, PortableGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Empty, PortableGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Deleted, PortableGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Random, SIMDGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Empty, SIMDGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Deleted, SIMDGroup>);
+#endif
+
+// Benchmark that measures the speed of a match that is only found after at
+// least one miss. Because the first match doesn't work, this covers
+// incrementing to the next match, with a number of increments taken from the
+// `Steps` template parameter.
+template <BenchKind Kind, ssize_t Steps>
+static void BM_LoadMatchMissSteps(benchmark::State& s) {
+  static_assert(Steps > 0);
+  static_assert(Steps <= GroupSize);
+
+  // We pick the benchmark metadata at index `Steps - 1`, which will have
+  // `Steps` matches within each group.
+  BenchMetadata bm = bench_metadata<Kind>[Steps - 1];
+
+  // We want to make the index used by the next iteration of the benchmark have
+  // a data dependency on the result of matching. A match produces an index into
+  // the group of metadata. To consume this match in a way that is
+  // representative of how it will be used in a hashtable (indexing into an
+  // array of entries), while establishing that dependence, we keep a
+  // group-sized array of the value `1` in memory that we can index into to
+  // increment to the next step of the loop. We do have to hide the contents of
+  // the loop from the optimizer by clobbering the memory.
+  ssize_t all_ones[GroupSize];
+  for (ssize_t& n : all_ones) {
+    n = 1;
+  }
+  benchmark::ClobberMemory();
+
+  // We don't want the optimizer to peel iterations off of this loop, so hide
+  // the starting index.
+  ssize_t i = 0;
+  benchmark::DoNotOptimize(i);
+
+  // This loop looks *really* attractive for the compiler to unroll. However,
+  // that can easily overlap some of the memory operations and generally makes
+  // it harder to analyze the exact operation sequence we care about.
+#pragma clang loop unroll(disable)
+  for (auto _ : s) {
+    auto g = MetadataGroup::Load(bm.metadata.data(), i * GroupSize);
+    auto matched_range = g.Match(bm.bytes[i]);
+
+    // We don't use a `CARBON_CHECK` here as the loop below will test the range
+    // to see if the loop should be skipped, replicating the test that we also
+    // expect in hashtable usage.
+
+    // We want to simulate the code sequence a hashtable would produce when
+    // matching indices are "misses" in the hashtable, but only the aspects of
+    // those that reflect on the specific *match* implementation's generated
+    // code and performance. For each index in the match, we locate it in the
+    // `matched_range`, extract it as an index, and use that to index a
+    // group-sized array. We read memory from that array to increment `indices`,
+    // establishing data dependencies on each match index. This loop will run
+    // exactly `Steps` times.
+    ssize_t indices = 0;
+    for (ssize_t index : matched_range) {
+      indices += all_ones[index];
+    }
+
+    // We want to propagate the data dependencies accumulated into `indices`
+    // into the next value of `i`, and we know exactly how many increments were
+    // done in the loop, so subtract that constant and add one to arrive back at
+    // an increment of 1.
+    i = (i + (indices - Steps + 1)) & (BenchSize - 1);
+  }
+}
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 1>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 2>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 4>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 8>);
+#if CARBON_X86_SIMD_SUPPORT
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 12>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 16>);
+#endif
+
+}  // namespace
+}  // namespace Carbon::RawHashtable

+ 11 - 0
common/raw_hashtable_metadata_group_benchmark_test.sh

@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+#
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/raw_hashtable_metadata_group_benchmark"
+
+exec "$BENCHMARK" \
+  --benchmark_counters_tabular=true \
+  --benchmark_min_time=1x

+ 115 - 0
common/raw_hashtable_test_helpers.h

@@ -0,0 +1,115 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_
+#define CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_
+
+#include <compare>
+
+#include "common/check.h"
+#include "common/hashing.h"
+#include "common/hashtable_key_context.h"
+#include "common/ostream.h"
+
+namespace Carbon::RawHashtable {
+
+// Non-trivial type for testing.
+struct TestData : Printable<TestData> {
+  int value;
+
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  TestData(int v) : value(v) { CARBON_CHECK(value >= 0); }
+  ~TestData() {
+    CARBON_CHECK(value >= 0);
+    value = -1;
+  }
+  TestData(const TestData& other) : TestData(other.value) {}
+  TestData(TestData&& other) noexcept : TestData(other.value) {
+    other.value = 0;
+  }
+  auto Print(llvm::raw_ostream& out) const -> void { out << value; }
+
+  friend auto operator==(TestData lhs, TestData rhs) -> bool {
+    return lhs.value == rhs.value;
+  }
+
+  friend auto operator<=>(TestData lhs, TestData rhs) -> std::strong_ordering {
+    return lhs.value <=> rhs.value;
+  }
+
+  friend auto CarbonHashValue(TestData data, uint64_t seed) -> HashCode {
+    return Carbon::HashValue(data.value, seed);
+  }
+};
+
+// Test stateless key context that produces different hashes from normal.
+// Changing the hash values should result in test failures if the context ever
+// fails to be used.
+struct TestKeyContext : DefaultKeyContext {
+  template <typename KeyT>
+  auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode {
+    Hasher hash(seed);
+    // Inject some other data to the hash.
+    hash.Hash(42);
+    hash.Hash(HashValue(key));
+    return static_cast<HashCode>(hash);
+  }
+};
+
+// Hostile fixed hashing key context used for stress testing. Allows control
+// over which parts of the hash will be forced to collide, and the values they
+// are coerced to. Note that this relies on implementation details and internals
+// of `HashCode`.
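+// For example, a hypothetical instantiation that forces every key's index bits
+// to a single fixed value while leaving the 7-bit tag intact could be written
+// as `FixedHashKeyContext</*TagBits=*/7, /*FixIndexBits=*/true,
+// /*FixTagBits=*/false, /*FixedVal=*/0x1234>`.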
+template <int TagBits, bool FixIndexBits, bool FixTagBits, uint64_t FixedVal>
+struct FixedHashKeyContext : DefaultKeyContext {
+  template <typename KeyT>
+  auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode {
+    HashCode original_hash = HashValue(key, seed);
+    auto raw_hash = static_cast<uint64_t>(original_hash);
+
+    constexpr uint64_t TagMask = (1U << TagBits) - 1;
+    if (FixIndexBits) {
+      raw_hash &= TagMask;
+      raw_hash |= FixedVal << TagBits;
+      CARBON_DCHECK(HashCode(raw_hash).ExtractIndexAndTag<TagBits>().first ==
+                    (FixedVal & (~static_cast<uint64_t>(0) >> TagBits)));
+    }
+    if (FixTagBits) {
+      raw_hash &= ~TagMask;
+      raw_hash |= FixedVal & TagMask;
+      CARBON_DCHECK(HashCode(raw_hash).ExtractIndexAndTag<TagBits>().second ==
+                    (FixedVal & TagMask));
+    }
+    return HashCode(raw_hash);
+  }
+};
+
+template <typename T>
+class IndexKeyContext {
+ public:
+  explicit IndexKeyContext(llvm::ArrayRef<T> array) : array_(array) {}
+
+  auto HashKey(const T& value, uint64_t seed) const -> HashCode {
+    return HashValue(value, seed);
+  }
+  auto HashKey(ssize_t index, uint64_t seed) const -> HashCode {
+    return HashKey(array_[index], seed);
+  }
+
+  auto KeyEq(const T& lhs, ssize_t rhs_index) const -> bool {
+    return lhs == array_[rhs_index];
+  }
+  auto KeyEq(ssize_t lhs_index, ssize_t rhs_index) const -> bool {
+    // No need to compare the elements; if the indices are equal, the values
+    // must be.
+    return lhs_index == rhs_index;
+  }
+
+ private:
+  llvm::ArrayRef<T> array_;
+};
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_

+ 358 - 0
common/set.h

@@ -0,0 +1,358 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_SET_H_
+#define CARBON_COMMON_SET_H_
+
+#include <concepts>
+
+#include "common/check.h"
+#include "common/hashtable_key_context.h"
+#include "common/raw_hashtable.h"
+#include "llvm/Support/Compiler.h"
+
+namespace Carbon {
+
+// Forward declarations to resolve cyclic references.
+template <typename KeyT, typename KeyContextT>
+class SetView;
+template <typename KeyT, typename KeyContextT>
+class SetBase;
+template <typename KeyT, ssize_t SmallSize, typename KeyContextT>
+class Set;
+
+// A read-only view type for a set of keys.
+//
+// This view is a cheap-to-copy type that should be passed by value, but
+// provides view or read-only reference semantics to the underlying set data
+// structure.
+//
+// This should always be preferred to a `const`-ref parameter for the `SetBase`
+// or `Set` type as it provides more flexibility and a cleaner API.
+//
+// Note that while this type is a read-only view, that applies to the underlying
+// *set* data structure, not the individual entries stored within it. Those can
+// be mutated freely as long as both the hashes and equality of the keys are
+// preserved. If we applied a deep-`const` design here, it would prevent using
+// this type in situations where the keys carry state (unhashed and not part of
+// equality) that is mutated while the associative container is not. A view of
+// immutable data can always be obtained by using `SetView<const T>`, and we
+// enable conversions to more-const views. This mirrors the semantics of views
+// like `std::span`.
+//
+// A specific `KeyContextT` type can optionally be provided to configure how
+// keys will be hashed and compared. The default is `DefaultKeyContext` which is
+// stateless and will hash using `Carbon::HashValue` and compare using
+// `operator==`. Every method accepting a lookup key or operating on the keys in
+// the table will also accept an instance of this type. For stateless context
+// types, including the default, an instance will be default constructed if not
+// provided to these methods. However, stateful contexts should be constructed
+// and passed in explicitly. The context type should be small and reasonable to
+// pass by value, often a wrapper or pointer to the relevant context needed for
+// hashing and comparing keys. For more details about the key context, see
+// `hashtable_key_context.h`.
+template <typename InputKeyT, typename InputKeyContextT = DefaultKeyContext>
+class SetView : RawHashtable::ViewImpl<InputKeyT, void, InputKeyContextT> {
+  using ImplT = RawHashtable::ViewImpl<InputKeyT, void, InputKeyContextT>;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using KeyContextT = typename ImplT::KeyContextT;
+
+  // This type represents the result of lookup operations. It indicates
+  // whether the lookup succeeded and, if so, provides access to the key.
+  class LookupResult {
+   public:
+    LookupResult() = default;
+    explicit LookupResult(KeyT& key) : key_(&key) {}
+
+    explicit operator bool() const { return key_ != nullptr; }
+
+    auto key() const -> KeyT& { return *key_; }
+
+   private:
+    KeyT* key_ = nullptr;
+  };
+
+  // Enable implicit conversions that add `const`-ness to the key type.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  SetView(SetView<std::remove_const_t<KeyT>, KeyContextT> other_view)
+    requires(!std::same_as<KeyT, std::remove_const_t<KeyT>>)
+      : ImplT(other_view) {}
+
+  // Tests whether a key is present in the set.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool;
+
+  // Look up a key in the set.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupResult;
+
+  // Run the provided callback for every key in the set.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&>);
+
+  // This routine is relatively inefficient and only intended for use in
+  // benchmarking or logging of performance anomalies. The returned count has
+  // no specific guarantees beyond being informative in benchmarks. It counts
+  // how many of the keys in the hashtable required probing beyond their
+  // initial group of slots.
+  //
+  // TODO: Replace with a more general metrics routine that covers other
+  // important aspects such as load factor, and average probe *distance*.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t {
+    return ImplT::CountProbedKeys(key_context);
+  }
+
+ private:
+  template <typename SetKeyT, ssize_t SmallSize, typename KeyContextT>
+  friend class Set;
+  friend class SetBase<KeyT, KeyContextT>;
+  friend class SetView<const KeyT, KeyContextT>;
+
+  using EntryT = typename ImplT::EntryT;
+
+  SetView() = default;
+  // NOLINTNEXTLINE(google-explicit-constructor): Implicit by design.
+  SetView(ImplT base) : ImplT(base) {}
+  SetView(ssize_t size, RawHashtable::Storage* storage)
+      : ImplT(size, storage) {}
+};
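+
+// A minimal usage sketch for `SetView` (assuming code within the `Carbon`
+// namespace and the `Set` container defined later in this file; `ProcessKey`
+// stands in for arbitrary user code):
+// ```cpp
+//   Set<int> ints;
+//   ints.Insert(1);
+//   ints.Insert(2);
+//
+//   SetView<int> view = ints;
+//   CARBON_CHECK(view.Contains(1));
+//   if (auto result = view.Lookup(2)) {
+//     ProcessKey(result.key());
+//   }
+//   view.ForEach([](int& key) { ProcessKey(key); });
+// ```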
+
+// A base class for a `Set` type that remains mutable while type-erasing the
+// `SmallSize` (SSO) template parameter.
+//
+// A pointer or reference to this type is the preferred way to pass a mutable
+// handle to a `Set` type across API boundaries as it avoids encoding specific
+// SSO sizing information while providing a near-complete mutable API.
+template <typename InputKeyT, typename InputKeyContextT>
+class SetBase
+    : protected RawHashtable::BaseImpl<InputKeyT, void, InputKeyContextT> {
+ protected:
+  using ImplT = RawHashtable::BaseImpl<InputKeyT, void, InputKeyContextT>;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using KeyContextT = typename ImplT::KeyContextT;
+  using ViewT = SetView<KeyT, KeyContextT>;
+  using LookupResult = typename ViewT::LookupResult;
+
+  // The result type for insertion operations both indicates whether an insert
+  // was needed (as opposed to the key already being in the set) and provides
+  // access to the key.
+  class InsertResult {
+   public:
+    InsertResult() = default;
+    explicit InsertResult(bool inserted, KeyT& key)
+        : key_(&key), inserted_(inserted) {}
+
+    auto is_inserted() const -> bool { return inserted_; }
+
+    auto key() const -> KeyT& { return *key_; }
+
+   private:
+    KeyT* key_;
+    bool inserted_;
+  };
+
+  // Implicitly convertible to the relevant view type.
+  //
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator ViewT() const { return this->view_impl(); }
+
+  // We can't chain the above conversion with the conversions on `ViewT` to add
+  // const, so explicitly support adding const to produce a view here.
+  //
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator SetView<const KeyT, KeyContextT>() const { return ViewT(*this); }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool {
+    return ViewT(*this).Contains(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupResult {
+    return ViewT(*this).Lookup(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&>)
+  {
+    return ViewT(*this).ForEach(callback);
+  }
+
+  // Convenience forwarder to the view type.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const
+      -> ssize_t {
+    return ViewT(*this).CountProbedKeys(key_context);
+  }
+
+  // Insert a key into the set. If the key is already present, no insertion is
+  // performed and the existing key is available in the result. Otherwise, a new
+  // key is constructed from the argument, inserted, and made available in the
+  // result.
+  template <typename LookupKeyT>
+  auto Insert(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT())
+      -> InsertResult;
+
+  // Insert a key into the set, calling the provided callback to construct the
+  // key in place if it is not already present. The lookup key is passed
+  // through to the callback so it needn't be captured and can be kept in a
+  // register throughout.
+  //
+  // Example:
+  // ```cpp
+  //   m.Insert("widget", [](MyStringViewType lookup_key, void* key_storage) {
+  //     new (key_storage) MyStringType(lookup_key);
+  //   });
+  // ```
+  template <typename LookupKeyT, typename InsertCallbackT>
+  auto Insert(LookupKeyT lookup_key, InsertCallbackT insert_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertResult
+    requires std::invocable<InsertCallbackT, LookupKeyT, void*>;
+
+  // Erase a key from the set.
+  template <typename LookupKeyT>
+  auto Erase(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT())
+      -> bool;
+
+  // Clear all keys from the set but leave the underlying hashtable allocated
+  // and in place.
+  void Clear();
+
+ protected:
+  using ImplT::ImplT;
+};
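+
+// A minimal sketch of passing a set across an API boundary via `SetBase`
+// (assuming code within the `Carbon` namespace; the key context argument is
+// spelled out explicitly, and `RecordId` is just an illustrative name):
+// ```cpp
+//   auto RecordId(SetBase<int, DefaultKeyContext>& ids, int id) -> bool {
+//     return ids.Insert(id).is_inserted();
+//   }
+//
+//   Set<int, 16> ids;
+//   bool newly_seen = RecordId(ids, 42);
+// ```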
+
+// A data structure for a set of keys.
+//
+// This set supports small size optimization (or "SSO"). The provided
+// `SmallSize` template parameter indicates the size of an embedded buffer for
+// storing sets small enough to fit. The default is zero, which always allocates
+// a heap buffer on construction. When non-zero, it must be a multiple of
+// `MaxGroupSize`, which is currently 16. The library checks that the size is
+// valid and produces a compile-time error if not. We don't automatically
+// select the next multiple or otherwise fit the size to the constraints, so
+// that the code makes clear how much memory is used by the SSO buffer.
+//
+// This data structure optimizes heavily for small key types that are cheap to
+// move and even copy. Using large keys or keys that are expensive to copy may
+// create surprising performance bottlenecks. A `std::string` key should be fine
+// with generally small strings, but if some or many strings are large heap
+// allocations, the performance of hashtable routines may be unacceptably bad,
+// and another data structure or key design is likely preferable.
+//
+// Note that this type should typically not appear on API boundaries; either
+// `SetBase` or `SetView` should be used instead.
+template <typename InputKeyT, ssize_t SmallSize = 0,
+          typename InputKeyContextT = DefaultKeyContext>
+class Set : public RawHashtable::TableImpl<SetBase<InputKeyT, InputKeyContextT>,
+                                           SmallSize> {
+  using BaseT = SetBase<InputKeyT, InputKeyContextT>;
+  using ImplT = RawHashtable::TableImpl<BaseT, SmallSize>;
+
+ public:
+  using KeyT = typename BaseT::KeyT;
+
+  Set() = default;
+  Set(const Set& arg) = default;
+  Set(Set&& arg) noexcept = default;
+
+  // Reset the entire state of the hashtable to what it was when constructed,
+  // throwing away any intervening allocations.
+  void Reset();
+};
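+
+// A minimal sketch of the SSO parameter (assuming code within the `Carbon`
+// namespace):
+// ```cpp
+//   // Uses a 16-entry inline buffer for small sets; `SmallSize` must be a
+//   // multiple of `MaxGroupSize` (currently 16).
+//   Set<int, 16> small_ids;
+//   small_ids.Insert(7);
+//
+//   // The default `SmallSize` of zero always allocates a heap buffer.
+//   Set<int> heap_ids;
+// ```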
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetView<InputKeyT, InputKeyContextT>::Contains(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> bool {
+  return this->LookupEntry(lookup_key, key_context) != nullptr;
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetView<InputKeyT, InputKeyContextT>::Lookup(LookupKeyT lookup_key,
+                                                  KeyContextT key_context) const
+    -> LookupResult {
+  EntryT* entry = this->LookupEntry(lookup_key, key_context);
+  if (!entry) {
+    return LookupResult();
+  }
+
+  return LookupResult(entry->key());
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename CallbackT>
+void SetView<InputKeyT, InputKeyContextT>::ForEach(CallbackT callback)
+  requires(std::invocable<CallbackT, KeyT&>)
+{
+  this->ForEachEntry([callback](EntryT& entry) { callback(entry.key()); },
+                     [](auto...) {});
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetBase<InputKeyT, InputKeyContextT>::Insert(LookupKeyT lookup_key,
+                                                  KeyContextT key_context)
+    -> InsertResult {
+  return Insert(
+      lookup_key,
+      [](LookupKeyT lookup_key, void* key_storage) {
+        new (key_storage) KeyT(std::move(lookup_key));
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT, typename InsertCallbackT>
+auto SetBase<InputKeyT, InputKeyContextT>::Insert(LookupKeyT lookup_key,
+                                                  InsertCallbackT insert_cb,
+                                                  KeyContextT key_context)
+    -> InsertResult
+  requires std::invocable<InsertCallbackT, LookupKeyT, void*>
+{
+  auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  CARBON_DCHECK(entry) << "Should always result in a valid entry.";
+
+  if (LLVM_LIKELY(!inserted)) {
+    return InsertResult(false, entry->key());
+  }
+
+  insert_cb(lookup_key, static_cast<void*>(&entry->key_storage));
+  return InsertResult(true, entry->key());
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetBase<InputKeyT, InputKeyContextT>::Erase(LookupKeyT lookup_key,
+                                                 KeyContextT key_context)
+    -> bool {
+  return this->EraseImpl(lookup_key, key_context);
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+void SetBase<InputKeyT, InputKeyContextT>::Clear() {
+  this->ClearImpl();
+}
+
+template <typename InputKeyT, ssize_t SmallSize, typename InputKeyContextT>
+void Set<InputKeyT, SmallSize, InputKeyContextT>::Reset() {
+  this->ResetImpl();
+}
+
+}  // namespace Carbon
+
+#endif  // CARBON_COMMON_SET_H_

+ 382 - 0
common/set_benchmark.cpp

@@ -0,0 +1,382 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <benchmark/benchmark.h>
+
+#include "absl/container/flat_hash_set.h"
+#include "common/raw_hashtable_benchmark_helpers.h"
+#include "common/set.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace Carbon {
+namespace {
+
+using RawHashtable::CarbonHashDI;
+using RawHashtable::GetKeysAndHitKeys;
+using RawHashtable::GetKeysAndMissKeys;
+using RawHashtable::HitArgs;
+using RawHashtable::SizeArgs;
+using RawHashtable::ValueToBool;
+
+template <typename SetT>
+struct IsCarbonSetImpl : std::false_type {};
+template <typename KT, int MinSmallSize>
+struct IsCarbonSetImpl<Set<KT, MinSmallSize>> : std::true_type {};
+
+template <typename SetT>
+static constexpr bool IsCarbonSet = IsCarbonSetImpl<SetT>::value;
+
+// A wrapper around various set types that we specialize to implement a common
+// API used in the benchmarks across set data structures that support different
+// APIs. The primary template assumes a roughly `std::unordered_set` API
+// design, and types with a different API design are supported through
+// specializations.
+template <typename SetT>
+struct SetWrapperImpl {
+  using KeyT = typename SetT::key_type;
+
+  SetT s;
+
+  auto BenchContains(KeyT k) -> bool { return s.find(k) != s.end(); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto it = s.find(k);
+    if (it == s.end()) {
+      return false;
+    }
+    // We expect keys to always convert to `true` so directly return that here.
+    return ValueToBool(*it);
+  }
+
+  auto BenchInsert(KeyT k) -> bool {
+    auto result = s.insert(k);
+    return result.second;
+  }
+
+  auto BenchErase(KeyT k) -> bool { return s.erase(k) != 0; }
+};
+
+// Explicit (partial) specialization for the Carbon set type that uses its
+// different API design.
+template <typename KT, int MinSmallSize>
+struct SetWrapperImpl<Set<KT, MinSmallSize>> {
+  using SetT = Set<KT, MinSmallSize>;
+  using KeyT = KT;
+
+  SetT s;
+
+  auto BenchContains(KeyT k) -> bool { return s.Contains(k); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto result = s.Lookup(k);
+    if (!result) {
+      return false;
+    }
+    return ValueToBool(result.key());
+  }
+
+  auto BenchInsert(KeyT k) -> bool {
+    auto result = s.Insert(k);
+    return result.is_inserted();
+  }
+
+  auto BenchErase(KeyT k) -> bool { return s.Erase(k); }
+};
+
+// Provide a way to override the Carbon Set specific benchmark runs with another
+// hashtable implementation. When building, you can use one of these enum names
+// in a macro define such as `-DCARBON_SET_BENCH_OVERRIDE=Name` in order to
+// trigger a specific override for the `Set` type benchmarks. This is used to
+// get before/after runs that compare the performance of Carbon's Set versus
+// other implementations.
+enum class SetOverride : uint8_t {
+  Abseil,
+  LLVM,
+  LLVMAndCarbonHash,
+};
+template <typename SetT, SetOverride Override>
+struct SetWrapperOverride : SetWrapperImpl<SetT> {};
+
+template <typename KeyT, int MinSmallSize>
+struct SetWrapperOverride<Set<KeyT, MinSmallSize>, SetOverride::Abseil>
+    : SetWrapperImpl<absl::flat_hash_set<KeyT>> {};
+
+template <typename KeyT, int MinSmallSize>
+struct SetWrapperOverride<Set<KeyT, MinSmallSize>, SetOverride::LLVM>
+    : SetWrapperImpl<llvm::DenseSet<KeyT>> {};
+
+template <typename KeyT, int MinSmallSize>
+struct SetWrapperOverride<Set<KeyT, MinSmallSize>,
+                          SetOverride::LLVMAndCarbonHash>
+    : SetWrapperImpl<llvm::DenseSet<KeyT, CarbonHashDI<KeyT>>> {};
+
+#ifndef CARBON_SET_BENCH_OVERRIDE
+template <typename SetT>
+using SetWrapper = SetWrapperImpl<SetT>;
+#else
+template <typename SetT>
+using SetWrapper =
+    SetWrapperOverride<SetT, SetOverride::CARBON_SET_BENCH_OVERRIDE>;
+#endif
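+
+// For example, one plausible way to produce such a comparison run against the
+// Abseil implementation (exact Bazel flags depend on the local setup):
+//
+//   bazel run -c opt //common:set_benchmark
+//   bazel run -c opt --copt=-DCARBON_SET_BENCH_OVERRIDE=Abseil \
+//     //common:set_benchmark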
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT)        \
+  BENCHMARK(NAME<Set<KT>>)->Apply(APPLY);                 \
+  BENCHMARK(NAME<absl::flat_hash_set<KT>>)->Apply(APPLY); \
+  BENCHMARK(NAME<llvm::DenseSet<KT>>)->Apply(APPLY);      \
+  BENCHMARK(NAME<llvm::DenseSet<KT, CarbonHashDI<KT>>>)->Apply(APPLY)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_ONE_OP(NAME, APPLY)       \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int);  \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int*); \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, llvm::StringRef)
+
+// Benchmark the "latency" of testing for a key in a set. This always tests with
+// a key that is found.
+//
+// However, because the key is always found and because the test ultimately
+// involves conditional control flow that can be predicted, we expect modern
+// CPUs to perfectly predict the control flow here and turn the measurement from
+// one iteration to the next into a throughput measurement rather than a real
+// latency measurement.
+//
+// However, this does represent a particularly common way in which a set data
+// structure is accessed. The numbers should just be carefully interpreted in
+// the context of being more a reflection of reciprocal throughput than actual
+// latency. See the `Lookup` benchmarks for a genuine latency measure with its
+// own caveats.
+//
+// However, this does still show some interesting caching effects when querying
+// large fractions of large tables, and can give a sense of the inescapable
+// magnitude of these effects even when there is a great deal of prediction and
+// speculative execution to hide memory access latency.
+template <typename SetT>
+static void BM_SetContainsHitPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      // We block optimizing `i` as that has proven both more effective at
+      // blocking the loop from being optimized away and avoiding disruption of
+      // the generated code that we're benchmarking.
+      benchmark::DoNotOptimize(i);
+
+      bool result = s.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      // We use the lookup success to step through keys, establishing a
+      // dependency between each lookup. This doesn't fully allow us to measure
+      // latency rather than throughput, as noted above.
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetContainsHitPtr, HitArgs);
+
+// Benchmark the "latency" (but more likely the reciprocal throughput, see
+// comment above) of testing for a key in the set that is *not* present.
+template <typename SetT>
+static void BM_SetContainsMissPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] = GetKeysAndMissKeys<KT>(state.range(0));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = s.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(!result);
+      i += static_cast<ssize_t>(!result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetContainsMissPtr, SizeArgs);
+
+// A somewhat contrived latency test for the lookup code path.
+//
+// While lookups into a set are often (but not always) simply used to influence
+// control flow, that style of access produces difficult-to-evaluate benchmark
+// results (see the comments on the `Contains` benchmarks above).
+//
+// So here we actually access the key in the set and convert that key's value to
+// a boolean on the critical path of each iteration. This lets us have a genuine
+// latency benchmark of looking up a key in the set, at the expense of being
+// somewhat contrived. That said, for usage where the key object is queried or
+// operated on in some way once looked up in the set, this will be fairly
+// representative of the latency cost from the data structure.
+template <typename SetT>
+static void BM_SetLookupHitPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = s.BenchLookup(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetLookupHitPtr, HitArgs);
+
+// First erase and then insert the key. The code path will always be the same
+// here and so we expect this to largely be a throughput benchmark because of
+// branch prediction and speculative execution.
+//
+// We don't expect erase followed by insertion to be a common user code
+// sequence, but we don't have a good way of benchmarking either erase or insert
+// in isolation -- each would change the size of the table and thus the next
+// iteration's benchmark. And if we try to correct the table size outside of the
+// timed region, we end up trying to exclude too fine-grained a region from the
+// timers to get good measurement data.
+//
+// Our solution is to benchmark both erase and insertion back to back. We can
+// then get a good profile of the code sequence of each, and at least measure
+// the sum cost of these reliably. Careful profiling can help attribute that
+// cost between erase and insert in order to understand which of the two
+// operations is contributing most to any performance artifacts observed.
+template <typename SetT>
+static void BM_SetEraseInsertHitPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      s.BenchErase(lookup_keys[i]);
+      benchmark::ClobberMemory();
+
+      bool inserted = s.BenchInsert(lookup_keys[i]);
+      CARBON_DCHECK(inserted);
+      i += static_cast<ssize_t>(inserted);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetEraseInsertHitPtr, HitArgs);
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_OP_SEQ_SIZE(NAME, KT)                  \
+  BENCHMARK(NAME<Set<KT>>)->Apply(SizeArgs);                 \
+  BENCHMARK(NAME<absl::flat_hash_set<KT>>)->Apply(SizeArgs); \
+  BENCHMARK(NAME<llvm::DenseSet<KT>>)->Apply(SizeArgs);      \
+  BENCHMARK(NAME<llvm::DenseSet<KT, CarbonHashDI<KT>>>)->Apply(SizeArgs)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_OP_SEQ(NAME)       \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int);  \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int*); \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, llvm::StringRef)
+
+// This is an interesting, somewhat specialized benchmark that measures the cost
+// of inserting a sequence of keys into a set up to some size, then inserting a
+// colliding key, and finally throwing away the set.
+//
+// This is an especially important usage pattern for sets as a large number of
+// algorithms essentially look like this, such as collision detection, cycle
+// detection, de-duplication, etc.
+//
+// It also covers both the insert-into-an-empty-slot code path that isn't
+// covered elsewhere, and the code path for growing a table to a larger size.
+//
+// This is the second most important aspect of expected set usage after testing
+// for presence. It also nicely lends itself to a single benchmark that covers
+// the total cost of this usage pattern.
+//
+// Because this benchmark operates on whole sets, we also compute the number of
+// probed keys for Carbon's set as that is both a general reflection of the
+// efficacy of the underlying hash function, and a direct factor that drives the
+// cost of these operations.
+template <typename SetT>
+static void BM_SetInsertSeq(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  constexpr ssize_t LookupKeysSize = 1 << 8;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), LookupKeysSize);
+
+  // The shuffled `lookup_keys` (which contain duplicates) are rotated through
+  // below for the final colliding insert at the end of each iteration.
+  ssize_t i = 0;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(i);
+
+    SetWrapperT s;
+    for (auto k : keys) {
+      bool inserted = s.BenchInsert(k);
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // Now insert a final key that is already present, drawn from the shuffled
+    // lookup keys.
+    bool inserted = s.BenchInsert(lookup_keys[i]);
+    CARBON_DCHECK(!inserted) << "Must already be in the set!";
+
+    // Rotate through the shuffled keys.
+    i = (i + static_cast<ssize_t>(!inserted)) & (LookupKeysSize - 1);
+  }
+
+  // It can be easier in some cases to think of this as a key-throughput rate of
+  // insertion rather than the latency of inserting N keys, so construct the
+  // rate counter as well.
+  state.counters["KeyRate"] = benchmark::Counter(
+      keys.size(), benchmark::Counter::kIsIterationInvariantRate);
+
+  // Report some extra statistics about the Carbon type.
+  if constexpr (IsCarbonSet<SetT>) {
+    // Re-build a set outside of the timing loop to look at the statistics
+    // rather than the timing.
+    SetT s;
+    for (auto k : keys) {
+      bool inserted = s.Insert(k).is_inserted();
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // While this count is "iteration invariant" (it should be exactly the same
+    // for every iteration as the set of keys is the same), we don't use that
+    // because it will scale this by the number of iterations. We want to
+    // display the probe count of this benchmark *parameter*, not the probe
+    // count that resulted from the number of iterations. That means we use the
+    // normal counter API without flags.
+    state.counters["Probed"] = s.CountProbedKeys();
+
+    // Uncomment this call to print out statistics about the index-collisions
+    // among these keys for debugging:
+    //
+    // RawHashtable::DumpHashStatistics(keys);
+  }
+}
+MAP_BENCHMARK_OP_SEQ(BM_SetInsertSeq);
+
+}  // namespace
+}  // namespace Carbon

+ 12 - 0
common/set_benchmark_test.sh

@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/set_benchmark"
+
+exec "$BENCHMARK" \
+  --benchmark_counters_tabular=true \
+  --benchmark_min_time=1x \
+  --benchmark_filter='^[^/]*/[1-9][0-9]{0,3}(/[0-9]+)?$'

+ 218 - 0
common/set_test.cpp

@@ -0,0 +1,218 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/set.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <initializer_list>
+#include <type_traits>
+#include <vector>
+
+#include "common/raw_hashtable_test_helpers.h"
+
+namespace Carbon {
+namespace {
+
+using RawHashtable::IndexKeyContext;
+using RawHashtable::TestData;
+using ::testing::UnorderedElementsAreArray;
+
+template <typename SetT, typename MatcherRangeT>
+void ExpectSetElementsAre(SetT&& s, MatcherRangeT element_matchers) {
+  // Collect the elements into a container.
+  using KeyT = typename std::remove_reference<SetT>::type::KeyT;
+  std::vector<KeyT> entries;
+  s.ForEach([&entries](KeyT& k) { entries.push_back(k); });
+
+  // Use the GoogleMock unordered container matcher to validate and show errors
+  // on wrong elements.
+  EXPECT_THAT(entries, UnorderedElementsAreArray(element_matchers));
+}
+
+// Allow directly using an initializer list.
+template <typename SetT, typename MatcherT>
+void ExpectSetElementsAre(SetT&& s,
+                          std::initializer_list<MatcherT> element_matchers) {
+  std::vector<MatcherT> element_matchers_storage = element_matchers;
+  ExpectSetElementsAre(s, element_matchers_storage);
+}
+
+template <typename RangeT, typename... RangeTs>
+auto MakeElements(RangeT&& range, RangeTs&&... ranges) {
+  std::vector<typename RangeT::value_type> elements;
+  auto add_range = [&elements](auto&& r) {
+    for (const auto& e : r) {
+      elements.push_back(e);
+    }
+  };
+  add_range(std::forward<RangeT>(range));
+  (add_range(std::forward<RangeTs>(ranges)), ...);
+
+  return elements;
+}
+
+template <typename SetT>
+class SetTest : public ::testing::Test {};
+
+using Types = ::testing::Types<Set<int>, Set<int, 16>, Set<int, 128>,
+                               Set<TestData>, Set<TestData, 16>>;
+TYPED_TEST_SUITE(SetTest, Types);
+
+TYPED_TEST(SetTest, Basic) {
+  using SetT = TypeParam;
+  SetT s;
+
+  EXPECT_FALSE(s.Contains(42));
+  EXPECT_TRUE(s.Insert(1).is_inserted());
+  EXPECT_TRUE(s.Contains(1));
+  auto result = s.Lookup(1);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  auto i_result = s.Insert(1);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_TRUE(s.Contains(1));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, {1});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Insert(i).is_inserted());
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Contains(i));
+    EXPECT_FALSE(s.Insert(i).is_inserted());
+  }
+  EXPECT_FALSE(s.Contains(513));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, MakeElements(llvm::seq(1, 512)));
+}
+
+TYPED_TEST(SetTest, FactoryAPI) {
+  using SetT = TypeParam;
+  SetT s;
+  EXPECT_TRUE(s.Insert(1, [](int k, void* key_storage) {
+                 return new (key_storage) int(k);
+               }).is_inserted());
+  ASSERT_TRUE(s.Contains(1));
+  // Reinsertion doesn't invoke the callback.
+  EXPECT_FALSE(s.Insert(1, [](int, void*) -> int* {
+                  llvm_unreachable("Should never be called!");
+                }).is_inserted());
+}
+
+TYPED_TEST(SetTest, Copy) {
+  using SetT = TypeParam;
+
+  SetT s;
+  // Make sure we exceed the small size for some of the set types, but not all
+  // of them, so we cover all the combinations of copying between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(s.Insert(i).is_inserted());
+  }
+
+  SetT other_s1 = s;
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24)));
+
+  // Add some more elements to the original.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(s.Insert(i).is_inserted());
+  }
+
+  // The first copy doesn't change.
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24)));
+
+  // A new copy does.
+  SetT other_s2 = s;
+  ExpectSetElementsAre(other_s2, MakeElements(llvm::seq(1, 32)));
+}
+
+TYPED_TEST(SetTest, Move) {
+  using SetT = TypeParam;
+
+  SetT s;
+  // Make sure we exceed the small size for some of the set types, but not all
+  // of them, so we cover all the combinations of copying between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(s.Insert(i).is_inserted());
+  }
+
+  SetT other_s1 = std::move(s);
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24)));
+
+  // Add some more elements.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(other_s1.Insert(i).is_inserted());
+  }
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 32)));
+}
+
+TYPED_TEST(SetTest, Conversions) {
+  using SetT = TypeParam;
+  using KeyT = SetT::KeyT;
+  SetT s;
+  ASSERT_TRUE(s.Insert(1).is_inserted());
+  ASSERT_TRUE(s.Insert(2).is_inserted());
+  ASSERT_TRUE(s.Insert(3).is_inserted());
+  ASSERT_TRUE(s.Insert(4).is_inserted());
+
+  SetView<KeyT> sv = s;
+  SetView<const KeyT> csv = sv;
+  SetView<const KeyT> csv2 = s;
+  EXPECT_TRUE(sv.Contains(1));
+  EXPECT_TRUE(csv.Contains(2));
+  EXPECT_TRUE(csv2.Contains(3));
+}
+
+TEST(SetContextTest, Basic) {
+  llvm::SmallVector<TestData> keys;
+  for (int i : llvm::seq(0, 513)) {
+    keys.push_back(i * 100);
+  }
+  IndexKeyContext<TestData> key_context(keys);
+  Set<ssize_t, 0, IndexKeyContext<TestData>> s;
+
+  EXPECT_FALSE(s.Contains(42, key_context));
+  EXPECT_TRUE(s.Insert(1, key_context).is_inserted());
+  EXPECT_TRUE(s.Contains(1, key_context));
+  auto result = s.Lookup(TestData(100), key_context);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  auto i_result = s.Insert(1, IndexKeyContext<TestData>(keys));
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_TRUE(s.Contains(1, key_context));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, {1});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Insert(i, key_context).is_inserted());
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Contains(i, key_context));
+    EXPECT_FALSE(s.Insert(i, key_context).is_inserted());
+  }
+  EXPECT_FALSE(s.Contains(0, key_context));
+  EXPECT_FALSE(s.Contains(512, key_context));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, MakeElements(llvm::seq(1, 512)));
+}
+
+}  // namespace
+}  // namespace Carbon