
Introduce custom hash table data structures. (#3940)

The hash table design is heavily based on Abseil's ["Swiss
Tables"][swiss-tables]. It uses an array of bytes storing metadata about
each entry and an array of entries, where each entry is a key/value
pair. The metadata byte consists of 7 bits of the key's hash (distinct
from the bits used to index the table) and one bit indicating whether
the slot holds a special entry -- either empty or deleted.

[swiss-tables]: https://abseil.io/about/design/swisstables
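
As a rough, hedged illustration only (the actual byte values and names live in
`raw_hashtable_metadata_group.h` and may differ), one plausible encoding along
the lines described above, with the 7-bit tag presumably coming from something
like the tag half of `HashCode::ExtractIndexAndTag<7>()` in `common/hashing.h`:

```cpp
#include <cstdint>

// Sketch only: a set high bit marks a present entry whose low 7 bits carry a
// tag from the key's hash; otherwise the byte is one of the special markers.
inline constexpr uint8_t kEmptyByte = 0x00;
inline constexpr uint8_t kDeletedByte = 0x01;
inline constexpr uint8_t kPresentMask = 0x80;

constexpr auto MakePresentByte(uint32_t hash_tag) -> uint8_t {
  return kPresentMask | static_cast<uint8_t>(hash_tag & 0x7F);
}
constexpr auto IsPresent(uint8_t metadata) -> bool {
  return (metadata & kPresentMask) != 0;
}
```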

There is a wide range of optimizations and other nuanced aspects to this
hash table design and implementation. A good starting point for
understanding that context is `raw_hashtable.h`, which has an overview
of the design and references to the other files with relevant details.

---------

Co-authored-by: josh11b <15258583+josh11b@users.noreply.github.com>
Chandler Carruth, 1 year ago
commit 21a81bc59e

+ 4 - 0
.bazelrc

@@ -9,6 +9,10 @@ build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_ti
 build:clang-tidy --output_groups=report
 build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config
 
+# This warning seems to incorrectly fire in this build configuration, despite
+# not firing in our normal builds.
+build:clang-tidy --copt=-Wno-unknown-pragmas
+
 # Default to using a disk cache to minimize re-building LLVM and Clang which we
 # try to avoid updating too frequently to minimize rebuild cost. The location
 # here can be overridden in the user configuration where needed.

+ 1 - 0
.codespell_ignore

@@ -11,6 +11,7 @@ createor
 crossreference
 falsy
 forin
+groupt
 inout
 parameteras
 pullrequest

+ 175 - 0
common/BUILD

@@ -182,6 +182,14 @@ cc_binary(
     ],
 )
 
+cc_library(
+    name = "hashtable_key_context",
+    hdrs = ["hashtable_key_context.h"],
+    deps = [
+        ":hashing",
+    ],
+)
+
 cc_library(
     name = "indirect_value",
     hdrs = ["indirect_value.h"],
@@ -224,6 +232,53 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "map",
+    hdrs = ["map.h"],
+    deps = [
+        ":check",
+        ":hashtable_key_context",
+        ":raw_hashtable",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_test(
+    name = "map_test",
+    srcs = ["map_test.cpp"],
+    deps = [
+        ":map",
+        ":raw_hashtable_test_helpers",
+        "//testing/base:gtest_main",
+        "//testing/base:test_raw_ostream",
+        "@googletest//:gtest",
+    ],
+)
+
+cc_binary(
+    name = "map_benchmark",
+    testonly = 1,
+    srcs = ["map_benchmark.cpp"],
+    deps = [
+        ":map",
+        ":raw_hashtable_benchmark_helpers",
+        "@abseil-cpp//absl/container:flat_hash_map",
+        "@abseil-cpp//absl/random",
+        "@google_benchmark//:benchmark_main",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+sh_test(
+    name = "map_benchmark_test",
+    # The benchmark allocates a large amount of memory.
+    size = "enormous",
+    # We configure the test to run quickly.
+    timeout = "short",
+    srcs = ["map_benchmark_test.sh"],
+    data = [":map_benchmark"],
+)
+
 cc_library(
     name = "ostream",
     hdrs = ["ostream.h"],
@@ -232,6 +287,126 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "raw_hashtable",
+    srcs = ["raw_hashtable.cpp"],
+    hdrs = ["raw_hashtable.h"],
+    deps = [
+        ":check",
+        ":hashing",
+        ":hashtable_key_context",
+        ":raw_hashtable_metadata_group",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_library(
+    name = "raw_hashtable_metadata_group",
+    srcs = ["raw_hashtable_metadata_group.cpp"],
+    hdrs = ["raw_hashtable_metadata_group.h"],
+    deps = [
+        ":check",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_binary(
+    name = "raw_hashtable_metadata_group_benchmark",
+    testonly = 1,
+    srcs = ["raw_hashtable_metadata_group_benchmark.cpp"],
+    deps = [
+        ":raw_hashtable_metadata_group",
+        "@abseil-cpp//absl/random",
+        "@google_benchmark//:benchmark_main",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+sh_test(
+    name = "raw_hashtable_metadata_group_benchmark_test",
+    srcs = ["raw_hashtable_metadata_group_benchmark_test.sh"],
+    data = [":raw_hashtable_metadata_group_benchmark"],
+)
+
+cc_library(
+    name = "raw_hashtable_benchmark_helpers",
+    testonly = 1,
+    srcs = ["raw_hashtable_benchmark_helpers.cpp"],
+    hdrs = ["raw_hashtable_benchmark_helpers.h"],
+    copts = [
+        "-O2",  # Always optimize to make testing benchmarks faster.
+    ],
+    deps = [
+        ":check",
+        ":hashing",
+        ":raw_hashtable",
+        ":set",
+        "@abseil-cpp//absl/base:no_destructor",
+        "@abseil-cpp//absl/hash",
+        "@abseil-cpp//absl/random",
+        "@google_benchmark//:benchmark",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_library(
+    name = "raw_hashtable_test_helpers",
+    testonly = 1,
+    hdrs = ["raw_hashtable_test_helpers.h"],
+    deps = [
+        ":check",
+        ":hashing",
+        ":hashtable_key_context",
+        ":ostream",
+    ],
+)
+
+cc_library(
+    name = "set",
+    hdrs = ["set.h"],
+    deps = [
+        ":check",
+        ":hashtable_key_context",
+        ":raw_hashtable",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_test(
+    name = "set_test",
+    srcs = ["set_test.cpp"],
+    deps = [
+        ":raw_hashtable_test_helpers",
+        ":set",
+        "//testing/base:gtest_main",
+        "//testing/base:test_raw_ostream",
+        "@googletest//:gtest",
+    ],
+)
+
+cc_binary(
+    name = "set_benchmark",
+    testonly = 1,
+    srcs = ["set_benchmark.cpp"],
+    deps = [
+        ":raw_hashtable_benchmark_helpers",
+        ":set",
+        "@abseil-cpp//absl/container:flat_hash_set",
+        "@google_benchmark//:benchmark_main",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+sh_test(
+    name = "set_benchmark_test",
+    # The benchmark allocates a large amount of memory.
+    size = "enormous",
+    # We configure the test to run quickly.
+    timeout = "short",
+    srcs = ["set_benchmark_test.sh"],
+    data = [":set_benchmark"],
+)
+
 cc_library(
     name = "string_helpers",
     srcs = ["string_helpers.cpp"],

+ 2 - 2
common/hashing.h

@@ -573,9 +573,9 @@ constexpr auto HashCode::ExtractIndex() -> ssize_t { return value_; }
 template <int N>
 constexpr auto HashCode::ExtractIndexAndTag() -> std::pair<ssize_t, uint32_t> {
   static_assert(N >= 1);
-  static_assert(N <= 32);
+  static_assert(N < 32);
   return {static_cast<ssize_t>(value_ >> N),
-          static_cast<uint32_t>(value_ & ((1U << (N + 1)) - 1))};
+          static_cast<uint32_t>(value_ & ((1U << N) - 1))};
 }
 
 // Building with `-DCARBON_MCA_MARKERS` will enable `llvm-mca` annotations in
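
For context on the mask change above (a sketch, not part of the diff): the old
expression kept `N + 1` low bits in the tag rather than `N`, which is what the
new `hashing_test.cpp` expectations below verify.

```cpp
// Masking arithmetic for N = 2, chosen purely for illustration.
static_assert(((1U << (2 + 1)) - 1) == 0b111, "old mask kept N + 1 bits");
static_assert(((1U << 2) - 1) == 0b11, "new mask keeps exactly N bits");
```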

+ 6 - 0
common/hashing_test.cpp

@@ -40,6 +40,12 @@ TEST(HashingTest, HashCodeAPI) {
   EXPECT_THAT(a.ExtractIndex(), Ne(b.ExtractIndex()));
   EXPECT_THAT(a.ExtractIndex(), Ne(empty.ExtractIndex()));
 
+  // The tag shouldn't have bits set outside the range requested.
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<1>().second & ~0b1, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>().second & ~0b11, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<3>().second & ~0b111, Eq(0));
+  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<4>().second & ~0b1111, Eq(0));
+
   // Note that the index produced with a tag may be different from the index
   // alone!
   EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>(),

+ 85 - 0
common/hashtable_key_context.h

@@ -0,0 +1,85 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
+#define CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
+
+#include "common/hashing.h"
+
+namespace Carbon {
+
+// Customizable context for keys in hashtables.
+//
+// This type or customizations matching its API are used with the data
+// structures in `map.h` and `set.h`. By providing a custom version of the
+// `KeyContext` type parameter to those data structures, users can provide
+// either stateless or stateful customization of the two core hashtable key
+// operations: hashing and comparison.
+//
+// The default for hashing uses Carbon's `hashing.h`. Customizations must still
+// return a `HashCode` as defined there, and it needs to have the same core
+// properties of hashes produced by the `hashing.h` infrastructure.
+//
+// The default for comparison is `operator==`. The `KeyEq` method is always
+// called with a key *stored in the hashtable* as the second or "RHS" parameter.
+// This is to allow simplifying the set of overloads needed for heterogeneous
+// contexts: only the first, LHS, parameter needs to support different lookup
+// key types.
+//
+// Custom KeyContext types should have the same API as the default type.
+// They can choose to use templates to support heterogeneous key types or not as
+// appropriate. The default context can also be used as a base class with only
+// one or the other APIs customized.
+//
+// An important consideration is how the key context is constructed. When the
+// key context can be default constructed, hashtable APIs trafficking in keys
+// will have overloads that provide a default constructed key context. When the
+// context is *not* default constructible, every API that accepts a key will
+// also require a context argument to be called, and that argument will be used
+// throughout that operation. The intent is to allow callers to provide stateful
+// contexts to each API where it would be needed, while managing that state
+// outside the hashtable. Often the needed state is trivially part of the
+// caller's existing state and needn't be stored separately.
+//
+// Example for a stateful, customized key context for interned strings:
+// ```cpp
+// class InternedStringIndexKeyContext {
+//  public:
+//   InternedStringIndexKeyContext(
+//       llvm::ArrayRef<llvm::StringRef> interned_strings)
+//       : interned_strings_(interned_strings) {}
+//
+//   auto HashKey(llvm::StringRef s, uint64_t seed) const -> HashCode {
+//     return HashValue(s, seed);
+//   }
+//   auto HashKey(int index_key, uint64_t seed) const -> HashCode {
+//     return HashKey(interned_strings_[index_key], seed);
+//   }
+//
+//   auto KeyEq(llvm::StringRef lhs, int rhs_index) const -> bool {
+//     return lhs == interned_strings_[rhs_index];
+//   }
+//   auto KeyEq(int lhs_index, int rhs_index) const -> bool {
+//     return KeyEq(interned_strings_[lhs_index], rhs_index);
+//   }
+//
+//  private:
+//   llvm::ArrayRef<llvm::StringRef> interned_strings_;
+// };
+// ```
+struct DefaultKeyContext {
+  template <typename KeyT>
+  auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode {
+    return HashValue(key, seed);
+  }
+
+  template <typename LHSKeyT, typename RHSKeyT>
+  auto KeyEq(const LHSKeyT& lhs_key, const RHSKeyT& rhs_key) const -> bool {
+    return lhs_key == rhs_key;
+  }
+};
+
+}  // namespace Carbon
+
+#endif  // CARBON_COMMON_HASHTABLE_KEY_CONTEXT_H_
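
As a usage sketch (not part of the change, with hypothetical function and
variable names), a stateful, non-default-constructible context like the
`InternedStringIndexKeyContext` example above would be threaded through each
keyed call on the `Map` type added in `common/map.h` below:

```cpp
#include "common/map.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

// Assumes the `InternedStringIndexKeyContext` sketched in the comment above is
// defined and visible here.
auto CountString(llvm::ArrayRef<llvm::StringRef> interned_strings,
                 llvm::StringRef s) -> void {
  InternedStringIndexKeyContext context(interned_strings);

  // Keys are indices into `interned_strings`; hashing and comparison go
  // through the context. Because the context is not default constructible,
  // every keyed API call takes it explicitly.
  Carbon::Map<int, int, 0, InternedStringIndexKeyContext> counts;
  counts.Insert(/*lookup_key=*/0, /*new_v=*/1, context);

  // Heterogeneous lookup: a string lookup key is hashed and compared against
  // the stored index keys by the same context.
  if (auto result = counts.Lookup(s, context)) {
    ++result.value();
  }
}
```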

+ 558 - 0
common/map.h

@@ -0,0 +1,558 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_MAP_H_
+#define CARBON_COMMON_MAP_H_
+
+#include <algorithm>
+#include <concepts>
+#include <utility>
+
+#include "common/check.h"
+#include "common/hashtable_key_context.h"
+#include "common/raw_hashtable.h"
+#include "llvm/Support/Compiler.h"
+
+namespace Carbon {
+
+// Forward declarations to resolve cyclic references.
+template <typename KeyT, typename ValueT, typename KeyContextT>
+class MapView;
+template <typename KeyT, typename ValueT, typename KeyContextT>
+class MapBase;
+template <typename KeyT, typename ValueT, ssize_t SmallSize,
+          typename KeyContextT>
+class Map;
+
+// A read-only view type for a map from key to value.
+//
+// This view is a cheap-to-copy type that should be passed by value, but
+// provides view or read-only reference semantics to the underlying map data
+// structure.
+//
+// This should always be preferred to a `const`-ref parameter for the `MapBase`
+// or `Map` type as it provides more flexibility and a cleaner API.
+//
+// Note that while this type is a read-only view, that applies to the underlying
+// *map* data structure, not the individual entries stored within it. Those can
+// be mutated freely as long as both the hashes and equality of the keys are
+// preserved. If we applied a deep-`const` design here, it would prevent using
+// this type in many useful situations where the elements are mutated but the
+// associative container is not. A view of immutable data can always be obtained
+// by using `MapView<const T, const V>`, and we enable conversions to more-const
+// views. This mirrors the semantics of views like `std::span`.
+//
+// A specific `KeyContextT` type can optionally be provided to configure how
+// keys will be hashed and compared. The default is `DefaultKeyContext` which is
+// stateless and will hash using `Carbon::HashValue` and compare using
+// `operator==`. Every method accepting a lookup key or operating on the keys in
+// the table will also accept an instance of this type. For stateless context
+// types, including the default, an instance will be default constructed if not
+// provided to these methods. However, stateful contexts should be constructed
+// and passed in explicitly. The context type should be small and reasonable to
+// pass by value, often a wrapper or pointer to the relevant context needed for
+// hashing and comparing keys. For more details about the key context, see
+// `hashtable_key_context.h`.
+template <typename InputKeyT, typename InputValueT,
+          typename InputKeyContextT = DefaultKeyContext>
+class MapView
+    : RawHashtable::ViewImpl<InputKeyT, InputValueT, InputKeyContextT> {
+  using ImplT =
+      RawHashtable::ViewImpl<InputKeyT, InputValueT, InputKeyContextT>;
+  using EntryT = typename ImplT::EntryT;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using ValueT = typename ImplT::ValueT;
+  using KeyContextT = typename ImplT::KeyContextT;
+
+  // This type represents the result of lookup operations. It encodes whether
+  // the lookup was a success as well as accessors for the key and value.
+  class LookupKVResult {
+   public:
+    LookupKVResult() = default;
+    explicit LookupKVResult(EntryT* entry) : entry_(entry) {}
+
+    explicit operator bool() const { return entry_ != nullptr; }
+
+    auto key() const -> KeyT& { return entry_->key(); }
+    auto value() const -> ValueT& { return entry_->value(); }
+
+   private:
+    EntryT* entry_ = nullptr;
+  };
+
+  // Enable implicit conversions that add `const`-ness to either key or value
+  // type. This is always safe to do with a view. We use a template to avoid
+  // needing all 3 versions.
+  template <typename OtherKeyT, typename OtherValueT>
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  MapView(MapView<OtherKeyT, OtherValueT, KeyContextT> other_view)
+    requires(std::same_as<KeyT, OtherKeyT> ||
+             std::same_as<KeyT, const OtherKeyT>) &&
+            (std::same_as<ValueT, OtherValueT> ||
+             std::same_as<ValueT, const OtherValueT>)
+      : ImplT(other_view) {}
+
+  // Tests whether a key is present in the map.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool;
+
+  // Lookup a key in the map.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupKVResult;
+
+  // Lookup a key in the map and try to return a pointer to its value. Returns
+  // null on a missing key.
+  template <typename LookupKeyT>
+  auto operator[](LookupKeyT lookup_key) const
+      -> ValueT* requires(std::default_initializable<KeyContextT>);
+
+  // Run the provided callback for every key and value in the map.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&, ValueT&>);
+
+  // This routine is relatively inefficient and only intended for use in
+  // benchmarking or logging of performance anomalies. The count returned has
+  // no guarantees beyond being informative in benchmarks.
+  // It counts how many of the keys in the hashtable have required probing
+  // beyond their initial group of slots.
+  //
+  // TODO: Replace with a more general metrics routine that covers other
+  // important aspects such as load factor, and average probe *distance*.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t {
+    return ImplT::CountProbedKeys(key_context);
+  }
+
+ private:
+  template <typename MapKeyT, typename MapValueT, ssize_t MinSmallSize,
+            typename KeyContextT>
+  friend class Map;
+  friend class MapBase<KeyT, ValueT, KeyContextT>;
+  friend class MapView<const KeyT, ValueT, KeyContextT>;
+  friend class MapView<KeyT, const ValueT, KeyContextT>;
+  friend class MapView<const KeyT, const ValueT, KeyContextT>;
+
+  MapView() = default;
+  // NOLINTNEXTLINE(google-explicit-constructor): Implicit by design.
+  MapView(ImplT base) : ImplT(base) {}
+  MapView(ssize_t size, RawHashtable::Storage* storage)
+      : ImplT(size, storage) {}
+};
+
+// A base class for a `Map` type that remains mutable while type-erasing the
+// `SmallSize` (SSO) template parameter.
+//
+// A pointer or reference to this type is the preferred way to pass a mutable
+// handle to a `Map` type across API boundaries as it avoids encoding specific
+// SSO sizing information while providing a near-complete mutable API.
+template <typename InputKeyT, typename InputValueT,
+          typename InputKeyContextT = DefaultKeyContext>
+class MapBase : protected RawHashtable::BaseImpl<InputKeyT, InputValueT,
+                                                 InputKeyContextT> {
+ protected:
+  using ImplT =
+      RawHashtable::BaseImpl<InputKeyT, InputValueT, InputKeyContextT>;
+  using EntryT = typename ImplT::EntryT;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using ValueT = typename ImplT::ValueT;
+  using KeyContextT = typename ImplT::KeyContextT;
+  using ViewT = MapView<KeyT, ValueT, KeyContextT>;
+  using LookupKVResult = typename ViewT::LookupKVResult;
+
+  // The result type for insertion operations both indicates whether an insert
+  // was needed (as opposed to finding an existing element), and provides access
+  // to the element's key and value.
+  class InsertKVResult {
+   public:
+    InsertKVResult() = default;
+    explicit InsertKVResult(bool inserted, EntryT& entry)
+        : entry_(&entry), inserted_(inserted) {}
+
+    auto is_inserted() const -> bool { return inserted_; }
+
+    auto key() const -> KeyT& { return entry_->key(); }
+    auto value() const -> ValueT& { return entry_->value(); }
+
+   private:
+    EntryT* entry_;
+    bool inserted_;
+  };
+
+  // Implicitly convertible to the relevant view type.
+  //
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator ViewT() const { return this->view_impl(); }
+
+  // We can't chain the above conversion with the conversions on `ViewT` to add
+  // const, so explicitly support adding const to produce a view here.
+  template <typename OtherKeyT, typename OtherValueT>
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator MapView<OtherKeyT, OtherValueT, KeyContextT>() const
+    requires(std::same_as<KeyT, OtherKeyT> ||
+             std::same_as<const KeyT, OtherKeyT>) &&
+            (std::same_as<ValueT, OtherValueT> ||
+             std::same_as<const ValueT, OtherValueT>)
+  {
+    return ViewT(*this);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool {
+    return ViewT(*this).Contains(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupKVResult {
+    return ViewT(*this).Lookup(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto operator[](LookupKeyT lookup_key) const
+      -> ValueT* requires(std::default_initializable<KeyContextT>) {
+        return ViewT(*this)[lookup_key];
+      }
+
+  // Convenience forwarder to the view type.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&, ValueT&>)
+  {
+    return ViewT(*this).ForEach(callback);
+  }
+
+  // Convenience forwarder to the view type.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const
+      -> ssize_t {
+    return ViewT(*this).CountProbedKeys(key_context);
+  }
+
+  // Insert a key and value into the map. If the key is already present, the new
+  // value is discarded and the existing value preserved.
+  template <typename LookupKeyT>
+  auto Insert(LookupKeyT lookup_key, ValueT new_v,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult;
+
+  // Insert a key into the map and call the provided callback if necessary to
+  // produce a new value when no existing value is found.
+  //
+  // Example: `m.Insert(key, [] { return default_value; });`
+  //
+  // TODO: The `;` formatting below appears to be a bug in clang-format with
+  // concepts that should be filed upstream.
+  template <typename LookupKeyT, typename ValueCallbackT>
+  auto Insert(LookupKeyT lookup_key, ValueCallbackT value_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(
+        !std::same_as<ValueT, ValueCallbackT> &&
+        std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+  ;
+
+  // Lookup a key in the map and if missing insert it and call the provided
+  // callback to in-place construct both the key and value. The lookup key is
+  // passed through to the callback so it needn't be captured and can be kept in
+  // a register argument throughout.
+  //
+  // Example:
+  // ```cpp
+  //   m.Insert("widget", [](MyStringViewType lookup_key, void* key_storage,
+  //                         void* value_storage) {
+  //     new (key_storage) MyStringType(lookup_key);
+  //     new (value_storage) MyValueType(....);
+  //   });
+  // ```
+  template <typename LookupKeyT, typename InsertCallbackT>
+  auto Insert(LookupKeyT lookup_key, InsertCallbackT insert_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(!std::same_as<ValueT, InsertCallbackT> &&
+             std::invocable<InsertCallbackT, LookupKeyT, void*, void*>);
+
+  // Replace a key's value in a map if already present or insert it if not
+  // already present. The new value is always used.
+  template <typename LookupKeyT>
+  auto Update(LookupKeyT lookup_key, ValueT new_v,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult;
+
+  // Lookup or insert a key into the map, and set its value to the result of
+  // the `value_cb` callback. The callback is always run and its result is
+  // always used, whether the key was already in the map or not. Any existing
+  // value is replaced with the result.
+  //
+  // Example: `m.Update(key, [] { return new_value; });`
+  template <typename LookupKeyT, typename ValueCallbackT>
+  auto Update(LookupKeyT lookup_key, ValueCallbackT value_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(
+        !std::same_as<ValueT, ValueCallbackT> &&
+        std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+  ;
+
+  // Lookup or insert a key into the map. If not already present and the key is
+  // inserted, the `insert_cb` is used to construct the new key and value in
+  // place. When inserting, the lookup key is passed through to the callback so
+  // it needn't be captured and can be kept in a register argument throughout.
+  // If the key was already present, the `update_cb` is called to update the
+  // existing key and value as desired.
+  //
+  // Example of counting occurrences:
+  // ```cpp
+  //   m.Update(item, /*insert_cb=*/[](MyStringViewType lookup_key,
+  //                                   void* key_storage, void* value_storage) {
+  //                    new (key_storage) MyItem(lookup_key);
+  //                    new (value_storage) Count(1);
+  //                  },
+  //                  /*update_cb=*/[](MyItem& /*key*/, Count& count) {
+  //                    ++count;
+  //                  });
+  // ```
+  template <typename LookupKeyT, typename InsertCallbackT,
+            typename UpdateCallbackT>
+  auto Update(LookupKeyT lookup_key, InsertCallbackT insert_cb,
+              UpdateCallbackT update_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertKVResult
+    requires(!std::same_as<ValueT, InsertCallbackT> &&
+             std::invocable<InsertCallbackT, LookupKeyT, void*, void*> &&
+             std::invocable<UpdateCallbackT, KeyT&, ValueT&>);
+
+  // Erase a key from the map.
+  template <typename LookupKeyT>
+  auto Erase(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT())
+      -> bool;
+
+  // Clear all key/value pairs from the map but leave the underlying hashtable
+  // allocated and in place.
+  void Clear();
+
+ protected:
+  using ImplT::ImplT;
+};
+
+// A data structure mapping from key to value.
+//
+// This map also supports small size optimization (or "SSO"). The provided
+// `SmallSize` template parameter indicates the size of an embedded buffer for
+// storing maps small enough to fit. The default is zero, which always allocates
+// a heap buffer on construction. When non-zero, it must be a multiple of
+// `MaxGroupSize`, which is currently 16. The library will check that the size is
+// valid and provide an error at compile time if not. We don't automatically
+// select the next multiple or otherwise fit the size to the constraints to make
+// it clear in the code how much memory is used by the SSO buffer.
+//
+// This data structure optimizes heavily for small key types that are cheap to
+// move and even copy. Using large keys or keys that are expensive to copy may
+// create surprising performance bottlenecks. A `std::string` key should be fine
+// with generally small strings, but if some or many strings are large heap
+// allocations the performance of hashtable routines may be unacceptably bad and
+// another data structure or key design is likely preferable.
+//
+// Note that this type should typically not appear on API boundaries; either
+// `MapBase` or `MapView` should be used instead.
+template <typename InputKeyT, typename InputValueT, ssize_t SmallSize = 0,
+          typename InputKeyContextT = DefaultKeyContext>
+class Map : public RawHashtable::TableImpl<
+                MapBase<InputKeyT, InputValueT, InputKeyContextT>, SmallSize> {
+  using BaseT = MapBase<InputKeyT, InputValueT, InputKeyContextT>;
+  using ImplT = RawHashtable::TableImpl<BaseT, SmallSize>;
+
+ public:
+  using KeyT = typename BaseT::KeyT;
+  using ValueT = typename BaseT::ValueT;
+
+  Map() = default;
+  Map(const Map& arg) = default;
+  Map(Map&& arg) noexcept = default;
+
+  // Reset the entire state of the hashtable to as it was when constructed,
+  // throwing away any intervening allocations.
+  void Reset();
+};
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapView<InputKeyT, InputValueT, InputKeyContextT>::Contains(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> bool {
+  return this->LookupEntry(lookup_key, key_context) != nullptr;
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapView<InputKeyT, InputValueT, InputKeyContextT>::Lookup(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> LookupKVResult {
+  return LookupKVResult(this->LookupEntry(lookup_key, key_context));
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapView<InputKeyT, InputValueT, InputKeyContextT>::operator[](
+    LookupKeyT lookup_key) const
+    -> ValueT* requires(std::default_initializable<KeyContextT>) {
+      auto result = Lookup(lookup_key, KeyContextT());
+      return result ? &result.value() : nullptr;
+    }
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename CallbackT>
+void MapView<InputKeyT, InputValueT, InputKeyContextT>::ForEach(
+    CallbackT callback)
+  requires(std::invocable<CallbackT, KeyT&, ValueT&>)
+{
+  this->ForEachEntry(
+      [callback](EntryT& entry) { callback(entry.key(), entry.value()); },
+      [](auto...) {});
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Insert(
+    LookupKeyT lookup_key, ValueT new_v, KeyContextT key_context)
+    -> InsertKVResult {
+  return Insert(
+      lookup_key,
+      [&new_v](LookupKeyT lookup_key, void* key_storage, void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(std::move(new_v));
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename ValueCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Insert(
+    LookupKeyT lookup_key, ValueCallbackT value_cb, KeyContextT key_context)
+    -> InsertKVResult
+  requires(
+      !std::same_as<ValueT, ValueCallbackT> &&
+      std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+{
+  return Insert(
+      lookup_key,
+      [&value_cb](LookupKeyT lookup_key, void* key_storage,
+                  void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(value_cb());
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename InsertCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Insert(
+    LookupKeyT lookup_key, InsertCallbackT insert_cb, KeyContextT key_context)
+    -> InsertKVResult
+  requires(!std::same_as<ValueT, InsertCallbackT> &&
+           std::invocable<InsertCallbackT, LookupKeyT, void*, void*>)
+{
+  auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  CARBON_DCHECK(entry) << "Should always result in a valid index.";
+
+  if (LLVM_LIKELY(!inserted)) {
+    return InsertKVResult(false, *entry);
+  }
+
+  insert_cb(lookup_key, static_cast<void*>(&entry->key_storage),
+            static_cast<void*>(&entry->value_storage));
+  return InsertKVResult(true, *entry);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Update(
+    LookupKeyT lookup_key, ValueT new_v, KeyContextT key_context)
+    -> InsertKVResult {
+  return Update(
+      lookup_key,
+      [&new_v](LookupKeyT lookup_key, void* key_storage, void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(std::move(new_v));
+      },
+      [&new_v](KeyT& /*key*/, ValueT& value) {
+        value.~ValueT();
+        new (&value) ValueT(std::move(new_v));
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename ValueCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Update(
+    LookupKeyT lookup_key, ValueCallbackT value_cb, KeyContextT key_context)
+    -> InsertKVResult
+  requires(
+      !std::same_as<ValueT, ValueCallbackT> &&
+      std::convertible_to<decltype(std::declval<ValueCallbackT>()()), ValueT>)
+{
+  return Update(
+      lookup_key,
+      [&value_cb](LookupKeyT lookup_key, void* key_storage,
+                  void* value_storage) {
+        new (key_storage) KeyT(lookup_key);
+        new (value_storage) ValueT(value_cb());
+      },
+      [&value_cb](KeyT& /*key*/, ValueT& value) {
+        value.~ValueT();
+        new (&value) ValueT(value_cb());
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT, typename InsertCallbackT,
+          typename UpdateCallbackT>
+[[clang::always_inline]] auto
+MapBase<InputKeyT, InputValueT, InputKeyContextT>::Update(
+    LookupKeyT lookup_key, InsertCallbackT insert_cb, UpdateCallbackT update_cb,
+    KeyContextT key_context) -> InsertKVResult
+  requires(!std::same_as<ValueT, InsertCallbackT> &&
+           std::invocable<InsertCallbackT, LookupKeyT, void*, void*> &&
+           std::invocable<UpdateCallbackT, KeyT&, ValueT&>)
+{
+  auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  CARBON_DCHECK(entry) << "Should always result in a valid index.";
+
+  if (LLVM_LIKELY(!inserted)) {
+    update_cb(entry->key(), entry->value());
+    return InsertKVResult(false, *entry);
+  }
+
+  insert_cb(lookup_key, static_cast<void*>(&entry->key_storage),
+            static_cast<void*>(&entry->value_storage));
+  return InsertKVResult(true, *entry);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto MapBase<InputKeyT, InputValueT, InputKeyContextT>::Erase(
+    LookupKeyT lookup_key, KeyContextT key_context) -> bool {
+  return this->EraseImpl(lookup_key, key_context);
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+void MapBase<InputKeyT, InputValueT, InputKeyContextT>::Clear() {
+  this->ClearImpl();
+}
+
+template <typename InputKeyT, typename InputValueT, ssize_t SmallSize,
+          typename InputKeyContextT>
+void Map<InputKeyT, InputValueT, SmallSize, InputKeyContextT>::Reset() {
+  this->ResetImpl();
+}
+
+}  // namespace Carbon
+
+#endif  // CARBON_COMMON_MAP_H_
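
A minimal sketch (not part of the change; function names are hypothetical) of
the API-boundary pattern the comments above describe: read-only access is
passed as a `MapView` by value, mutation as a `MapBase&`, and both erase the
concrete `Map`'s `SmallSize`:

```cpp
#include <new>

#include "common/map.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

// Read-only access crosses the API boundary as a `MapView` passed by value.
auto CountMissing(Carbon::MapView<llvm::StringRef, int> view,
                  llvm::ArrayRef<llvm::StringRef> names) -> int {
  int missing = 0;
  for (llvm::StringRef name : names) {
    missing += static_cast<int>(!view.Contains(name));
  }
  return missing;
}

// Mutation crosses the API boundary as a `MapBase&`, which hides the SSO size
// of the concrete `Map` while keeping the mutable API.
auto RecordName(Carbon::MapBase<llvm::StringRef, int>& map,
                llvm::StringRef name) -> void {
  map.Update(
      name,
      /*insert_cb=*/
      [](llvm::StringRef lookup_key, void* key_storage, void* value_storage) {
        new (key_storage) llvm::StringRef(lookup_key);
        new (value_storage) int(1);
      },
      /*update_cb=*/[](llvm::StringRef& /*key*/, int& count) { ++count; });
}

auto Example() -> bool {
  // The concrete `Map` with its SSO buffer lives at the call site; 16 is a
  // multiple of the required group size.
  Carbon::Map<llvm::StringRef, int, 16> names;
  RecordName(names, "widget");
  RecordName(names, "widget");
  // `names` converts implicitly to the view type for read-only use.
  return CountMissing(names, {"widget", "gadget"}) == 1;
}
```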

+ 480 - 0
common/map_benchmark.cpp

@@ -0,0 +1,480 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <benchmark/benchmark.h>
+
+#include <type_traits>
+
+#include "absl/container/flat_hash_map.h"
+#include "common/map.h"
+#include "common/raw_hashtable_benchmark_helpers.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace Carbon {
+namespace {
+
+using RawHashtable::CarbonHashDI;
+using RawHashtable::GetKeysAndHitKeys;
+using RawHashtable::GetKeysAndMissKeys;
+using RawHashtable::HitArgs;
+using RawHashtable::SizeArgs;
+using RawHashtable::ValueToBool;
+
+// Helpers to synthesize some value of one of the three types we use as value
+// types.
+template <typename T>
+auto MakeValue() -> T {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return "abc";
+  } else if constexpr (std::is_pointer_v<T>) {
+    static std::remove_pointer_t<T> x;
+    return &x;
+  } else {
+    return 42;
+  }
+}
+template <typename T>
+auto MakeValue2() -> T {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return "qux";
+  } else if constexpr (std::is_pointer_v<T>) {
+    static std::remove_pointer_t<T> y;
+    return &y;
+  } else {
+    return 7;
+  }
+}
+
+template <typename MapT>
+struct IsCarbonMapImpl : std::false_type {};
+template <typename KT, typename VT, int MinSmallSize>
+struct IsCarbonMapImpl<Map<KT, VT, MinSmallSize>> : std::true_type {};
+
+template <typename MapT>
+static constexpr bool IsCarbonMap = IsCarbonMapImpl<MapT>::value;
+
+// A wrapper around various map types that we specialize to implement a common
+// API used in the benchmarks for various different map data structures that
+// support different APIs. The primary template assumes a roughly
+// `std::unordered_map` API design, and types with a different API design are
+// supported through specializations.
+template <typename MapT>
+struct MapWrapperImpl {
+  using KeyT = typename MapT::key_type;
+  using ValueT = typename MapT::mapped_type;
+
+  MapT m;
+
+  auto BenchContains(KeyT k) -> bool { return m.find(k) != m.end(); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto it = m.find(k);
+    if (it == m.end()) {
+      return false;
+    }
+    return ValueToBool(it->second);
+  }
+
+  auto BenchInsert(KeyT k, ValueT v) -> bool {
+    auto result = m.insert({k, v});
+    return result.second;
+  }
+
+  auto BenchUpdate(KeyT k, ValueT v) -> bool {
+    auto result = m.insert({k, v});
+    result.first->second = v;
+    return result.second;
+  }
+
+  auto BenchErase(KeyT k) -> bool { return m.erase(k) != 0; }
+};
+
+// Explicit (partial) specialization for the Carbon map type that uses its
+// different API design.
+template <typename KT, typename VT, int MinSmallSize>
+struct MapWrapperImpl<Map<KT, VT, MinSmallSize>> {
+  using MapT = Map<KT, VT, MinSmallSize>;
+  using KeyT = KT;
+  using ValueT = VT;
+
+  MapT m;
+
+  auto BenchContains(KeyT k) -> bool { return m.Contains(k); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto result = m.Lookup(k);
+    if (!result) {
+      return false;
+    }
+    return ValueToBool(result.value());
+  }
+
+  auto BenchInsert(KeyT k, ValueT v) -> bool {
+    auto result = m.Insert(k, v);
+    return result.is_inserted();
+  }
+
+  auto BenchUpdate(KeyT k, ValueT v) -> bool {
+    auto result = m.Update(k, v);
+    return result.is_inserted();
+  }
+
+  auto BenchErase(KeyT k) -> bool { return m.Erase(k); }
+};
+
+// Provide a way to override the Carbon Map specific benchmark runs with another
+// hashtable implementation. When building, you can use one of these enum names
+// in a macro define such as `-DCARBON_MAP_BENCH_OVERRIDE=Name` in order to
+// trigger a specific override for the `Map` type benchmarks. This is used to
+// get before/after runs that compare the performance of Carbon's Map versus
+// other implementations.
+enum class MapOverride : uint8_t {
+  None,
+  Abseil,
+  LLVM,
+  LLVMAndCarbonHash,
+};
+#ifndef CARBON_MAP_BENCH_OVERRIDE
+#define CARBON_MAP_BENCH_OVERRIDE None
+#endif
+
+template <typename MapT, MapOverride Override>
+struct MapWrapperOverride : MapWrapperImpl<MapT> {};
+
+template <typename KeyT, typename ValueT, int MinSmallSize>
+struct MapWrapperOverride<Map<KeyT, ValueT, MinSmallSize>, MapOverride::Abseil>
+    : MapWrapperImpl<absl::flat_hash_map<KeyT, ValueT>> {};
+
+template <typename KeyT, typename ValueT, int MinSmallSize>
+struct MapWrapperOverride<Map<KeyT, ValueT, MinSmallSize>, MapOverride::LLVM>
+    : MapWrapperImpl<llvm::DenseMap<KeyT, ValueT>> {};
+
+template <typename KeyT, typename ValueT, int MinSmallSize>
+struct MapWrapperOverride<Map<KeyT, ValueT, MinSmallSize>,
+                          MapOverride::LLVMAndCarbonHash>
+    : MapWrapperImpl<llvm::DenseMap<KeyT, ValueT, CarbonHashDI<KeyT>>> {};
+
+template <typename MapT>
+using MapWrapper =
+    MapWrapperOverride<MapT, MapOverride::CARBON_MAP_BENCH_OVERRIDE>;
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT, VT)        \
+  BENCHMARK(NAME<Map<KT, VT>>)->Apply(APPLY);                 \
+  BENCHMARK(NAME<absl::flat_hash_map<KT, VT>>)->Apply(APPLY); \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT>>)->Apply(APPLY);      \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT, CarbonHashDI<KT>>>)->Apply(APPLY)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_ONE_OP(NAME, APPLY)                       \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int, int);             \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int*, int*);           \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int, llvm::StringRef); \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, llvm::StringRef, int)
+
+// Benchmark the minimal latency of checking if a key is contained within a map,
+// when it *is* definitely in that map. Because this is only really measuring
+// the *minimal* latency, it is more similar to a throughput benchmark.
+//
+// While this is structured to observe the latency of testing for presence of a
+// key, it is important to understand the reality of what this measures. Because
+// the boolean result testing for whether a key is in a map is fundamentally
+// provided not by accessing some data, but by branching on data to a control
+// flow path which sets the boolean to `true` or `false`, the result can be
+// speculatively provided based on predicting the conditional branch without
+// waiting for the results of the comparison to become available. And because
+// this is a small operation and we arrange for all the candidate keys to be
+// present, that branch *should* be predicted extremely well. The result is that
+// this measures the un-speculated latency of testing for presence which should
+// be small or zero. Which is why this is ultimately more similar to a
+// throughput benchmark.
+//
+// Because of these measurement oddities, the specific measurements here may not
+// be very interesting for predicting real-world performance in any way, but
+// they are useful for comparing how 'cheap' the operation is across changes to
+// the data structure or between similar data structures with similar
+// properties.
+template <typename MapT>
+static void BM_MapContainsHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      // We block optimizing `i` as that has proven both more effective at
+      // blocking the loop from being optimized away and avoiding disruption of
+      // the generated code that we're benchmarking.
+      benchmark::DoNotOptimize(i);
+
+      bool result = m.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      // We use the lookup success to step through keys, establishing a
+      // dependency between each lookup. This doesn't fully allow us to measure
+      // latency rather than throughput, as noted above.
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapContainsHit, HitArgs);
+
+// Similar to `BM_MapContainsHit`, while this is structured as a latency
+// benchmark, the critical path is expected to be well predicted and so it
+// should turn into something closer to a throughput benchmark.
+template <typename MapT>
+static void BM_MapContainsMiss(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] = GetKeysAndMissKeys<KT>(state.range(0));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = m.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(!result);
+      i += static_cast<ssize_t>(!result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapContainsMiss, SizeArgs);
+
+// This is a genuine latency benchmark. We lookup a key in the hashtable and use
+// the value associated with that key in the critical path of loading the next
+// iteration's key. We still ensure the keys are always present, and so we
+// generally expect the data structure branches to be well predicted. But we
+// vary the keys aggressively to avoid any prediction artifacts from repeatedly
+// examining the same key.
+//
+// This latency can be very helpful for understanding a range of data structure
+// behaviors:
+// - Many users of hashtables are directly dependent on the latency of this
+//   operation, and this micro-benchmark will reflect the expected latency for
+//   them.
+// - Showing how latency varies across different sizes of table and different
+//   fractions of the table being accessed (and thus needing space in the
+//   cache).
+//
+// However, it remains an ultimately synthetic and unrepresentative benchmark.
+// It should primarily be used to understand the relative cost of these
+// operations between versions of the data structure or between related data
+// structures.
+//
+// We vary both the number of entries in the table and the number of distinct
+// keys used when doing lookups. As the table becomes large, the latter dictates
+// the fraction of the table that will be accessed and thus the working set size
+// of the benchmark. Querying the same small number of keys in even a large
+// table doesn't actually encounter any cache pressure, so only a few of these
+// benchmarks will show any effects of the caching subsystem.
+template <typename MapT>
+static void BM_MapLookupHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = m.BenchLookup(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapLookupHit, HitArgs);
+
+// This is an update throughput benchmark in practice. While whether the key was
+// a hit is kept in the critical path, we only use keys that are hits and so
+// expect that to be fully predicted and speculated.
+//
+// However, we expect this fairly closely matches how user code interacts with
+// an update-style API. It will have some conditional testing (even if just an
+// assert) on whether the key was a hit and otherwise continue executing. As a
+// consequence the actual update is expected to not be in a meaningful critical
+// path.
+//
+// This still provides a basic way to measure the cost of this operation,
+// especially when comparing between implementations or across different hash
+// tables.
+template <typename MapT>
+static void BM_MapUpdateHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size; ++i) {
+      benchmark::DoNotOptimize(i);
+
+      bool inserted = m.BenchUpdate(lookup_keys[i], MakeValue2<VT>());
+      CARBON_DCHECK(!inserted);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapUpdateHit, HitArgs);
+
+// First erase and then insert the key. The code path will always be the same
+// here and so we expect this to largely be a throughput benchmark because of
+// branch prediction and speculative execution.
+//
+// We don't expect erase followed by insertion to be a common user code
+// sequence, but we don't have a good way of benchmarking either erase or insert
+// in isolation -- each would change the size of the table and thus the next
+// iteration's benchmark. And if we try to correct the table size outside of the
+// timed region, we end up trying to exclude too fine grained of a region from
+// timers to get good measurement data.
+//
+// Our solution is to benchmark both erase and insertion back to back. We can
+// then get a good profile of the code sequence of each, and at least measure
+// the sum cost of these reliably. Careful profiling can help attribute that
+// cost between erase and insert in order to understand which of the two
+// operations is contributing most to any performance artifacts observed.
+template <typename MapT>
+static void BM_MapEraseUpdateHit(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  MapWrapperT m;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    m.BenchInsert(k, MakeValue<VT>());
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size; ++i) {
+      benchmark::DoNotOptimize(i);
+
+      m.BenchErase(lookup_keys[i]);
+      benchmark::ClobberMemory();
+
+      bool inserted = m.BenchUpdate(lookup_keys[i], MakeValue2<VT>());
+      CARBON_DCHECK(inserted);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapEraseUpdateHit, HitArgs);
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_OP_SEQ_SIZE(NAME, KT, VT)                  \
+  BENCHMARK(NAME<Map<KT, VT>>)->Apply(SizeArgs);                 \
+  BENCHMARK(NAME<absl::flat_hash_map<KT, VT>>)->Apply(SizeArgs); \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT>>)->Apply(SizeArgs);      \
+  BENCHMARK(NAME<llvm::DenseMap<KT, VT, CarbonHashDI<KT>>>)->Apply(SizeArgs)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_OP_SEQ(NAME)                       \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int, int);             \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int*, int*);           \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int, llvm::StringRef); \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, llvm::StringRef, int)
+
+// This is an interesting, somewhat specialized benchmark that measures the cost
+// of inserting a sequence of key/value pairs into a table with no collisions up
+// to some size and then inserting a colliding key and throwing away the table.
+//
+// This can give an idea of the cost of building up a map of a particular size,
+// but without actually using it, or of algorithms like cycle detection that
+// happen to need an associative container.
+//
+// It also covers both the insert-into-an-empty-slot code path that isn't
+// covered elsewhere, and the code path for growing a table to a larger size.
+//
+// Because this benchmark operates on whole maps, we also compute the number of
+// probed keys for Carbon's map as that is both a general reflection of the
+// efficacy of the underlying hash function, and a direct factor that drives the
+// cost of these operations.
+template <typename MapT>
+static void BM_MapInsertSeq(benchmark::State& state) {
+  using MapWrapperT = MapWrapper<MapT>;
+  using KT = typename MapWrapperT::KeyT;
+  using VT = typename MapWrapperT::ValueT;
+  constexpr ssize_t LookupKeysSize = 1 << 8;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), LookupKeysSize);
+
+  // Note that we don't force batches that use all the lookup keys because
+  // there's no difference in cache usage by covering all the different lookup
+  // keys.
+  ssize_t i = 0;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(i);
+
+    MapWrapperT m;
+    for (auto k : keys) {
+      bool inserted = m.BenchInsert(k, MakeValue<VT>());
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // Now insert a final random repeated key.
+    bool inserted = m.BenchInsert(lookup_keys[i], MakeValue2<VT>());
+    CARBON_DCHECK(!inserted) << "Must already be in the map!";
+
+    // Rotate through the shuffled keys.
+    i = (i + static_cast<ssize_t>(!inserted)) & (LookupKeysSize - 1);
+  }
+
+  // It can be easier in some cases to think of this as a key-throughput rate of
+  // insertion rather than the latency of inserting N keys, so construct the
+  // rate counter as well.
+  state.counters["KeyRate"] = benchmark::Counter(
+      keys.size(), benchmark::Counter::kIsIterationInvariantRate);
+
+  // Report some extra statistics about the Carbon type.
+  if constexpr (IsCarbonMap<MapT>) {
+    // Re-build a map outside of the timing loop to look at the statistics
+    // rather than the timing.
+    MapT m;
+    for (auto k : keys) {
+      bool inserted = m.Insert(k, MakeValue<VT>()).is_inserted();
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // While this count is "iteration invariant" (it should be exactly the same
+    // for every iteration as the set of keys is the same), we don't use that
+    // because it will scale this by the number of iterations. We want to
+    // display the probe count of this benchmark *parameter*, not the probe
+    // count that resulted from the number of iterations. That means we use the
+    // normal counter API without flags.
+    state.counters["Probed"] = m.CountProbedKeys();
+
+    // Uncomment this call to print out statistics about the index-collisions
+    // among these keys for debugging:
+    //
+    // RawHashtable::DumpHashStatistics(keys);
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_MapInsertSeq, SizeArgs);
+
+}  // namespace
+}  // namespace Carbon

+ 12 - 0
common/map_benchmark_test.sh

@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/map_benchmark"
+
+exec "$BENCHMARK" \
+  --benchmark_counters_tabular=true \
+  --benchmark_min_time=1x \
+  --benchmark_filter='^[^/]*/[1-9][0-9]{0,3}(/[0-9]+)?$'

+ 627 - 0
common/map_test.cpp

@@ -0,0 +1,627 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/map.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <initializer_list>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "common/raw_hashtable_test_helpers.h"
+
+namespace Carbon::Testing {
+namespace {
+
+using RawHashtable::FixedHashKeyContext;
+using RawHashtable::IndexKeyContext;
+using RawHashtable::TestData;
+using RawHashtable::TestKeyContext;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAreArray;
+
+template <typename MapT, typename MatcherRangeT>
+void ExpectMapElementsAre(MapT&& m, MatcherRangeT element_matchers) {
+  // Now collect the elements into a container.
+  using KeyT = typename std::remove_reference<MapT>::type::KeyT;
+  using ValueT = typename std::remove_reference<MapT>::type::ValueT;
+  std::vector<std::pair<KeyT, ValueT>> map_entries;
+  m.ForEach([&map_entries](KeyT& k, ValueT& v) {
+    map_entries.push_back({k, v});
+  });
+
+  // Use the GoogleMock unordered container matcher to validate and show errors
+  // on wrong elements.
+  EXPECT_THAT(map_entries, UnorderedElementsAreArray(element_matchers));
+}
+
+// Allow directly using an initializer list.
+template <typename MapT, typename MatcherT>
+void ExpectMapElementsAre(MapT&& m,
+                          std::initializer_list<MatcherT> element_matchers) {
+  std::vector<MatcherT> element_matchers_storage = element_matchers;
+  ExpectMapElementsAre(m, element_matchers_storage);
+}
+
+template <typename ValueCB, typename RangeT, typename... RangeTs>
+auto MakeKeyValues(ValueCB value_cb, RangeT&& range, RangeTs&&... ranges) {
+  using KeyT = typename RangeT::value_type;
+  using ValueT = decltype(value_cb(std::declval<KeyT>()));
+  std::vector<std::pair<KeyT, ValueT>> elements;
+  auto add_range = [&](RangeT&& r) {
+    for (const auto&& e : r) {
+      elements.push_back({e, value_cb(e)});
+    }
+  };
+  add_range(std::forward<RangeT>(range));
+  (add_range(std::forward<RangeT>(ranges)), ...);
+
+  return elements;
+}
+
+template <typename MapT>
+class MapTest : public ::testing::Test {};
+
+using Types = ::testing::Types<
+    Map<int, int>, Map<int, int, 16>, Map<int, int, 64>,
+    Map<int, int, 0, TestKeyContext>, Map<int, int, 16, TestKeyContext>,
+    Map<int, int, 64, TestKeyContext>, Map<TestData, TestData>,
+    Map<TestData, TestData, 16>, Map<TestData, TestData, 0, TestKeyContext>,
+    Map<TestData, TestData, 16, TestKeyContext>>;
+TYPED_TEST_SUITE(MapTest, Types);
+
+TYPED_TEST(MapTest, Basic) {
+  TypeParam m;
+
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+  EXPECT_TRUE(m.Insert(1, 100).is_inserted());
+  ASSERT_TRUE(m.Contains(1));
+  auto result = m.Lookup(1);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  EXPECT_EQ(100, result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Reinsertion doesn't change the value.
+  auto i_result = m.Insert(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(100, i_result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Update does change the value.
+  i_result = m.Update(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+  EXPECT_EQ(101, *m[1]);
+
+  // Verify all the elements.
+  ExpectMapElementsAre(m, {Pair(1, 101)});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+
+    // Immediately do a basic check of all elements to pin down when an
+    // insertion corrupts the rest of the table.
+    ExpectMapElementsAre(
+        m,
+        MakeKeyValues([](int k) { return k * 100 + static_cast<int>(k == 1); },
+                      llvm::seq_inclusive(1, i)));
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(513));
+
+  // Verify all the elements.
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 512)));
+}
+
+TYPED_TEST(MapTest, FactoryAPI) {
+  TypeParam m;
+  EXPECT_TRUE(m.Insert(1, [] { return 100; }).is_inserted());
+  ASSERT_TRUE(m.Contains(1));
+  EXPECT_EQ(100, *m[1]);
+  // Reinsertion doesn't invoke the callback.
+  EXPECT_FALSE(m.Insert(1, []() -> int {
+                  llvm_unreachable("Should never be called!");
+                }).is_inserted());
+  // Update does invoke the callback.
+  auto i_result = m.Update(1, [] { return 101; });
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+  EXPECT_EQ(101, *m[1]);
+}
+
+TYPED_TEST(MapTest, Copy) {
+  using MapT = TypeParam;
+
+  MapT m;
+  // Make sure we exceed the small size for some of the map types, but not all
+  // of them, so we cover all the combinations of copying between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+
+  MapT other_m1 = m;
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24)));
+
+  // Add some more elements to the original.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+
+  // The first copy doesn't change.
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24)));
+
+  // A new copy does.
+  MapT other_m2 = m;
+  ExpectMapElementsAre(
+      other_m2, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 32)));
+}
+
+TYPED_TEST(MapTest, Move) {
+  using MapT = TypeParam;
+
+  MapT m;
+  // Make sure we exceed the small size for some of the map types, but not all
+  // of them, so we cover all the combinations of moving between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+
+  MapT other_m1 = std::move(m);
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 24)));
+
+  // Add some more elements.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(other_m1.Insert(i, i * 100).is_inserted());
+  }
+  ExpectMapElementsAre(
+      other_m1, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 32)));
+}
+
+TYPED_TEST(MapTest, Conversions) {
+  using MapT = TypeParam;
+  using KeyT = MapT::KeyT;
+  using ValueT = MapT::ValueT;
+  using KeyContextT = MapT::KeyContextT;
+
+  MapT m;
+
+  ASSERT_TRUE(m.Insert(1, 101).is_inserted());
+  ASSERT_TRUE(m.Insert(2, 102).is_inserted());
+  ASSERT_TRUE(m.Insert(3, 103).is_inserted());
+  ASSERT_TRUE(m.Insert(4, 104).is_inserted());
+
+  MapView<KeyT, ValueT, KeyContextT> mv = m;
+  MapView<const KeyT, ValueT, KeyContextT> cmv = m;
+  MapView<KeyT, const ValueT, KeyContextT> cmv2 = m;
+  MapView<const KeyT, const ValueT, KeyContextT> cmv3 = m;
+  EXPECT_TRUE(mv.Contains(1));
+  EXPECT_EQ(101, *mv[1]);
+  EXPECT_TRUE(cmv.Contains(2));
+  EXPECT_EQ(102, *cmv[2]);
+  EXPECT_TRUE(cmv2.Contains(3));
+  EXPECT_EQ(103, *cmv2[3]);
+  EXPECT_TRUE(cmv3.Contains(4));
+  EXPECT_EQ(104, *cmv3[4]);
+}
+
+// This test is largely exercising the underlying `RawHashtable` implementation
+// with complex growth, erasure, and re-growth.
+TYPED_TEST(MapTest, ComplexOpSequence) {
+  // Use a small size as well to cover more growth scenarios.
+  TypeParam m;
+
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+  EXPECT_TRUE(m.Insert(1, 100).is_inserted());
+  ASSERT_TRUE(m.Contains(1));
+  auto result = m.Lookup(1);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  EXPECT_EQ(100, result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Reinsertion doesn't change the value.
+  auto i_result = m.Insert(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(100, i_result.value());
+  EXPECT_EQ(100, *m[1]);
+  // Update does change the value.
+  i_result = m.Update(1, 101);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+  EXPECT_EQ(101, *m[1]);
+
+  // Verify all the elements.
+  ExpectMapElementsAre(m, {Pair(1, 101)});
+
+  // Fill up the small buffer but don't overflow it.
+  for (int i : llvm::seq(2, 5)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(1, 5)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1), *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(5));
+
+  // Verify all the elements.
+  ExpectMapElementsAre(
+      m, {Pair(1, 101), Pair(2, 201), Pair(3, 301), Pair(4, 401)});
+
+  // Erase some entries from the small buffer.
+  EXPECT_FALSE(m.Erase(42));
+  EXPECT_TRUE(m.Erase(2));
+  EXPECT_EQ(101, *m[1]);
+  EXPECT_EQ(nullptr, m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(401, *m[4]);
+  EXPECT_TRUE(m.Erase(1));
+  EXPECT_EQ(nullptr, m[1]);
+  EXPECT_EQ(nullptr, m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(401, *m[4]);
+  EXPECT_TRUE(m.Erase(4));
+  EXPECT_EQ(nullptr, m[1]);
+  EXPECT_EQ(nullptr, m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(nullptr, m[4]);
+  // Fill them back in, but with a different order and going back to the
+  // original value.
+  EXPECT_TRUE(m.Insert(1, 100).is_inserted());
+  EXPECT_TRUE(m.Insert(2, 200).is_inserted());
+  EXPECT_TRUE(m.Insert(4, 400).is_inserted());
+  EXPECT_EQ(100, *m[1]);
+  EXPECT_EQ(200, *m[2]);
+  EXPECT_EQ(301, *m[3]);
+  EXPECT_EQ(400, *m[4]);
+  // Then update their values to match.
+  EXPECT_FALSE(m.Update(1, 101).is_inserted());
+  EXPECT_FALSE(m.Update(2, 201).is_inserted());
+  EXPECT_FALSE(m.Update(4, 401).is_inserted());
+
+  // Now fill up the first metadata group.
+  for (int i : llvm::seq(5, 14)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(1, 14)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + static_cast<int>(i < 5), *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i < 5), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 2, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(42));
+
+  // Verify all the elements by walking the entire map.
+  ExpectMapElementsAre(
+      m, {Pair(1, 102), Pair(2, 202), Pair(3, 302), Pair(4, 402), Pair(5, 502),
+          Pair(6, 602), Pair(7, 702), Pair(8, 802), Pair(9, 902),
+          Pair(10, 1002), Pair(11, 1102), Pair(12, 1202), Pair(13, 1302)});
+
+  // Now fill up several more groups.
+  for (int i : llvm::seq(14, 100)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(1, 100)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + 2 * static_cast<int>(i < 14), *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 2 * static_cast<int>(i < 14), *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 3).is_inserted());
+    EXPECT_EQ(i * 100 + 3, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(420));
+
+  // Check walking the entire container.
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 3; }, llvm::seq(1, 100)));
+
+  // Clear back to empty.
+  m.Clear();
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+
+  // Refill but with both overlapping and different values.
+  for (int i : llvm::seq(50, 150)) {
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(50, 150)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_FALSE(m.Contains(420));
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(50, 150)));
+
+  EXPECT_FALSE(m.Erase(42));
+  EXPECT_TRUE(m.Contains(73));
+  EXPECT_TRUE(m.Erase(73));
+  EXPECT_FALSE(m.Contains(73));
+  for (int i : llvm::seq(102, 136)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Contains(i));
+    EXPECT_TRUE(m.Erase(i));
+    EXPECT_FALSE(m.Contains(i));
+  }
+  for (int i : llvm::seq(50, 150)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    if (i == 73 || (i >= 102 && i < 136)) {
+      continue;
+    }
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 2, *m[i]);
+  }
+  EXPECT_TRUE(m.Insert(73, 73 * 100 + 3).is_inserted());
+  EXPECT_EQ(73 * 100 + 3, *m[73]);
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 2 + (k == 73); },
+                       llvm::seq(50, 102), llvm::seq(136, 150)));
+
+  // Reset back to empty and small.
+  m.Reset();
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_EQ(nullptr, m[42]);
+
+  // Refill but with both overlapping and different values, now triggering
+  // growth too. Also, use update instead of insert.
+  for (int i : llvm::seq(75, 175)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Update(i, i * 100).is_inserted());
+  }
+  for (int i : llvm::seq(75, 175)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 1).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+  }
+  EXPECT_FALSE(m.Contains(42));
+  EXPECT_FALSE(m.Contains(420));
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(75, 175)));
+
+  EXPECT_FALSE(m.Erase(42));
+  EXPECT_TRUE(m.Contains(93));
+  EXPECT_TRUE(m.Erase(93));
+  EXPECT_FALSE(m.Contains(93));
+  for (int i : llvm::seq(102, 136)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Contains(i));
+    EXPECT_TRUE(m.Erase(i));
+    EXPECT_FALSE(m.Contains(i));
+  }
+  for (int i : llvm::seq(75, 175)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    if (i == 93 || (i >= 102 && i < 136)) {
+      continue;
+    }
+    ASSERT_TRUE(m.Contains(i));
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Insert(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 1, *m[i]);
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+    EXPECT_EQ(i * 100 + 2, *m[i]);
+  }
+  EXPECT_TRUE(m.Insert(93, 93 * 100 + 3).is_inserted());
+  EXPECT_EQ(93 * 100 + 3, *m[93]);
+
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 2 + (k == 93); },
+                       llvm::seq(75, 102), llvm::seq(136, 175)));
+}
+
+template <typename MapT>
+class MapCollisionTest : public ::testing::Test {};
+
+using CollisionTypes = ::testing::Types<
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ true, /*FixTagBits*/ false, 0>>,
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ false, /*FixTagBits*/ true, 0>>,
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ true, /*FixTagBits*/ true, 0>>,
+    Map<int, int, 16,
+        FixedHashKeyContext<7, /*FixIndexBits*/ true, /*FixTagBits*/ true,
+                            ~static_cast<uint64_t>(0)>>>;
+TYPED_TEST_SUITE(MapCollisionTest, CollisionTypes);
+
+TYPED_TEST(MapCollisionTest, Basic) {
+  TypeParam m;
+
+  // Fill the map through a couple of growth steps, verifying at each step. Note
+  // that because this is a collision test, we synthesize actively harmful
+  // hashes in terms of collisions and so this test is essentially quadratic. We
+  // need to keep it relatively small.
+  for (int i : llvm::seq(1, 256)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100).is_inserted());
+
+    // Immediately do a basic check of all elements to pin down when an
+    // insertion corrupts the rest of the table.
+    ExpectMapElementsAre(m, MakeKeyValues([](int k) { return k * 100; },
+                                          llvm::seq_inclusive(1, i)));
+  }
+  EXPECT_FALSE(m.Contains(257));
+
+  // Erase and re-fill from the back.
+  for (int i : llvm::seq(192, 256)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Erase(i));
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100; }, llvm::seq(1, 192)));
+  for (int i : llvm::seq(192, 256)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted());
+  }
+  ExpectMapElementsAre(m,
+                       MakeKeyValues([](int k) { return k * 100 + (k >= 192); },
+                                     llvm::seq(1, 256)));
+
+  // Erase and re-fill from the front.
+  for (int i : llvm::seq(1, 64)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Erase(i));
+  }
+  ExpectMapElementsAre(m,
+                       MakeKeyValues([](int k) { return k * 100 + (k >= 192); },
+                                     llvm::seq(64, 256)));
+  for (int i : llvm::seq(1, 64)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted());
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + (k < 64) + (k >= 192); },
+                       llvm::seq(1, 256)));
+
+  // Erase and re-fill from the middle.
+  for (int i : llvm::seq(64, 192)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Erase(i));
+  }
+  ExpectMapElementsAre(m, MakeKeyValues([](int k) { return k * 100 + 1; },
+                                        llvm::seq(1, 64), llvm::seq(192, 256)));
+  for (int i : llvm::seq(64, 192)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100 + 1).is_inserted());
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 256)));
+
+  // Erase and re-fill from both the back and front.
+  for (auto s : {llvm::seq(192, 256), llvm::seq(1, 64)}) {
+    for (int i : s) {
+      SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+      EXPECT_TRUE(m.Erase(i));
+    }
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(64, 192)));
+  for (auto s : {llvm::seq(192, 256), llvm::seq(1, 64)}) {
+    for (int i : s) {
+      SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+      EXPECT_TRUE(m.Insert(i, i * 100 + 2).is_inserted());
+    }
+  }
+  ExpectMapElementsAre(
+      m,
+      MakeKeyValues([](int k) { return k * 100 + 1 + (k < 64) + (k >= 192); },
+                    llvm::seq(1, 256)));
+
+  // And update the middle elements in place.
+  for (int i : llvm::seq(64, 192)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_FALSE(m.Update(i, i * 100 + 2).is_inserted());
+  }
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 2; }, llvm::seq(1, 256)));
+}
+
+TEST(MapContextTest, Basic) {
+  llvm::SmallVector<TestData> keys;
+  for (int i : llvm::seq(0, 513)) {
+    keys.push_back(i * 100000);
+  }
+  IndexKeyContext<TestData> key_context(keys);
+  Map<ssize_t, int, 0, IndexKeyContext<TestData>> m;
+
+  EXPECT_FALSE(m.Contains(42, key_context));
+  EXPECT_TRUE(m.Insert(1, 100, key_context).is_inserted());
+  ASSERT_TRUE(m.Contains(1, key_context));
+  auto result = m.Lookup(TestData(100000), key_context);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  EXPECT_EQ(100, result.value());
+  // Reinsertion doesn't change the value. Also, double check a temporary
+  // context.
+  auto i_result = m.Insert(1, 101, IndexKeyContext<TestData>(keys));
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(100, i_result.value());
+  // Update does change the value.
+  i_result = m.Update(1, 101, key_context);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_EQ(101, i_result.value());
+
+  // Verify all the elements.
+  ExpectMapElementsAre(m, {Pair(1, 101)});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(m.Insert(i, i * 100, key_context).is_inserted());
+
+    // Immediately do a basic check of all elements to pin down when an
+    // insertion corrupts the rest of the table.
+    for (int j : llvm::seq(1, i)) {
+      SCOPED_TRACE(llvm::formatv("Assert key: {0}", j).str());
+      ASSERT_EQ(j * 100 + static_cast<int>(j == 1),
+                m.Lookup(j, key_context).value());
+      ASSERT_EQ(j * 100 + static_cast<int>(j == 1),
+                m.Lookup(TestData(j * 100000), key_context).value());
+    }
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_FALSE(m.Insert(i, i * 100 + 1, key_context).is_inserted());
+    EXPECT_EQ(i * 100 + static_cast<int>(i == 1),
+              m.Lookup(i, key_context).value());
+    EXPECT_FALSE(m.Update(i, i * 100 + 1, key_context).is_inserted());
+    EXPECT_EQ(i * 100 + 1, m.Lookup(i, key_context).value());
+  }
+  EXPECT_FALSE(m.Contains(0, key_context));
+  EXPECT_FALSE(m.Contains(512, key_context));
+
+  // Verify all the elements.
+  ExpectMapElementsAre(
+      m, MakeKeyValues([](int k) { return k * 100 + 1; }, llvm::seq(1, 512)));
+}
+
+}  // namespace
+}  // namespace Carbon::Testing

+ 1 - 0
common/ostream.h

@@ -54,6 +54,7 @@ class Printable {
 
 // Returns the result of printing the value.
 template <typename T>
+  requires std::derived_from<T, Printable<T>>
 inline auto PrintToString(const T& val) -> std::string {
   std::string str;
   llvm::raw_string_ostream stream(str);

+ 11 - 0
common/raw_hashtable.cpp

@@ -0,0 +1,11 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/raw_hashtable.h"
+
+namespace Carbon::RawHashtable {
+
+volatile std::byte global_addr_seed{1};
+
+}  // namespace Carbon::RawHashtable

+ 1336 - 0
common/raw_hashtable.h

@@ -0,0 +1,1336 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_H_
+#define CARBON_COMMON_RAW_HASHTABLE_H_
+
+#include <algorithm>
+#include <concepts>
+#include <cstddef>
+#include <cstring>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include "common/check.h"
+#include "common/hashing.h"
+#include "common/hashtable_key_context.h"
+#include "common/raw_hashtable_metadata_group.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+
+// A namespace collecting a set of low-level utilities for building hashtable
+// data structures. These should only be used as implementation details of
+// higher-level data-structure APIs.
+//
+// The utilities here use the `hashtable_key_context.h` provided `KeyContext` to
+// support the necessary hashtable operations on keys: hashing and comparison.
+// This also serves as the customization point for hashtables built on this
+// infrastructure for those operations. See that header file for details.
+//
+// These utilities support hashtables following a *specific* API design pattern,
+// and using Small-Size Optimization, or "SSO", when desired. We expect there to
+// be three layers to any hashtable design:
+//
+// - A *view* type: a read-only view of the hashtable contents. This type should
+//   be a value type and is expected to be passed by-value in APIs. However, it
+//   will have `const`-reference semantics, much like a `std::string_view`. Note
+//   that the *entries* will continue to be mutable; it is only the *table*
+//   structure that is read-only.
+//
+// - A *base* type: a base class type of the actual hashtable, which allows
+//   almost all mutable operations but erases any specific SSO buffer size.
+//   Because this is a base of the actual hash table, it is designed to be
+//   passed as a non-`const` reference or pointer.
+//
+// - A *table* type: the actual hashtable which derives from the base type and
+//   adds any desired SSO storage buffer. Beyond the physical storage, it also
+//   allows resetting the table to its initial state and allocated size, as well
+//   as copying and moving the table.
+//
+// For complete examples of the API design, see `set.h` for a hashtable-based
+// set data structure, and `map.h` for a hashtable-based map data structure.
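+//
+// As a minimal sketch of how the three layers are intended to be used (the
+// names here follow `map.h`, and the SSO size of 16 is illustrative):
+//
+//   Map<int, int, 16> table;          // Owns a 16-entry SSO buffer.
+//   MapBase<int, int>& base = table;  // Mutable, SSO buffer size erased.
+//   MapView<int, int> view = table;   // Read-only, passed by value.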
+//
+// The hashtable design implemented here has several key invariants and design
+// elements that are essential to all three of the types above and the
+// functionality they provide.
+//
+// - The underlying hashtable uses [open addressing], a power-of-two table size,
+//   and quadratic probing rather than closed addressing and chaining.
+//
+//   [open addressing]: https://en.wikipedia.org/wiki/Open_addressing
+//
+// - Each _slot_ in the table corresponds to a key, a value, and one byte of
+//   metadata. Each _entry_ is a key and value. The key and value for an entry
+//   are stored together.
+//
+// - The allocated storage is organized into an array of metadata bytes followed
+//   by an array of entry storage.
+//
+// - The metadata byte corresponding to each entry marks whether that entry is
+//   empty, deleted, or present. When present, the byte's remaining 7 bits
+//   store a tag drawn from another 7 bits of the entry key's hash (see the
+//   sketch at the end of this overview).
+//
+// - The storage for an entry is an internal type that should not be exposed to
+//   users; only the underlying keys and values should be exposed.
+//
+// - The hash addressing and probing occurs over *groups* of slots rather than
+//   individual entries. When inserting a new entry, it can be added to the
+//   group it hashes to as long as it is not full, and can even replace a slot with
+//   a tombstone indicating a previously deleted entry. Only when the group is
+//   full will it look at the next group in the probe sequence. As a result,
+//   there may be entries in a group where a different group is the start of
+//   that entry's probe sequence. Also, when performing a lookup, every group in
+//   the probe sequence must be inspected for the lookup key until it is found
+//   or the group has an empty slot.
+//
+// - Groups are scanned rapidly using the one-byte metadata for each entry in
+//   the group and CPU instructions that allow comparing all of the metadata for
+//   a group in parallel. For more details on the metadata group encoding and
+//   scanning, see `raw_hashtable_metadata_group.h`.
+//
+// - `GroupSize` is a platform-specific relatively small power of two that fits
+//   in some hardware register. However, `MaxGroupSize` is provided as a
+//   portable max that is also a power of two. The table storage, whether
+//   provided by an SSO buffer or allocated, is required to be a multiple of
+//   `MaxGroupSize` to keep the requirement portable but sufficient for all
+//   platforms.
+//
+// - There is *always* an allocated table of some multiple of `MaxGroupSize`.
+//   This allows accesses to be branchless. When heap allocated, we proactively
+//   allocate a table of at least a minimum size. When there is a small-size
+//   optimization (SSO) buffer, that provides the initial allocation.
+//
+// - The table performs a minimal amount of bookkeeping that limits the APIs it
+//   can support:
+//    - `alloc_size` is the size of the table *allocated* (not *used*), and is
+//       always a power of 2 at least as big as `MinAllocatedSize`.
+//    - `storage` is a pointer to the storage for the `alloc_size` slots of the
+//       table, and never null.
+//    - `small_alloc_size` is the maximum `alloc_size` where the table is stored
+//       in the object itself instead of separately on the heap. In this case,
+//       `storage` points to `small_storage_`.
+//    - `growth_budget` is the number of entries that may be added before the
+//       table allocation is doubled. It is always
+//       `GrowthThresholdForAllocSize(alloc_size)` minus the number of
+//       non-empty (filled or deleted) slots. If it ever falls to 0, the table
+//       is grown to keep it greater than 0.
+//   There is also a "moved-from" state, in which `alloc_size` is 0 and
+//   `storage` is null; a moved-from table may only be reinitialized or
+//   destroyed. Since the table doesn't track the exact number of filled
+//   entries, it doesn't support a container-style `size` API.
+//
+// - There is no direct iterator support because of the complexity of embedding
+//   the group-based metadata scanning into an iterator model. Instead, there is
+//   just a for-each method that is passed a lambda to observe all entries. The
+//   order of this observation is also not guaranteed.
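+//
+// As a small illustration of the metadata encoding described above (the
+// helpers referenced here are `HashCode::ExtractIndexAndTag` from `hashing.h`
+// and the constants in `raw_hashtable_metadata_group.h`), a present slot's
+// metadata byte is formed from the key's hash roughly as:
+//
+//   HashCode hash = key_context.HashKey(key, ComputeSeed());
+//   auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+//   metadata[index] = tag | MetadataGroup::PresentMask;
+//
+// while empty and deleted slots use the two special non-present byte values.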
+namespace Carbon::RawHashtable {
+
+// If allocating storage, allocate a minimum of one cacheline of group metadata
+// or a minimum of one group, whichever is larger.
+constexpr ssize_t MinAllocatedSize = std::max<ssize_t>(64, MaxGroupSize);
+
+// An entry in the hashtable storage, holding a `KeyT` and a `ValueT` object.
+//
+// Allows manual construction, destruction, and access to these values so we can
+// create arrays of the entries prior to populating them with actual keys and
+// values.
+template <typename KeyT, typename ValueT>
+struct StorageEntry {
+  static constexpr bool IsTriviallyDestructible =
+      std::is_trivially_destructible_v<KeyT> &&
+      std::is_trivially_destructible_v<ValueT>;
+
+  static constexpr bool IsTriviallyRelocatable =
+      IsTriviallyDestructible && std::is_trivially_move_constructible_v<KeyT> &&
+      std::is_trivially_move_constructible_v<ValueT>;
+
+  auto key() const -> const KeyT& {
+    // Ensure we don't need more alignment than available. Inside a method body
+    // to apply to the complete type.
+    static_assert(
+        alignof(StorageEntry) <= MinAllocatedSize,
+        "The minimum allocated size turns into the alignment of our array of "
+        "storage entries as they follow the metadata byte array.");
+
+    return *std::launder(reinterpret_cast<const KeyT*>(&key_storage));
+  }
+  auto key() -> KeyT& {
+    return const_cast<KeyT&>(const_cast<const StorageEntry*>(this)->key());
+  }
+
+  auto value() const -> const ValueT& {
+    return *std::launder(reinterpret_cast<const ValueT*>(&value_storage));
+  }
+  auto value() -> ValueT& {
+    return const_cast<ValueT&>(const_cast<const StorageEntry*>(this)->value());
+  }
+
+  // We handle destruction and move manually as we only want to expose distinct
+  // `KeyT` and `ValueT` subobjects to user code that may need to do in-place
+  // construction. As a consequence, this struct only provides the storage and
+  // we have to manually manage the construction, move, and destruction of the
+  // objects.
+  auto Destroy() -> void {
+    static_assert(!IsTriviallyDestructible,
+                  "Should never instantiate when trivial!");
+    key().~KeyT();
+    value().~ValueT();
+  }
+
+  auto CopyFrom(const StorageEntry& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(entry.key());
+      new (&value_storage) ValueT(entry.value());
+    }
+  }
+
+  // Move from an expiring entry and destroy that entry's key and value.
+  // Optimizes to directly use `memcpy` when correct.
+  auto MoveFrom(StorageEntry&& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(std::move(entry.key()));
+      entry.key().~KeyT();
+      new (&value_storage) ValueT(std::move(entry.value()));
+      entry.value().~ValueT();
+    }
+  }
+
+  alignas(KeyT) std::byte key_storage[sizeof(KeyT)];
+  alignas(ValueT) std::byte value_storage[sizeof(ValueT)];
+};
+
+// A specialization of the storage entry for sets without a distinct value type.
+// Somewhat duplicative of the key-value version, but C++ specialization rules
+// make it hard to avoid the duplication.
+template <typename KeyT>
+struct StorageEntry<KeyT, void> {
+  static constexpr bool IsTriviallyDestructible =
+      std::is_trivially_destructible_v<KeyT>;
+
+  static constexpr bool IsTriviallyRelocatable =
+      IsTriviallyDestructible && std::is_trivially_move_constructible_v<KeyT>;
+
+  auto key() const -> const KeyT& {
+    // Ensure we don't need more alignment than available.
+    static_assert(
+        alignof(StorageEntry) <= MinAllocatedSize,
+        "The minimum allocated size turns into the alignment of our array of "
+        "storage entries as they follow the metadata byte array.");
+
+    return *std::launder(reinterpret_cast<const KeyT*>(&key_storage));
+  }
+  auto key() -> KeyT& {
+    return const_cast<KeyT&>(const_cast<const StorageEntry*>(this)->key());
+  }
+
+  auto Destroy() -> void {
+    static_assert(!IsTriviallyDestructible,
+                  "Should never instantiate when trivial!");
+    key().~KeyT();
+  }
+
+  auto CopyFrom(const StorageEntry& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(entry.key());
+    }
+  }
+
+  auto MoveFrom(StorageEntry&& entry) -> void {
+    if constexpr (IsTriviallyRelocatable) {
+      memcpy(this, &entry, sizeof(StorageEntry));
+    } else {
+      new (&key_storage) KeyT(std::move(entry.key()));
+      entry.key().~KeyT();
+    }
+  }
+
+  alignas(KeyT) std::byte key_storage[sizeof(KeyT)];
+};
+
+// A placeholder empty type used to model pointers to the allocated buffer of
+// storage.
+//
+// The allocated storage doesn't have a meaningful static layout -- it consists
+// of an array of metadata groups followed by an array of storage entries.
+// However, we want to be able to type pointers to this storage, and so we use
+// pointers to this placeholder type for that purpose.
+//
+// This is a complete, empty type so that it can be used as a base class of a
+// specific concrete storage type for compile-time sized storage.
+struct Storage {};
+
+// Forward declaration to support friending, see the definition below.
+template <typename KeyT, typename ValueT = void,
+          typename InputKeyContextT = DefaultKeyContext>
+class BaseImpl;
+
+// Implementation helper for defining a read-only view type for a hashtable.
+//
+// A specific user-facing hashtable view type should derive privately from this
+// type, and forward the implementation of its interface to functions in this
+// type.
+//
+// The methods available to user-facing hashtable types are `protected`, and
+// where they are expected to directly map to a public API, they are named with
+// an `Impl` suffix. The suffix ensures derived types don't simply `using` these
+// low-level APIs into their interface, but instead declare their own methods
+// and implement them by forwarding to these APIs. We don't want users to have
+// to read these implementation details to understand their container's API, so
+// none of these methods should be `using`-ed into the user-facing types.
+//
+// Some of the types are just convenience aliases that aren't important to
+// surface to readers as part of the user-facing type's API, so those are
+// reasonable to add via a `using`.
+//
+// Some methods are used by other parts of the raw hashtable implementation.
+// Those are kept `private` and where necessary the other components of the raw
+// hashtable implementation are friended to give access to them.
+template <typename InputKeyT, typename InputValueT = void,
+          typename InputKeyContextT = DefaultKeyContext>
+class ViewImpl {
+ protected:
+  using KeyT = InputKeyT;
+  using ValueT = InputValueT;
+  using KeyContextT = InputKeyContextT;
+  using EntryT = StorageEntry<KeyT, ValueT>;
+
+  friend class BaseImpl<KeyT, ValueT, KeyContextT>;
+
+  // Make more-`const` types friends to enable conversions that add `const`.
+  friend class ViewImpl<const KeyT, ValueT, KeyContextT>;
+  friend class ViewImpl<KeyT, const ValueT, KeyContextT>;
+  friend class ViewImpl<const KeyT, const ValueT, KeyContextT>;
+
+  ViewImpl() = default;
+
+  // Support adding `const` to either key or value type of some other view.
+  template <typename OtherKeyT, typename OtherValueT>
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  ViewImpl(ViewImpl<OtherKeyT, OtherValueT, KeyContextT> other_view)
+    requires(std::same_as<KeyT, OtherKeyT> ||
+             std::same_as<KeyT, const OtherKeyT>) &&
+                (std::same_as<ValueT, OtherValueT> ||
+                 std::same_as<ValueT, const OtherValueT>)
+      : alloc_size_(other_view.alloc_size_), storage_(other_view.storage_) {}
+
+  // Looks up an entry in the hashtable and returns its address or null if not
+  // present.
+  template <typename LookupKeyT>
+  auto LookupEntry(LookupKeyT lookup_key, KeyContextT key_context) const
+      -> EntryT*;
+
+  // Calls `entry_callback` for each entry in the hashtable. All the entries
+  // within a specific group are visited first, and then `group_callback` is
+  // called on the group itself. The `group_callback` is typically only used by
+  // the internals of the hashtable.
+  template <typename EntryCallbackT, typename GroupCallbackT>
+  auto ForEachEntry(EntryCallbackT entry_callback,
+                    GroupCallbackT group_callback) const -> void;
+
+  // Counts the number of keys in the hashtable that required probing beyond the
+  // initial group.
+  auto CountProbedKeys(KeyContextT key_context) const -> ssize_t;
+
+ private:
+  ViewImpl(ssize_t alloc_size, Storage* storage)
+      : alloc_size_(alloc_size), storage_(storage) {}
+
+  // Computes the offset from the metadata array to the entries array for a
+  // given size. This is trivial, but we use this routine to enforce invariants
+  // on the sizes.
+  static constexpr auto EntriesOffset(ssize_t alloc_size) -> ssize_t {
+    CARBON_DCHECK(llvm::isPowerOf2_64(alloc_size))
+        << "Size must be a power of two for a hashed buffer!";
+    // The size is always a power of two, and we prevent any too-small sizes,
+    // so the size being a power of two provides the needed alignment. As a
+    // result, the offset is exactly the size. We validate this here to catch
+    // alignment bugs early.
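+    // For example, with `alloc_size == 64` the metadata bytes occupy byte
+    // offsets [0, 64) and the entries array begins at byte offset 64.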
+    CARBON_DCHECK(static_cast<uint64_t>(alloc_size) ==
+                  llvm::alignTo<alignof(EntryT)>(alloc_size));
+    return alloc_size;
+  }
+
+  auto metadata() const -> uint8_t* {
+    return reinterpret_cast<uint8_t*>(storage_);
+  }
+  auto entries() const -> EntryT* {
+    return reinterpret_cast<EntryT*>(reinterpret_cast<std::byte*>(storage_) +
+                                     EntriesOffset(alloc_size_));
+  }
+
+  ssize_t alloc_size_;
+  Storage* storage_;
+};
+
+// Implementation helper for defining a read-write base type for a hashtable
+// that type-erases any SSO buffer.
+//
+// A specific user-facing hashtable base type should derive using *`protected`*
+// inheritance from this type, and forward the implementation of its interface
+// to functions in this type.
+//
+// Other than the use of `protected` inheritance, the patterns for this type,
+// and how to build user-facing hashtable base types from it, mirror those of
+// `ViewImpl`. See its documentation for more details.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+class BaseImpl {
+ protected:
+  using KeyT = InputKeyT;
+  using ValueT = InputValueT;
+  using KeyContextT = InputKeyContextT;
+  using ViewImplT = ViewImpl<KeyT, ValueT, KeyContextT>;
+  using EntryT = typename ViewImplT::EntryT;
+
+  BaseImpl(int small_alloc_size, Storage* small_storage)
+      : small_alloc_size_(small_alloc_size) {
+    CARBON_CHECK(small_alloc_size >= 0);
+    Construct(small_storage);
+  }
+  // Only used for copying and moving, and leaves storage uninitialized.
+  BaseImpl(ssize_t alloc_size, int growth_budget, int small_alloc_size)
+      : view_impl_(alloc_size, nullptr),
+        growth_budget_(growth_budget),
+        small_alloc_size_(small_alloc_size) {}
+  ~BaseImpl();
+
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator ViewImplT() const { return view_impl(); }
+
+  auto view_impl() const -> ViewImplT { return view_impl_; }
+
+  // Looks up the provided key in the hashtable. If found, returns a pointer to
+  // that entry and `false`.
+  //
+  // If not found, will locate an empty entry for inserting into, set the
+  // metadata for that entry, and return a pointer to the entry and `true`. When
+  // necessary, this will grow the hashtable to cause there to be sufficient
+  // empty entries.
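+  //
+  // A rough sketch of how a derived map type is expected to use this
+  // (mirroring the pattern in `map.h`; the value construction shown is
+  // hypothetical):
+  //
+  //   auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  //   if (inserted) {
+  //     new (&entry->key_storage) KeyT(lookup_key);
+  //     new (&entry->value_storage) ValueT(/*...*/);
+  //   }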
+  template <typename LookupKeyT>
+  auto InsertImpl(LookupKeyT lookup_key, KeyContextT key_context)
+      -> std::pair<EntryT*, bool>;
+
+  // Looks up the entry in the hashtable, and if found destroys the entry and
+  // returns `true`. If not found, returns `false`.
+  //
+  // Does not release any memory, just leaves a tombstone behind so this entry
+  // cannot be found and the slot can in theory be re-used.
+  template <typename LookupKeyT>
+  auto EraseImpl(LookupKeyT lookup_key, KeyContextT key_context) -> bool;
+
+  // Erases all entries in the hashtable but leaves the allocated storage.
+  auto ClearImpl() -> void;
+
+ private:
+  template <typename InputBaseT, ssize_t SmallSize>
+  friend class TableImpl;
+
+  static constexpr ssize_t Alignment = std::max<ssize_t>(
+      {alignof(MetadataGroup), alignof(StorageEntry<KeyT, ValueT>)});
+
+  // Implementation of inline small storage for the provided key type, value
+  // type, and small size. Specialized for a zero small size to be an empty
+  // struct.
+  template <ssize_t SmallSize>
+  struct SmallStorage : Storage {
+    alignas(Alignment) uint8_t metadata[SmallSize];
+    mutable StorageEntry<KeyT, ValueT> entries[SmallSize];
+  };
+  // Specialized storage with no inline buffer to avoid any extra alignment.
+  template <>
+  struct SmallStorage<0> {};
+
+  static constexpr auto AllocByteSize(ssize_t alloc_size) -> ssize_t {
+    return ViewImplT::EntriesOffset(alloc_size) + sizeof(EntryT) * alloc_size;
+  }
+  static auto Allocate(ssize_t alloc_size) -> Storage*;
+  static auto Deallocate(Storage* storage, ssize_t alloc_size) -> void;
+
+  auto growth_budget() const -> ssize_t { return growth_budget_; }
+  auto alloc_size() const -> ssize_t { return view_impl_.alloc_size_; }
+  auto alloc_size() -> ssize_t& { return view_impl_.alloc_size_; }
+  auto storage() const -> Storage* { return view_impl_.storage_; }
+  auto storage() -> Storage*& { return view_impl_.storage_; }
+  auto metadata() const -> uint8_t* { return view_impl_.metadata(); }
+  auto entries() const -> EntryT* { return view_impl_.entries(); }
+  auto small_alloc_size() const -> ssize_t {
+    return static_cast<unsigned>(small_alloc_size_);
+  }
+  auto is_small() const -> bool { return alloc_size() <= small_alloc_size(); }
+
+  auto Construct(Storage* small_storage) -> void;
+  auto Destroy() -> void;
+
+  template <typename LookupKeyT>
+  auto InsertIntoEmpty(LookupKeyT lookup_key, KeyContextT key_context)
+      -> EntryT*;
+
+  static auto ComputeNextAllocSize(ssize_t old_alloc_size) -> ssize_t;
+  static auto GrowthThresholdForAllocSize(ssize_t alloc_size) -> ssize_t;
+
+  template <typename LookupKeyT>
+  auto GrowAndInsert(LookupKeyT lookup_key, KeyContextT key_context) -> EntryT*;
+
+  ViewImplT view_impl_;
+  int growth_budget_;
+  int small_alloc_size_;
+};
+
+// Implementation helper for defining a hashtable type with an SSO buffer.
+//
+// A specific user-facing hashtable should derive privately from this
+// type, and forward the implementation of its interface to functions in this
+// type. It should provide the corresponding user-facing hashtable base type as
+// the `InputBaseT` type parameter (rather than a key/value pair), and this type
+// will in turn derive from that provided base type. This allows derived-to-base
+// conversion from the user-facing hashtable type to the user-facing hashtable
+// base type. And it does so keeping the inheritance linear. The resulting
+// linear inheritance hierarchy for a `Map<K, T>` type will look like:
+//
+//   Map<K, T>
+//    ↓
+//   TableImpl<MapBase<K, T>>
+//    ↓
+//   MapBase<K, T>
+//    ↓
+//   BaseImpl<K, T>
+//
+// Other than this inheritance technique, the patterns for this type, and how to
+// build user-facing hashtable types from it, mirror those of `ViewImpl`. See
+// its documentation for more details.
+template <typename InputBaseT, ssize_t SmallSize>
+class TableImpl : public InputBaseT {
+ protected:
+  using BaseT = InputBaseT;
+
+  TableImpl() : BaseT(SmallSize, small_storage()) {}
+  TableImpl(const TableImpl& arg);
+  TableImpl(TableImpl&& arg) noexcept;
+
+  // Resets the hashtable to its initial state, clearing all entries and
+  // releasing all memory. If the hashtable had an SSO buffer, that is restored
+  // as the storage. Otherwise, a minimum sized table storage is allocated.
+  auto ResetImpl() -> void;
+
+ private:
+  using KeyT = BaseT::KeyT;
+  using ValueT = BaseT::ValueT;
+  using EntryT = BaseT::EntryT;
+  using SmallStorage = BaseT::template SmallStorage<SmallSize>;
+
+  auto small_storage() const -> Storage*;
+
+  [[no_unique_address]] mutable SmallStorage small_storage_;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Only implementation details below this point.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Computes a seed that provides a small amount of entropy from ASLR where
+// available with minimal cost. The priority is speed, and this computes the
+// entropy in a way that doesn't require loading from memory, merely accessing
+// entropy already available without accessing memory.
+inline auto ComputeSeed() -> uint64_t {
+  // A global variable whose address is used as a seed. This allows ASLR to
+  // introduce some variation in hashtable ordering when enabled via the code
+  // model for globals.
+  extern volatile std::byte global_addr_seed;
+
+  return reinterpret_cast<uint64_t>(&global_addr_seed);
+}
+
+inline auto ComputeProbeMaskFromSize(ssize_t size) -> size_t {
+  CARBON_DCHECK(llvm::isPowerOf2_64(size))
+      << "Size must be a power of two for a hashed buffer!";
+  // Since `size` is a power of two, we can make sure the probes are less
+  // than `size` by making the mask `size - 1`. We also mask off the low
+  // bits so the probes are a multiple of the size of the groups of entries.
+  return (size - 1) & ~GroupMask;
+}
+
+// This class handles building a sequence of probe indices from a given
+// starting point, including both the quadratic growth and masking the index
+// to stay within the bucket array size. The starting point doesn't need to be
+// clamped to the size ahead of time (or even be positive); we do that
+// internally.
+//
+// For reference on quadratic probing:
+// https://en.wikipedia.org/wiki/Quadratic_probing
+//
+// We compute the quadratic probe index incrementally, but we can also compute
+// it mathematically and will check that the incremental result matches our
+// mathematical expectation. We use the quadratic probing formula of:
+//
+//   p(start, step) = (start + (step + step^2) / 2) (mod size / GroupSize)
+//
+// However, we compute it incrementally and scale all the variables by the group
+// size so it can be used as an index without an additional multiplication.
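+//
+// As a concrete (illustrative) example, with `GroupSize == 16` and a table of
+// `size == 64`, the mask is 48 and a probe starting at group index 32 visits:
+//
+//   step 0: 32
+//   step 1: (32 + 16) & 48 == 48
+//   step 2: (48 + 32) & 48 == 16
+//   step 3: (16 + 48) & 48 == 0
+//
+// touching each of the four groups exactly once before wrapping.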
+class ProbeSequence {
+ public:
+  ProbeSequence(ssize_t start, ssize_t size) {
+    mask_ = ComputeProbeMaskFromSize(size);
+    p_ = start & mask_;
+#ifndef NDEBUG
+    start_ = start & mask_;
+    size_ = size;
+#endif
+  }
+
+  void Next() {
+    step_ += GroupSize;
+    p_ = (p_ + step_) & mask_;
+#ifndef NDEBUG
+    // Verify against the quadratic formula we expect to be following by scaling
+    // everything down by `GroupSize`.
+    CARBON_DCHECK(
+        (p_ / GroupSize) ==
+        ((start_ / GroupSize +
+          (step_ / GroupSize + (step_ / GroupSize) * (step_ / GroupSize)) / 2) %
+         (size_ / GroupSize)))
+        << "Index in probe sequence does not match the expected formula.";
+    CARBON_DCHECK(step_ < size_) << "We necessarily visit all groups, so we "
+                                    "can't have more probe steps than groups.";
+#endif
+  }
+
+  auto index() const -> ssize_t { return p_; }
+
+ private:
+  ssize_t step_ = 0;
+  size_t mask_;
+  ssize_t p_;
+#ifndef NDEBUG
+  ssize_t start_;
+  ssize_t size_;
+#endif
+};
+
+// TODO: Evaluate keeping this outlined to see if macro benchmarks observe the
+// same perf hit as micro benchmarks.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::LookupEntry(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> EntryT* {
+  // Prefetch with a "low" temporal locality as we're primarily expecting a
+  // brief use of the storage and then to return to application code.
+  __builtin_prefetch(storage_, /*read*/ 0, /*low-locality*/ 1);
+
+  ssize_t local_size = alloc_size_;
+  CARBON_DCHECK(local_size > 0);
+
+  uint8_t* local_metadata = metadata();
+  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
+  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+
+  EntryT* local_entries = entries();
+
+  // Walk through groups of entries using a quadratic probe starting from
+  // `hash_index`.
+  ProbeSequence s(hash_index, local_size);
+  do {
+    ssize_t group_index = s.index();
+
+    // For each group, match the tag against the metadata to extract the
+    // potentially matching entries within the group.
+    MetadataGroup g = MetadataGroup::Load(local_metadata, group_index);
+    auto metadata_matched_range = g.Match(tag);
+    if (LLVM_LIKELY(metadata_matched_range)) {
+      // If any entries in this group potentially match based on their metadata,
+      // walk each candidate and compare its key to see if we have definitively
+      // found a match.
+      EntryT* group_entries = &local_entries[group_index];
+      auto byte_it = metadata_matched_range.begin();
+      auto byte_end = metadata_matched_range.end();
+      do {
+        EntryT* entry = byte_it.index_ptr(group_entries);
+        if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) {
+          __builtin_assume(entry != nullptr);
+          return entry;
+        }
+        ++byte_it;
+      } while (LLVM_UNLIKELY(byte_it != byte_end));
+    }
+
+    // We failed to find a matching entry in this bucket, so check if there are
+    // empty slots as that indicates we're done probing -- no later probed index
+    // could have a match.
+    auto empty_byte_matched_range = g.MatchEmpty();
+    if (LLVM_LIKELY(empty_byte_matched_range)) {
+      return nullptr;
+    }
+
+    s.Next();
+
+    // We use a weird construct of an "unlikely" condition of `true`. The goal
+    // is to get the compiler to not prioritize the back edge of the loop for
+    // code layout, and in at least some tests this seems to be an effective
+    // construct for achieving this.
+  } while (LLVM_UNLIKELY(true));
+}
+
+// Note that we force inlining here because we expect to be called with lambdas
+// that will in turn be inlined to form the loop body. We don't want function
+// boundaries within the loop for performance, and the degree of simplification
+// from inlining these callbacks may be difficult for the compiler to recognize
+// automatically.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename EntryCallbackT, typename GroupCallbackT>
+[[clang::always_inline]] auto
+ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::ForEachEntry(
+    EntryCallbackT entry_callback, GroupCallbackT group_callback) const
+    -> void {
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+
+  ssize_t local_size = alloc_size_;
+  for (ssize_t group_index = 0; group_index < local_size;
+       group_index += GroupSize) {
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+    auto present_matched_range = g.MatchPresent();
+    if (!present_matched_range) {
+      continue;
+    }
+    for (ssize_t byte_index : present_matched_range) {
+      entry_callback(local_entries[group_index + byte_index]);
+    }
+
+    group_callback(&local_metadata[group_index]);
+  }
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::CountProbedKeys(
+    KeyContextT key_context) const -> ssize_t {
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+  ssize_t local_size = alloc_size_;
+  ssize_t count = 0;
+  for (ssize_t group_index = 0; group_index < local_size;
+       group_index += GroupSize) {
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+    auto present_matched_range = g.MatchPresent();
+    for (ssize_t byte_index : present_matched_range) {
+      ssize_t index = group_index + byte_index;
+      HashCode hash =
+          key_context.HashKey(local_entries[index].key(), ComputeSeed());
+      ssize_t hash_index = hash.ExtractIndexAndTag<7>().first &
+                           ComputeProbeMaskFromSize(local_size);
+      count += static_cast<ssize_t>(hash_index != group_index);
+    }
+  }
+  return count;
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::~BaseImpl() {
+  Destroy();
+}
+
+// TODO: Evaluate whether it is worth forcing this out-of-line given the
+// reasonable ABI boundary it forms and large volume of code necessary to
+// implement it.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::InsertImpl(
+    LookupKeyT lookup_key, KeyContextT key_context)
+    -> std::pair<EntryT*, bool> {
+  CARBON_DCHECK(alloc_size() > 0);
+
+  uint8_t* local_metadata = metadata();
+
+  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
+  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+
+  // We re-purpose the empty control byte to signal no insert is needed to the
+  // caller. This is guaranteed to not be a control byte we're inserting.
+  // constexpr uint8_t NoInsertNeeded = Group::Empty;
+
+  ssize_t group_with_deleted_index;
+  MetadataGroup::MatchIndex deleted_match = {};
+
+  EntryT* local_entries = entries();
+
+  auto return_insert_at_index = [&](ssize_t index) -> std::pair<EntryT*, bool> {
+    // We'll need to insert at this index so set the control group byte to the
+    // proper value.
+    local_metadata[index] = tag | MetadataGroup::PresentMask;
+    return {&local_entries[index], true};
+  };
+
+  for (ProbeSequence s(hash_index, alloc_size());; s.Next()) {
+    ssize_t group_index = s.index();
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+
+    auto control_byte_matched_range = g.Match(tag);
+    if (control_byte_matched_range) {
+      EntryT* group_entries = &local_entries[group_index];
+      auto byte_it = control_byte_matched_range.begin();
+      auto byte_end = control_byte_matched_range.end();
+      do {
+        EntryT* entry = byte_it.index_ptr(group_entries);
+        if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) {
+          return {entry, false};
+        }
+        ++byte_it;
+      } while (LLVM_UNLIKELY(byte_it != byte_end));
+    }
+
+    // Track the first group with a deleted entry that we could insert over.
+    if (!deleted_match) {
+      deleted_match = g.MatchDeleted();
+      group_with_deleted_index = group_index;
+    }
+
+    // We failed to find a matching entry in this bucket, so check if there are
+    // no empty slots. In that case, we'll continue probing.
+    auto empty_match = g.MatchEmpty();
+    if (!empty_match) {
+      continue;
+    }
+    // Ok, we've finished probing without finding anything and need to insert
+    // instead.
+
+    // If we found a deleted slot, we don't need to continue the probe sequence
+    // in order to insert, so just bail. We want to ensure building up a table
+    // is fast, so we de-prioritize this case a bit. In practice this doesn't
+    // have too much of an effect.
+    if (LLVM_UNLIKELY(deleted_match)) {
+      return return_insert_at_index(group_with_deleted_index +
+                                    deleted_match.index());
+    }
+
+    // Otherwise, we're going to insert into an empty slot, which consumes
+    // growth budget. Check that we have the budget for that before we compute
+    // the exact index of the empty slot. Without growth budget we'll have to
+    // completely rehash anyway, so we can just bail here.
+    if (LLVM_UNLIKELY(growth_budget_ == 0)) {
+      return {GrowAndInsert(lookup_key, key_context), true};
+    }
+
+    --growth_budget_;
+    CARBON_DCHECK(growth_budget() >= 0)
+        << "Growth budget shouldn't have gone negative!";
+    return return_insert_at_index(group_index + empty_match.index());
+  }
+
+  CARBON_FATAL() << "We should never finish probing without finding the entry "
+                    "or an empty slot.";
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::EraseImpl(
+    LookupKeyT lookup_key, KeyContextT key_context) -> bool {
+  EntryT* entry = view_impl_.LookupEntry(lookup_key, key_context);
+  if (!entry) {
+    return false;
+  }
+
+  // If there are empty slots in this group then nothing will probe past this
+  // group looking for an entry so we can simply set this slot to empty as
+  // well. However, if every slot in this group is full, it might be part of
+  // a long probe chain that we can't disrupt. In that case we mark the slot's
+  // metadata as deleted to keep probes continuing past it.
+  //
+  // If we mark the slot as empty, we'll also need to increase the growth
+  // budget.
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+  ssize_t index = entry - local_entries;
+  ssize_t group_index = index & ~GroupMask;
+  auto g = MetadataGroup::Load(local_metadata, group_index);
+  auto empty_matched_range = g.MatchEmpty();
+  if (empty_matched_range) {
+    local_metadata[index] = MetadataGroup::Empty;
+    ++growth_budget_;
+  } else {
+    local_metadata[index] = MetadataGroup::Deleted;
+  }
+
+  if constexpr (!EntryT::IsTriviallyDestructible) {
+    entry->Destroy();
+  }
+
+  return true;
+}
+
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::ClearImpl() -> void {
+  view_impl_.ForEachEntry(
+      [](EntryT& entry) {
+        if constexpr (!EntryT::IsTriviallyDestructible) {
+          entry.Destroy();
+        }
+      },
+      [](uint8_t* metadata_group) {
+        // Clear the group.
+        std::memset(metadata_group, 0, GroupSize);
+      });
+  growth_budget_ = GrowthThresholdForAllocSize(alloc_size());
+}
+
+// Allocates the appropriate memory layout for a table of the given
+// `alloc_size`, with space both for the metadata array and entries.
+//
+// The returned pointer *must* be deallocated by calling the below `Deallocate`
+// function with the same `alloc_size` as used here.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Allocate(
+    ssize_t alloc_size) -> Storage* {
+  return reinterpret_cast<Storage*>(__builtin_operator_new(
+      AllocByteSize(alloc_size), static_cast<std::align_val_t>(Alignment),
+      std::nothrow_t()));
+}
+
+// Deallocates a table's storage that was allocated with the `Allocate`
+// function.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Deallocate(
+    Storage* storage, ssize_t alloc_size) -> void {
+  ssize_t allocated_size = AllocByteSize(alloc_size);
+  // We don't need the size, but make sure it always compiles.
+  static_cast<void>(allocated_size);
+  __builtin_operator_delete(storage,
+#if __cpp_sized_deallocation
+                            allocated_size,
+#endif
+                            static_cast<std::align_val_t>(Alignment));
+}
+
+// Construct a table using the provided small storage if `small_alloc_size_` is
+// non-zero. If `small_alloc_size_` is zero, then `small_storage` won't be used
+// and can be null. Regardless, after this the storage pointer is non-null and
+// the size is non-zero so that we can directly begin inserting or querying the
+// table.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Construct(
+    Storage* small_storage) -> void {
+  if (small_alloc_size_ > 0) {
+    alloc_size() = small_alloc_size_;
+    storage() = small_storage;
+  } else {
+    // Directly allocate the initial buffer so that the hashtable is never in
+    // an empty state.
+    alloc_size() = MinAllocatedSize;
+    storage() = Allocate(MinAllocatedSize);
+  }
+  std::memset(metadata(), 0, alloc_size());
+  growth_budget_ = GrowthThresholdForAllocSize(alloc_size());
+}
+
+// Destroy the current table, releasing any memory used.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Destroy() -> void {
+  // Check for a moved-from state and don't do anything. Only a moved-from table
+  // has a zero size.
+  if (alloc_size() == 0) {
+    return;
+  }
+
+  // Destroy all the entries.
+  if constexpr (!EntryT::IsTriviallyDestructible) {
+    view_impl_.ForEachEntry([](EntryT& entry) { entry.Destroy(); },
+                            [](auto...) {});
+  }
+
+  // If small, nothing to deallocate.
+  if (is_small()) {
+    return;
+  }
+
+  // Just deallocate the storage without updating anything when destroying the
+  // object.
+  Deallocate(storage(), alloc_size());
+}
+
+// Optimized routine to insert a key into a table when that key *definitely*
+// isn't present in the table and the table *definitely* has a viable empty slot
+// (and growth space) to insert into before any deleted slots. When both of
+// these are true, typically just after growth, we can dramatically simplify the
+// insert position search.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::noinline]] auto
+BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::InsertIntoEmpty(
+    LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* {
+  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
+  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+  uint8_t* local_metadata = metadata();
+  EntryT* local_entries = entries();
+
+  for (ProbeSequence s(hash_index, alloc_size());; s.Next()) {
+    ssize_t group_index = s.index();
+    auto g = MetadataGroup::Load(local_metadata, group_index);
+
+    if (auto empty_match = g.MatchEmpty()) {
+      ssize_t index = group_index + empty_match.index();
+      local_metadata[index] = tag | MetadataGroup::PresentMask;
+      return &local_entries[index];
+    }
+
+    // Otherwise we continue probing.
+  }
+}
+
+// Apply our doubling growth strategy and (re-)check invariants around table
+// size.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::ComputeNextAllocSize(
+    ssize_t old_alloc_size) -> ssize_t {
+  CARBON_DCHECK(llvm::isPowerOf2_64(old_alloc_size))
+      << "Expected a power of two!";
+  ssize_t new_alloc_size;
+  bool overflow = __builtin_mul_overflow(old_alloc_size, 2, &new_alloc_size);
+  CARBON_CHECK(!overflow) << "Computing the new size overflowed `ssize_t`!";
+  return new_alloc_size;
+}
+
+// Compute the growth threshold for a given size.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+auto BaseImpl<InputKeyT, InputValueT,
+              InputKeyContextT>::GrowthThresholdForAllocSize(ssize_t alloc_size)
+    -> ssize_t {
+  // We use a 7/8ths load factor to trigger growth.
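+  // As a sanity-check example, an `alloc_size` of 64 yields a growth threshold
+  // of 64 - 64 / 8 = 56 entries.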
+  return alloc_size - alloc_size / 8;
+}
+
+// Grow the hashtable to create space and then insert into it. Returns the
+// selected insertion entry. Never returns null. In addition to growing and
+// selecting the insertion entry, this routine updates the metadata array so
+// that this function can be directly called and the result returned from
+// `InsertImpl`.
+template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
+template <typename LookupKeyT>
+[[clang::noinline]] auto
+BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::GrowAndInsert(
+    LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* {
+  // We collect the probed elements in a small vector for re-insertion. It is
+  // tempting to reuse the already allocated storage, but doing so appears to
+  // be a (very slight) performance regression. These are relatively rare and
+  // storing them into the existing storage creates stores to the same regions
+  // of memory we're reading. Moreover, it requires moving both the key and the
+  // value twice, and doing the `memcpy` widening for relocatable types before
+  // the group walk rather than after it. In practice, between the statistical
+  // rarity of probed entries and the large inline buffer used here on the
+  // stack, temporary additional storage handles this most efficiently.
+  llvm::SmallVector<ssize_t, 128> probed_indices;
+
+  // We grow into freshly allocated storage so that both the new and old tables
+  // remain fully usable until all the entries are moved over. However, we
+  // directly manipulate the internals to short-circuit many aspects of the
+  // growth.
+  ssize_t old_size = alloc_size();
+  CARBON_DCHECK(old_size > 0);
+  CARBON_DCHECK(growth_budget_ == 0);
+
+  bool old_small = is_small();
+  Storage* old_storage = storage();
+  uint8_t* old_metadata = metadata();
+  EntryT* old_entries = entries();
+
+#ifndef NDEBUG
+  // Count how many of the old table slots will end up being empty after we
+  // grow the table. This includes both the currently empty slots and the
+  // deleted slots, because we clear deleted slots to empty and re-insert
+  // everything that required probing.
+  ssize_t debug_empty_count =
+      llvm::count(llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Empty);
+  ssize_t debug_deleted_count = llvm::count(
+      llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Deleted);
+  CARBON_DCHECK(debug_empty_count >=
+                (old_size - GrowthThresholdForAllocSize(old_size)))
+      << "debug_empty_count: " << debug_empty_count
+      << ", debug_deleted_count: " << debug_deleted_count
+      << ", size: " << old_size;
+#endif
+
+  // Compute the new size and allocate fresh storage for it.
+  ssize_t new_size = ComputeNextAllocSize(old_size);
+  alloc_size() = new_size;
+  storage() = Allocate(new_size);
+  growth_budget_ = GrowthThresholdForAllocSize(new_size);
+
+  // Now extract the new components of the table.
+  uint8_t* new_metadata = metadata();
+  EntryT* new_entries = entries();
+
+  // We always double the size when we grow. This allows an important
+  // optimization -- we're adding exactly one more high bit to the hash-computed
+  // index for each entry. This in turn means we can classify every entry in the
+  // table into three cases:
+  //
+  // 1) The new high bit is zero, the entry is at the same index in the new
+  //    table as the old.
+  //
+  // 2) The new high bit is one, the entry is at the old index plus the old
+  //    size.
+  //
+  // 3) The entry's current index doesn't match the initial hash index because
+  //    it required some amount of probing to find an empty slot.
+  //
+  // The design of the hash table tries to minimize how many entries fall into
+  // case (3), so we expect the vast majority of entries to be in (1) or (2).
+  // This lets us model growth notionally as duplicating the hash table,
+  // clearing out the empty slots, and inserting any probed elements.
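+  //
+  // As a concrete (illustrative) example with an old size of 64 growing to
+  // 128: an entry hashing to the group at index 16 lands either at group
+  // index 16 (case 1) or group index 80 (case 2) in the new table, depending
+  // on the newly used hash bit, while an entry that had to probe past its
+  // hash group into a later group falls into case (3) and is re-inserted
+  // below.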
+
+  ssize_t count = 0;
+  for (ssize_t group_index = 0; group_index < old_size;
+       group_index += GroupSize) {
+    auto low_g = MetadataGroup::Load(old_metadata, group_index);
+    // Make sure to match present elements first to enable pipelining with
+    // clearing.
+    auto present_matched_range = low_g.MatchPresent();
+    low_g.ClearDeleted();
+    MetadataGroup high_g;
+    if constexpr (MetadataGroup::FastByteClear) {
+      // When we have a fast byte clear, we can update the metadata for the
+      // growth in-register and store at the end.
+      high_g = low_g;
+    } else {
+      // If we don't have a fast byte clear, we can store the metadata group
+      // eagerly here and overwrite bytes with a byte store below instead of
+      // clearing the byte in-register.
+      low_g.Store(new_metadata, group_index);
+      low_g.Store(new_metadata, group_index | old_size);
+    }
+    for (ssize_t byte_index : present_matched_range) {
+      ++count;
+      ssize_t old_index = group_index + byte_index;
+      if constexpr (!MetadataGroup::FastByteClear) {
+        CARBON_DCHECK(new_metadata[old_index] == old_metadata[old_index]);
+        CARBON_DCHECK(new_metadata[old_index | old_size] ==
+                      old_metadata[old_index]);
+      }
+      HashCode hash =
+          key_context.HashKey(old_entries[old_index].key(), ComputeSeed());
+      ssize_t old_hash_index = hash.ExtractIndexAndTag<7>().first &
+                               ComputeProbeMaskFromSize(old_size);
+      if (LLVM_UNLIKELY(old_hash_index != group_index)) {
+        probed_indices.push_back(old_index);
+        if constexpr (MetadataGroup::FastByteClear) {
+          low_g.ClearByte(byte_index);
+          high_g.ClearByte(byte_index);
+        } else {
+          new_metadata[old_index] = MetadataGroup::Empty;
+          new_metadata[old_index | old_size] = MetadataGroup::Empty;
+        }
+        continue;
+      }
+      ssize_t new_index = hash.ExtractIndexAndTag<7>().first &
+                          ComputeProbeMaskFromSize(new_size);
+      CARBON_DCHECK(new_index == old_hash_index ||
+                    new_index == (old_hash_index | old_size));
+      // Toggle the newly added bit of the index to get to the other possible
+      // target index.
+      if constexpr (MetadataGroup::FastByteClear) {
+        (new_index == old_hash_index ? high_g : low_g).ClearByte(byte_index);
+        new_index += byte_index;
+      } else {
+        new_index += byte_index;
+        new_metadata[new_index ^ old_size] = MetadataGroup::Empty;
+      }
+
+      // If we need to explicitly move (and destroy) the key or value, do so
+      // here where we already know its target.
+      if constexpr (!EntryT::IsTriviallyRelocatable) {
+        new_entries[new_index].MoveFrom(std::move(old_entries[old_index]));
+      }
+    }
+    if constexpr (MetadataGroup::FastByteClear) {
+      low_g.Store(new_metadata, group_index);
+      high_g.Store(new_metadata, (group_index | old_size));
+    }
+  }
+  CARBON_DCHECK((count - static_cast<ssize_t>(probed_indices.size())) ==
+                (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                                        MetadataGroup::Empty)));
+#ifndef NDEBUG
+  CARBON_DCHECK((debug_empty_count + debug_deleted_count) ==
+                (old_size - count));
+  CARBON_DCHECK(llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                            MetadataGroup::Empty) ==
+                debug_empty_count + debug_deleted_count +
+                    static_cast<ssize_t>(probed_indices.size()) + old_size);
+#endif
+
+  // If the keys or values are trivially relocatable, we do a bulk memcpy of
+  // them into place. This will copy them into both possible locations, which is
+  // fine. One will be empty and clobbered if reused or ignored. The other will
+  // be the one used. This might seem to require that making two copies of an
+  // entry is valid, but no such requirement exists: the result is exactly the
+  // same storage as if we had copied into the wrong location first and then
+  // again into the correct one. Only one copy is ever live and only one is
+  // destroyed.
+  if constexpr (EntryT::IsTriviallyRelocatable) {
+    memcpy(new_entries, old_entries, old_size * sizeof(EntryT));
+    memcpy(new_entries + old_size, old_entries, old_size * sizeof(EntryT));
+  }
+
+  // We then need to do a normal insertion for anything that was probed before
+  // growth, but we know we'll find an empty slot, so leverage that.
+  for (ssize_t old_index : probed_indices) {
+    EntryT* new_entry =
+        InsertIntoEmpty(old_entries[old_index].key(), key_context);
+    new_entry->MoveFrom(std::move(old_entries[old_index]));
+  }
+  CARBON_DCHECK(count ==
+                (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                                        MetadataGroup::Empty)));
+  growth_budget_ -= count;
+  CARBON_DCHECK(growth_budget_ ==
+                (GrowthThresholdForAllocSize(new_size) -
+                 (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
+                                         MetadataGroup::Empty))));
+  CARBON_DCHECK(growth_budget_ > 0)
+      << "Must still have a growth budget after rehash!";
+
+  if (!old_small) {
+    // Old isn't a small buffer, so we need to deallocate it.
+    Deallocate(old_storage, old_size);
+  }
+
+  // And lastly insert the lookup_key into the newly grown table and return the
+  // selected entry for the caller to fill in.
+  --growth_budget_;
+  return InsertIntoEmpty(lookup_key, key_context);
+}
+
+template <typename InputBaseT, ssize_t SmallSize>
+TableImpl<InputBaseT, SmallSize>::TableImpl(const TableImpl& arg)
+    : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) {
+  CARBON_DCHECK(arg.small_alloc_size_ == SmallSize);
+
+  ssize_t local_size = arg.alloc_size();
+
+  if (SmallSize > 0 && arg.is_small()) {
+    CARBON_DCHECK(local_size == SmallSize);
+    this->storage() = small_storage();
+  } else {
+    this->storage() = BaseT::Allocate(local_size);
+  }
+
+  // Preserve which slot every entry is in, including tombstones in the
+  // metadata, in order to copy into the new table's storage without rehashing
+  // all of the keys. This is especially important as we don't have an easy way
+  // to access the key context needed for rehashing here.
+  uint8_t* local_metadata = this->metadata();
+  EntryT* local_entries = this->entries();
+  const uint8_t* local_arg_metadata = arg.metadata();
+  const EntryT* local_arg_entries = arg.entries();
+  memcpy(local_metadata, local_arg_metadata, local_size);
+
+  for (ssize_t group_index = 0; group_index < local_size;
+       group_index += GroupSize) {
+    auto g = MetadataGroup::Load(local_arg_metadata, group_index);
+    for (ssize_t byte_index : g.MatchPresent()) {
+      local_entries[group_index + byte_index].CopyFrom(
+          local_arg_entries[group_index + byte_index]);
+    }
+  }
+}
+
+// Puts the incoming table into a moved-from state that can be destroyed or
+// re-initialized but must not be used otherwise.
+template <typename InputBaseT, ssize_t SmallSize>
+TableImpl<InputBaseT, SmallSize>::TableImpl(TableImpl&& arg) noexcept
+    : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) {
+  CARBON_DCHECK(arg.small_alloc_size_ == SmallSize);
+
+  ssize_t local_size = arg.alloc_size();
+
+  if (SmallSize > 0 && arg.is_small()) {
+    CARBON_DCHECK(local_size == SmallSize);
+    this->storage() = small_storage();
+
+    // For small tables, we have to move the entries individually, as we can't
+    // take ownership of the incoming table's inline buffer. We do this
+    // preserving their slots and even tombstones to avoid rehashing.
+    uint8_t* local_metadata = this->metadata();
+    EntryT* local_entries = this->entries();
+    uint8_t* local_arg_metadata = arg.metadata();
+    EntryT* local_arg_entries = arg.entries();
+    memcpy(local_metadata, local_arg_metadata, local_size);
+    if (EntryT::IsTriviallyRelocatable) {
+      memcpy(local_entries, local_arg_entries, SmallSize * sizeof(EntryT));
+    } else {
+      for (ssize_t group_index = 0; group_index < local_size;
+           group_index += GroupSize) {
+        auto g = MetadataGroup::Load(local_arg_metadata, group_index);
+        for (ssize_t byte_index : g.MatchPresent()) {
+          local_entries[group_index + byte_index].MoveFrom(
+              std::move(local_arg_entries[group_index + byte_index]));
+        }
+      }
+    }
+  } else {
+    // Just point to the allocated storage.
+    this->storage() = arg.storage();
+  }
+
+  // Finally, put the incoming table into a moved-from state.
+  arg.alloc_size() = 0;
+  // Replace the pointer with null to ease debugging.
+  arg.storage() = nullptr;
+}
+
+// Reset a table to its original state, including releasing any allocated
+// memory.
+template <typename InputBaseT, ssize_t SmallSize>
+auto TableImpl<InputBaseT, SmallSize>::ResetImpl() -> void {
+  this->Destroy();
+
+  // Re-initialize the whole thing.
+  CARBON_DCHECK(this->small_alloc_size() == SmallSize);
+  this->Construct(small_storage());
+}
+
+template <typename InputBaseT, ssize_t SmallSize>
+auto TableImpl<InputBaseT, SmallSize>::small_storage() const -> Storage* {
+  if constexpr (SmallSize > 0) {
+    // Do a bunch of validation of the small size to establish our invariants
+    // when we know we have a non-zero small size.
+    static_assert(llvm::isPowerOf2_64(SmallSize),
+                  "SmallSize must be a power of two for a hashed buffer!");
+    static_assert(
+        SmallSize >= MaxGroupSize,
+        "We require all small sizes to multiples of the largest group "
+        "size supported to ensure it can be used portably.  ");
+    static_assert(
+        (SmallSize % MaxGroupSize) == 0,
+        "Small size must be a multiple of the max group size supported "
+        "so that we can allocate a whole number of groups.");
+    // Implied by the asserts on the max group size above.
+    static_assert(SmallSize >= GroupSize);
+    static_assert((SmallSize % GroupSize) == 0);
+
+    static_assert(SmallSize >= alignof(StorageEntry<KeyT, ValueT>),
+                  "Requested a small size that would require padding between "
+                  "metadata bytes and correctly aligned key and value types. "
+                  "Either a larger small size or a zero small size and heap "
+                  "allocation are required for this key and value type.");
+
+    static_assert(offsetof(SmallStorage, entries) == SmallSize,
+                  "Offset to entries in small size storage doesn't match "
+                  "computed offset!");
+
+    return &small_storage_;
+  } else {
+    static_assert(
+        sizeof(TableImpl) == sizeof(BaseT),
+        "Empty small storage caused a size difference and wasted space!");
+
+    return nullptr;
+  }
+}
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_H_

+ 383 - 0
common/raw_hashtable_benchmark_helpers.cpp

@@ -0,0 +1,383 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/raw_hashtable_benchmark_helpers.h"
+
+#include <cstddef>
+#include <forward_list>
+
+namespace Carbon::RawHashtable {
+
+// A local shuffle implementation built on Abseil to improve performance in
+// debug builds.
+template <typename T>
+static auto Shuffle(llvm::MutableArrayRef<T> data, absl::BitGen& gen) {
+  for (ssize_t i : llvm::seq<ssize_t>(0, data.size() - 1)) {
+    ssize_t j = absl::Uniform<ssize_t>(gen, 0, data.size() - i);
+    if (j != 0) {
+      std::swap(data[i], data[i + j]);
+    }
+  }
+}
+
+constexpr ssize_t NumChars = 64;
+static_assert(llvm::isPowerOf2_64(NumChars));
+
+// For benchmarking, we use short strings in a fixed distribution with common
+// characters. Real-world strings aren't uniform across ASCII or Unicode, etc.
+// And for *micro*-benchmarking we want to focus on the map overhead with short,
+// fast keys.
+static auto MakeChars() -> llvm::OwningArrayRef<char> {
+  llvm::OwningArrayRef<char> characters(NumChars);
+
+  // Start with `-` and `_`, and then add `a` - `z`, `A` - `Z`, and `0` - `9`.
+  characters[0] = '-';
+  characters[1] = '_';
+  ssize_t i = 2;
+  for (auto range :
+       {llvm::seq_inclusive('a', 'z'), llvm::seq_inclusive('A', 'Z'),
+        llvm::seq_inclusive('0', '9')}) {
+    for (char c : range) {
+      characters[i] = c;
+      ++i;
+    }
+  }
+  CARBON_CHECK(i == NumChars) << "Expected exactly " << NumChars
+                              << " characters, got " << i << " instead!";
+  return characters;
+}
+
+constexpr ssize_t NumFourCharStrs = NumChars * NumChars * NumChars * NumChars;
+static_assert(llvm::isPowerOf2_64(NumFourCharStrs));
+
+// Compute every 4-character string in a shuffled array. This is a little
+// memory intensive -- 64 MiB -- but ends up being much cheaper by letting us
+// reliably select a unique 4-character sequence to avoid collisions.
+static auto MakeFourCharStrs(llvm::ArrayRef<char> characters, absl::BitGen& gen)
+    -> llvm::OwningArrayRef<std::array<char, 4>> {
+  constexpr ssize_t NumCharsMask = NumChars - 1;
+  constexpr ssize_t NumCharsShift = llvm::CTLog2<NumChars>();
+  llvm::OwningArrayRef<std::array<char, 4>> four_char_strs(NumFourCharStrs);
+  for (auto [i, str] : llvm::enumerate(four_char_strs)) {
+    str[0] = characters[i & NumCharsMask];
+    i >>= NumCharsShift;
+    str[1] = characters[i & NumCharsMask];
+    i >>= NumCharsShift;
+    str[2] = characters[i & NumCharsMask];
+    i >>= NumCharsShift;
+    CARBON_CHECK((i & ~NumCharsMask) == 0);
+    str[3] = characters[i];
+  }
+  Shuffle(four_char_strs, gen);
+  return four_char_strs;
+}
+
+constexpr ssize_t NumRandomChars = static_cast<ssize_t>(64) * 1024;
+
+// Create a pool of random characters to sample from rather than computing this
+// for every string, which is very slow in debug builds. We also pad this pool
+// with the max length so we can pull the full length from the end to simplify
+// the logic when wrapping around the pool.
+static auto MakeRandomChars(llvm::ArrayRef<char> characters, int max_length,
+                            absl::BitGen& gen) -> llvm::OwningArrayRef<char> {
+  llvm::OwningArrayRef<char> random_chars(NumRandomChars + max_length);
+  for (char& c : random_chars) {
+    c = characters[absl::Uniform<ssize_t>(gen, 0, NumChars)];
+  }
+  return random_chars;
+}
+
+// Make a small vector of pointers into a single allocation of raw strings. The
+// allocated memory is expected to leak and must be transitively referenced by a
+// global. Each string has size `length` (which must be >= 4), and there are
+// `key_count` keys in the result. Each key is filled from `random_chars` up to
+// its last 4 characters. The last four characters of each string will be
+// taken sequentially from `four_char_strs` from some random start position to
+// ensure no duplicate keys are produced.
+static auto MakeRawStrKeys(ssize_t length, ssize_t key_count,
+                           llvm::ArrayRef<std::array<char, 4>> four_char_strs,
+                           llvm::ArrayRef<char> random_chars, absl::BitGen& gen)
+    -> llvm::SmallVector<const char*> {
+  llvm::SmallVector<const char*> raw_keys;
+  CARBON_CHECK(length >= 4);
+  ssize_t prefix_length = length - 4;
+
+  // Select a random start for indexing our four character strings.
+  ssize_t four_char_index = absl::Uniform<ssize_t>(gen, 0, NumFourCharStrs);
+
+  // Select a random start for the prefix random characters.
+  ssize_t random_chars_index = absl::Uniform<ssize_t>(gen, 0, NumRandomChars);
+
+  // Do a single memory allocation for all the keys of this length to
+  // avoid an excessive number of small and fragmented allocations. This
+  // memory is intentionally leaked as the keys are global and will
+  // themselves point into it.
+  char* key_text = new char[key_count * length];
+
+  // Reserve all the key space since we know how many we'll need.
+  raw_keys.reserve(key_count);
+  for ([[gnu::unused]] ssize_t i : llvm::seq<ssize_t>(0, key_count)) {
+    memcpy(key_text, random_chars.data() + random_chars_index, prefix_length);
+    random_chars_index += prefix_length;
+    random_chars_index &= NumRandomChars - 1;
+    // Set the last four characters with this entry in the shuffled
+    // sequence.
+    memcpy(key_text + prefix_length, four_char_strs[four_char_index].data(), 4);
+    // Step through the shuffled sequence. We start at a random position,
+    // so we need to wrap around the end.
+    ++four_char_index;
+    four_char_index &= NumFourCharStrs - 1;
+
+    // And finally save the start pointer as one of our raw keys.
+    raw_keys.push_back(key_text);
+    key_text += length;
+  }
+  return raw_keys;
+}
+
+// Build up a large collection of random and unique string keys. This is
+// actually a relatively expensive operation due to needing to build all the
+// random string text. As a consequence, the initializer of this global is
+// somewhat performance tuned to ensure benchmarks don't take an excessive
+// amount of time to run or use an excessive amount of memory.
+static absl::NoDestructor<llvm::OwningArrayRef<llvm::StringRef>> raw_str_keys{
+    [] {
+      llvm::OwningArrayRef<llvm::StringRef> keys(MaxNumKeys);
+      absl::BitGen gen;
+
+      std::array length_buckets = {
+          4, 4, 4, 4, 5, 5, 5, 5, 7, 7, 10, 10, 15, 25, 40, 80,
+      };
+      static_assert((MaxNumKeys % length_buckets.size()) == 0);
+      CARBON_CHECK(llvm::is_sorted(length_buckets));
+
+      // For each distinct length bucket, we build a vector of raw keys.
+      std::forward_list<llvm::SmallVector<const char*>> raw_keys_storage;
+      // And a parallel array to the length buckets with the raw keys of that
+      // length.
+      std::array<llvm::SmallVector<const char*>*, length_buckets.size()>
+          raw_keys_buckets;
+
+      llvm::OwningArrayRef<char> characters = MakeChars();
+      llvm::OwningArrayRef<std::array<char, 4>> four_char_strs =
+          MakeFourCharStrs(characters, gen);
+      llvm::OwningArrayRef<char> random_chars = MakeRandomChars(
+          characters, /*max_length=*/length_buckets.back(), gen);
+
+      ssize_t prev_length = -1;
+      for (auto [length_index, length] : llvm::enumerate(length_buckets)) {
+        // We can detect repetitions in length as they are sorted.
+        if (length == prev_length) {
+          raw_keys_buckets[length_index] = raw_keys_buckets[length_index - 1];
+          continue;
+        }
+        prev_length = length;
+
+        // We want to compute all the keys of this length that we'll need.
+        ssize_t key_count = (MaxNumKeys / length_buckets.size()) *
+                            llvm::count(length_buckets, length);
+
+        raw_keys_buckets[length_index] =
+            &raw_keys_storage.emplace_front(MakeRawStrKeys(
+                length, key_count, four_char_strs, random_chars, gen));
+      }
+
+      // Now build the actual key array from our intermediate storage by
+      // round-robin extracting from the length buckets.
+      for (auto [index, key] : llvm::enumerate(keys)) {
+        ssize_t bucket = index % length_buckets.size();
+        ssize_t length = length_buckets[bucket];
+        // We pop a raw key from the list of them associated with this bucket.
+        const char* raw_key = raw_keys_buckets[bucket]->pop_back_val();
+        // And build our key from that.
+        key = llvm::StringRef(raw_key, length);
+      }
+      // Check that in fact we popped every raw key into our main keys.
+      for (const auto& raw_keys : raw_keys_storage) {
+        CARBON_CHECK(raw_keys.empty());
+      }
+      return keys;
+    }()};
+
+static absl::NoDestructor<llvm::OwningArrayRef<int*>> raw_ptr_keys{[] {
+  llvm::OwningArrayRef<int*> keys(MaxNumKeys);
+  for (auto [index, key] : llvm::enumerate(keys)) {
+    // We leak these pointers -- this is a static initializer executed once.
+    key = new int(static_cast<int>(index));
+  }
+  return keys;
+}()};
+
+static absl::NoDestructor<llvm::OwningArrayRef<int>> raw_int_keys{[] {
+  llvm::OwningArrayRef<int> keys(MaxNumKeys);
+  for (auto [index, key] : llvm::enumerate(keys)) {
+    key = index + 1;
+  }
+  return keys;
+}()};
+
+namespace {
+
+// Allow generically dispatching over the specific key types we build and
+// support.
+template <typename T>
+auto GetRawKeys() -> llvm::ArrayRef<T> {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return *raw_str_keys;
+  } else if constexpr (std::is_pointer_v<T>) {
+    return *raw_ptr_keys;
+  } else {
+    return *raw_int_keys;
+  }
+}
+
+template <typename T>
+static absl::NoDestructor<
+    std::map<std::pair<ssize_t, ssize_t>, llvm::OwningArrayRef<T>>>
+    lookup_keys_storage;
+
+// Given a particular table keys size and lookup keys size, provide an array ref
+// to a shuffled set of lookup keys.
+//
+// Because different table sizes pull from different sub-ranges of our raw keys,
+// we need to compute a distinct set of random keys in the table to use for
+// lookups depending on the table size. And we also want to have an even
+// distribution of key *sizes* throughout the lookup keys, and so we can't
+// compute a single lookup keys array of the maximum size. Instead we need to
+// compute a distinct special set of lookup keys for each pair of table and
+// lookup size, and then shuffle that specific set into a random sequence that
+// is returned. This function memoizes this sequence for each pair of sizes.
+template <typename T>
+auto GetShuffledLookupKeys(ssize_t table_keys_size, ssize_t lookup_keys_size)
+    -> llvm::ArrayRef<T> {
+  // The raw keys aren't shuffled and round-robin through the sizes. We want to
+  // keep the total size of lookup keys used exactly the same across runs. So
+  // for a given size we always take the leading sequence from the raw keys for
+  // that size, duplicating as needed to get the desired lookup sequence size,
+  // and then shuffle the keys in that sequence to end up with a random sequence
+  // of keys. We store each of these shuffled sequences in a map to avoid
+  // repeatedly computing them.
+  llvm::OwningArrayRef<T>& lookup_keys =
+      (*lookup_keys_storage<T>)[{table_keys_size, lookup_keys_size}];
+  if (lookup_keys.empty()) {
+    lookup_keys = llvm::OwningArrayRef<T>(lookup_keys_size);
+    auto raw_keys = GetRawKeys<T>();
+    for (auto [index, key] : llvm::enumerate(lookup_keys)) {
+      key = raw_keys[index % table_keys_size];
+    }
+    absl::BitGen gen;
+    Shuffle(lookup_keys, gen);
+  }
+  CARBON_CHECK(static_cast<ssize_t>(lookup_keys.size()) == lookup_keys_size);
+
+  return lookup_keys;
+}
+
+}  // namespace
+
+template <typename T>
+auto GetKeysAndMissKeys(ssize_t table_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>> {
+  CARBON_CHECK(table_keys_size <= MaxNumKeys);
+  // The raw keys aren't shuffled and round-robin through the sizes. Take the
+  // tail of this sequence and shuffle it to form a random set of miss keys with
+  // a consistent total size.
+  static absl::NoDestructor<llvm::OwningArrayRef<T>> miss_keys{[] {
+    llvm::OwningArrayRef<T> keys;
+    keys = GetRawKeys<T>().take_back(NumOtherKeys);
+    CARBON_CHECK(keys.size() == NumOtherKeys);
+    absl::BitGen gen;
+    Shuffle(keys, gen);
+    return keys;
+  }()};
+
+  return {GetRawKeys<T>().slice(0, table_keys_size), *miss_keys};
+}
+template auto GetKeysAndMissKeys<int>(ssize_t size)
+    -> std::pair<llvm::ArrayRef<int>, llvm::ArrayRef<int>>;
+template auto GetKeysAndMissKeys<int*>(ssize_t size)
+    -> std::pair<llvm::ArrayRef<int*>, llvm::ArrayRef<int*>>;
+template auto GetKeysAndMissKeys<llvm::StringRef>(ssize_t size)
+    -> std::pair<llvm::ArrayRef<llvm::StringRef>,
+                 llvm::ArrayRef<llvm::StringRef>>;
+
+template <typename T>
+auto GetKeysAndHitKeys(ssize_t table_keys_size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>> {
+  CARBON_CHECK(table_keys_size <= MaxNumKeys);
+  CARBON_CHECK(lookup_keys_size <= MaxNumKeys);
+  return {GetRawKeys<T>().slice(0, table_keys_size),
+          GetShuffledLookupKeys<T>(table_keys_size, lookup_keys_size)};
+}
+template auto GetKeysAndHitKeys<int>(ssize_t size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<int>, llvm::ArrayRef<int>>;
+template auto GetKeysAndHitKeys<int*>(ssize_t size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<int*>, llvm::ArrayRef<int*>>;
+template auto GetKeysAndHitKeys<llvm::StringRef>(ssize_t size,
+                                                 ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<llvm::StringRef>,
+                 llvm::ArrayRef<llvm::StringRef>>;
+
+template <typename T>
+auto DumpHashStatistics(llvm::ArrayRef<T> keys) -> void {
+  if (keys.size() < GroupSize) {
+    return;
+  }
+
+  // The hash table load factor is 7/8ths, so we want to add 1/7th of our
+  // current size, subtract one, and pick the next power of two to get the power
+  // of two where 7/8ths is greater than or equal to the incoming key size.
+  ssize_t expected_size =
+      llvm::NextPowerOf2(keys.size() + (keys.size() / 7) - 1);
+
+  constexpr int GroupShift = llvm::CTLog2<GroupSize>();
+
+  size_t mask = ComputeProbeMaskFromSize(expected_size);
+  uint64_t salt = ComputeSeed();
+  auto get_hash_index = [mask, salt](auto x) -> ssize_t {
+    auto [hash_index, _] = HashValue(x, salt).template ExtractIndexAndTag<7>();
+    return (hash_index & mask) >> GroupShift;
+  };
+
+  std::vector<std::vector<int>> grouped_key_indices(expected_size >>
+                                                    GroupShift);
+  for (auto [i, k] : llvm::enumerate(keys)) {
+    ssize_t hash_index = get_hash_index(k);
+    CARBON_CHECK(hash_index < (expected_size >> GroupShift)) << hash_index;
+    grouped_key_indices[hash_index].push_back(i);
+  }
+  ssize_t max_group_index =
+      std::max_element(grouped_key_indices.begin(), grouped_key_indices.end(),
+                       [](const auto& lhs, const auto& rhs) {
+                         return lhs.size() < rhs.size();
+                       }) -
+      grouped_key_indices.begin();
+
+  // If the max number of collisions on the index is less than or equal to the
+  // group size, there shouldn't be any necessary probing (outside of deletion)
+  // and so this isn't interesting; skip printing.
+  if (grouped_key_indices[max_group_index].size() <= GroupSize) {
+    return;
+  }
+
+  llvm::errs() << "keys: " << keys.size()
+               << "  groups: " << grouped_key_indices.size() << "\n"
+               << "max group index: " << llvm::formatv("{0x8}", max_group_index)
+               << "  collisions: "
+               << grouped_key_indices[max_group_index].size() << "\n";
+
+  for (auto i : llvm::ArrayRef(grouped_key_indices[max_group_index])
+                    .take_front(2 * GroupSize)) {
+    auto k = keys[i];
+    auto hash = static_cast<uint64_t>(HashValue(k, salt));
+    llvm::errs() << "  key: " << k
+                 << "  salt: " << llvm::formatv("{0:x16}", salt)
+                 << "  hash: " << llvm::formatv("{0:x16}", hash) << "\n";
+  }
+}
+template auto DumpHashStatistics(llvm::ArrayRef<int> keys) -> void;
+template auto DumpHashStatistics(llvm::ArrayRef<int*> keys) -> void;
+template auto DumpHashStatistics(llvm::ArrayRef<llvm::StringRef> keys) -> void;
+
+}  // namespace Carbon::RawHashtable

+ 208 - 0
common/raw_hashtable_benchmark_helpers.h

@@ -0,0 +1,208 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_
+#define CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_
+
+#include <benchmark/benchmark.h>
+#include <sys/types.h>
+
+#include <limits>
+#include <map>
+#include <vector>
+
+#include "absl/base/no_destructor.h"
+#include "absl/random/random.h"
+#include "common/check.h"
+#include "common/hashing.h"
+#include "common/raw_hashtable.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace Carbon::RawHashtable {
+
+// We want to support benchmarking with 16M keys plus up to 256 "other" keys
+// (for misses). The large number of keys helps check for performance hiccups
+// with especially large tables and when missing all levels of cache.
+inline constexpr ssize_t NumOtherKeys = 1 << 8;
+inline constexpr ssize_t MaxNumKeys = (1 << 24) + NumOtherKeys;
+
+// Get an array of main keys with the given `size`, which must be less than
+// 2^24. Also get a miss keys array of `NumOtherKeys` which has no collisions
+// with the main keys.
+//
+// For a given size, this will return the same arrays. This uses unsynchronized
+// global state, and so is thread hostile and must not be called before main.
+template <typename T>
+auto GetKeysAndMissKeys(ssize_t table_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>>;
+
+// Get an array of main keys with the given `size`, which must be less than
+// 2^24. Also get a hit keys array of `lookup_keys_size`, all of which occur
+// in the main keys array. If the lookup size is larger than the main size, the
+// lookup sequence will contain duplicates.
+//
+// For a given size, this will return the same arrays. This uses unsynchronized
+// global state, and so is thread hostile and must not be called before main.
+template <typename T>
+auto GetKeysAndHitKeys(ssize_t table_keys_size, ssize_t lookup_keys_size)
+    -> std::pair<llvm::ArrayRef<T>, llvm::ArrayRef<T>>;
+
+// Dump statistics about hashing the given keys.
+template <typename T>
+auto DumpHashStatistics(llvm::ArrayRef<T> keys) -> void;
+
+// Convert values used in hashtable benchmarking to a bool. This is used to form
+// dependencies between values stored in the hashtable between benchmark
+// iterations.
+template <typename T>
+auto ValueToBool(T value) -> bool {
+  if constexpr (std::is_same_v<T, llvm::StringRef>) {
+    return value.size() > 0;
+  } else if constexpr (std::is_pointer_v<T>) {
+    return value != nullptr;
+  } else {
+    // We want our keys to include `0` for integers, so use the largest value.
+    return value != std::numeric_limits<T>::max();
+  }
+}
+
+inline auto SizeArgs(benchmark::internal::Benchmark* b) -> void {
+  // Benchmarks for "miss" operations only have one parameter -- the size of the
+  // table. These benchmarks use a fixed `NumOtherKeys` set of extra keys for
+  // each miss operation.
+  b->DenseRange(1, 4, 1);
+  b->Arg(8);
+  b->Arg(16);
+  b->Arg(32);
+
+  // For sizes >= 64 we first use the power of two which will have a low load
+  // factor, and then target exactly at our max load factor.
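+  // For example, 256 is benchmarked both as-is and as 256 - 256 / 8 = 224,
+  // which sits exactly at the 7/8ths load factor threshold.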
+  auto large_sizes = {64, 1 << 8, 1 << 12, 1 << 16, 1 << 20, 1 << 24};
+  for (auto s : large_sizes) {
+    b->Arg(s);
+  }
+  for (auto s : large_sizes) {
+    b->Arg(s - (s / 8));
+  }
+}
+
+inline auto HitArgs(benchmark::internal::Benchmark* b) -> void {
+  // There are two parameters for benchmarks of "hit" operations. The first is
+  // the size of the hashtable itself. The second is the size of a buffer of
+  // random keys actually in the hashtable to use for the operations.
+  //
+  // For small sizes, we use a fixed `NumOtherKeys` lookup key count. This is
+  // enough to avoid patterns of queries training the branch predictor just from
+  // the keys themselves, while small enough to avoid significant L1 cache
+  // pressure.
+  b->ArgsProduct({benchmark::CreateDenseRange(1, 4, 1), {NumOtherKeys}});
+  b->Args({8, NumOtherKeys});
+  b->Args({16, NumOtherKeys});
+  b->Args({32, NumOtherKeys});
+
+  // For sizes >= 64 we first use the power of two which will have a low load
+  // factor, and then target exactly at our max load factor. Start the sizes
+  // list off with the powers of two, and the append a version of each power of
+  // two adjusted down to the load factor. We'll then build the benchmarks from
+  // these below.
+  std::vector<ssize_t> large_sizes = {64,      1 << 8,  1 << 12,
+                                      1 << 16, 1 << 20, 1 << 24};
+  for (auto i : llvm::seq<int>(0, large_sizes.size())) {
+    ssize_t s = large_sizes[i];
+    large_sizes.push_back(s - (s / 8));
+  }
+
+  for (auto s : large_sizes) {
+    b->Args({s, NumOtherKeys});
+
+    // Once the sizes are more than 4x the `NumOtherKeys` minimum lookup buffer
+    // size, also include 25% and 50% lookup buffer sizes which will
+    // increasingly exhaust the ability to keep matching entries in the cache.
+    if (s >= NumOtherKeys) {
+      b->Args({s, s / 4});
+      b->Args({s, s / 2});
+    }
+  }
+}
+
+// Provide some viable Dense{Map,Set}Info implementations for the key types
+// using Carbon's hashing framework. These let us benchmark the data structure
+// alone rather than the combination of data structure and hashing routine.
+//
+// We only provide these for benchmarking -- they are *not* necessarily suitable
+// for broader use. The Carbon hashing infrastructure has only been evaluated in
+// the context of its specific hashtable design.
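+//
+// As an illustrative usage sketch, a benchmark might instantiate
+// `llvm::DenseMap<int, int, CarbonHashDI<int>>` or
+// `llvm::DenseSet<int*, CarbonHashDI<int*>>` to compare against the Carbon
+// containers while holding the hash function constant.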
+template <typename T>
+struct CarbonHashDI;
+
+template <>
+struct CarbonHashDI<int> {
+  static auto getEmptyKey() -> int { return -1; }
+  static auto getTombstoneKey() -> int { return -2; }
+  static auto getHashValue(const int val) -> unsigned {
+    return static_cast<uint64_t>(HashValue(val));
+  }
+  static auto isEqual(const int lhs, const int rhs) -> bool {
+    return lhs == rhs;
+  }
+};
+
+template <typename T>
+struct CarbonHashDI<T*> {
+  static constexpr uintptr_t Log2MaxAlign = 12;
+
+  static auto getEmptyKey() -> T* {
+    auto val = static_cast<uintptr_t>(-1);
+    val <<= Log2MaxAlign;
+    // NOLINTNEXTLINE(performance-no-int-to-ptr): This is required by the API.
+    return reinterpret_cast<T*>(val);
+  }
+
+  static auto getTombstoneKey() -> T* {
+    auto val = static_cast<uintptr_t>(-2);
+    val <<= Log2MaxAlign;
+    // NOLINTNEXTLINE(performance-no-int-to-ptr): This is required by the API.
+    return reinterpret_cast<T*>(val);
+  }
+
+  static auto getHashValue(const T* ptr_val) -> unsigned {
+    return static_cast<uint64_t>(HashValue(ptr_val));
+  }
+
+  static auto isEqual(const T* lhs, const T* rhs) -> bool { return lhs == rhs; }
+};
+
+template <>
+struct CarbonHashDI<llvm::StringRef> {
+  static auto getEmptyKey() -> llvm::StringRef {
+    return llvm::StringRef(
+        // NOLINTNEXTLINE(performance-no-int-to-ptr): Required by the API.
+        reinterpret_cast<const char*>(~static_cast<uintptr_t>(0)), 0);
+  }
+
+  static auto getTombstoneKey() -> llvm::StringRef {
+    return llvm::StringRef(
+        // NOLINTNEXTLINE(performance-no-int-to-ptr): Required by the API.
+        reinterpret_cast<const char*>(~static_cast<uintptr_t>(1)), 0);
+  }
+  static auto getHashValue(llvm::StringRef val) -> unsigned {
+    return static_cast<uint64_t>(HashValue(val));
+  }
+  static auto isEqual(llvm::StringRef lhs, llvm::StringRef rhs) -> bool {
+    if (rhs.data() == getEmptyKey().data()) {
+      return lhs.data() == getEmptyKey().data();
+    }
+    if (rhs.data() == getTombstoneKey().data()) {
+      return lhs.data() == getTombstoneKey().data();
+    }
+    return lhs == rhs;
+  }
+};
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_

+ 20 - 0
common/raw_hashtable_metadata_group.cpp

@@ -0,0 +1,20 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/raw_hashtable_metadata_group.h"
+
+#include "llvm/ADT/StringExtras.h"
+
+namespace Carbon::RawHashtable {
+
+auto MetadataGroup::Print(llvm::raw_ostream& out) const -> void {
+  out << "[";
+  llvm::ListSeparator sep;
+  for (uint8_t byte : metadata_bytes) {
+    out << sep << llvm::formatv("{0:x2}", byte);
+  }
+  out << "]";
+}
+
+}  // namespace Carbon::RawHashtable

+ 1093 - 0
common/raw_hashtable_metadata_group.h

@@ -0,0 +1,1093 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
+#define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
+
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+
+#include "common/check.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MathExtras.h"
+
+// Detect whether we can use SIMD accelerated implementations of the control
+// groups, and include the relevant platform specific APIs for the SIMD
+// implementations.
+//
+// Reference documentation for the SIMD APIs used here:
+// - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
+// - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
+#if defined(__SSSE3__)
+#include <x86intrin.h>
+#define CARBON_X86_SIMD_SUPPORT 1
+#elif defined(__ARM_NEON)
+#include <arm_neon.h>
+#define CARBON_NEON_SIMD_SUPPORT 1
+#endif
+
+// This namespace collects low-level utilities for implementing hashtable
+// data structures. This file only provides one of them:
+//
+// - Primitives to manage "groups" of hashtable entries that have densely packed
+//   control bytes we can scan rapidly as a group, often using SIMD facilities
+//   to process the entire group at once.
+namespace Carbon::RawHashtable {
+
+// We define a constant max group size. The particular group size used in
+// practice may vary, but we want to have some upper bound used to ensure
+// memory allocation is done consistently across different architectures.
+constexpr ssize_t MaxGroupSize = 16;
+
+// This takes a collection of bits representing the results of looking for a
+// particular tag in this metadata group and determines the first position with
+// a match. The position is represented by either the least significant set bit
+// or the least significant non-zero byte, depending on `ByteEncoding`. When
+// represented with a non-zero byte, that byte must have at least its most
+// significant bit set, but may have other bits set to any value. Bits more
+// significant than the match may have any value provided there is at least one
+// match. Zero matches must be represented by a zero input.
+//
+// Some bits of the underlying value may be known-zero, which can optimize
+// various operations. These can be represented as a `ZeroMask`.
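+//
+// As an illustrative example: with the bit encoding, match bits of 0b0100
+// yield an index of 2; with the byte encoding, match bits of
+// 0x0000'0000'0080'0000 (the third-lowest byte has its high bit set) also
+// yield an index of 2.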
+template <typename BitsInputT, bool ByteEncoding, BitsInputT ZeroMask = 0>
+class BitIndex
+    : public Printable<BitIndex<BitsInputT, ByteEncoding, ZeroMask>> {
+ public:
+  using BitsT = BitsInputT;
+
+  BitIndex() = default;
+  explicit BitIndex(BitsT bits) : bits_(bits) {}
+
+  friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool {
+    if (lhs.empty() || rhs.empty()) {
+      return lhs.empty() == rhs.empty();
+    }
+    // For non-empty bit indices, compare the indices directly to ignore other
+    // (extraneous) parts of the incoming bits.
+    return lhs.index() == rhs.index();
+  }
+
+  auto Print(llvm::raw_ostream& out) const -> void {
+    out << llvm::formatv("{0:x}", bits_);
+  }
+
+  explicit operator bool() const { return !empty(); }
+
+  // Returns true when there are no matches for the tag.
+  auto empty() const -> bool {
+    CARBON_DCHECK((bits_ & ZeroMask) == 0) << "Unexpected non-zero bits!";
+    __builtin_assume((bits_ & ZeroMask) == 0);
+    return bits_ == 0;
+  }
+
+  // Returns the index of the first matched tag.
+  auto index() -> ssize_t {
+    CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
+    __builtin_assume(bits_ != 0);
+    ssize_t index = unscaled_index();
+
+    if constexpr (ByteEncoding) {
+      // Shift to scale out of the byte encoding.
+      index >>= ByteEncodingShift;
+    }
+
+    return index;
+  }
+
+  // Optimized tool to index a pointer `p` by `index()`.
+  template <typename T>
+  auto index_ptr(T* pointer) -> T* {
+    CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
+    __builtin_assume(bits_ != 0);
+    if constexpr (!ByteEncoding) {
+      return &pointer[unscaled_index()];
+    }
+
+    ssize_t index = unscaled_index();
+
+    // Scale the index as we counted zero *bits* and not zero *bytes*.
+    // However, we can fold that scale with the size of `T` when it is a power
+    // of two or divisible by 8.
+    CARBON_DCHECK(
+        (index & ((static_cast<size_t>(1) << ByteEncodingShift) - 1)) == 0);
+    if constexpr (sizeof(T) % 8 == 0) {
+      constexpr size_t FoldedScale = sizeof(T) / 8;
+      index *= FoldedScale;
+      return reinterpret_cast<T*>(
+          &reinterpret_cast<std::byte*>(pointer)[index]);
+    } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) {
+      constexpr size_t ScaleShift = llvm::CTLog2<sizeof(T)>();
+      static_assert(ScaleShift <= ByteEncodingShift,
+                    "Scaling by >=8 should be handled above!");
+      constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift;
+      index >>= FoldedShift;
+      return reinterpret_cast<T*>(
+          &reinterpret_cast<std::byte*>(pointer)[index]);
+    }
+
+    // Nothing we can fold here.
+    return &pointer[index >> ByteEncodingShift];
+  }
+
+ private:
+  // When using a byte encoding, we'll need to shift any index by this amount.
+  static constexpr size_t ByteEncodingShift = 3;
+
+  auto unscaled_index() -> ssize_t {
+    if constexpr (!ByteEncoding) {
+      // Note the cast to `size_t` to force zero extending the result.
+      return static_cast<size_t>(llvm::countr_zero(bits_));
+    } else {
+      // The index is encoded in the high bit of each byte. We compute the index
+      // by counting the number of low zero bytes there are before the first
+      // byte with its high bit set. Rather than shifting the high bit to be the
+      // low bit and counting the trailing (least significant) zero bits
+      // directly, we instead byte-reverse the bits and count the *leading*
+      // (most significant) zero bits. While this may be a wash on CPUs with
+      // direct support for counting the trailing zero bits, AArch64 only
+      // supports counting the leading zero bits and requires a bit-reverse to
+      // count the trailing zero bits. Doing the byte-reverse approach
+      // essentially combines moving the high bit into the low bit and the
+      // reverse necessary for counting the zero bits. While this only removes
+      // one instruction, it is an instruction in the critical path of the
+      // hottest part of table lookup, and that critical path dependency height
+      // is few enough instructions that removing even one significantly impacts
+      // latency.
+      //
+      // We also cast to `size_t` to clearly zero-extend the result.
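+      //
+      // Illustrative example: match bits of 0x0000'0000'0080'0000 byte-swap
+      // to 0x0000'8000'0000'0000, which has 16 leading zero bits; `index()`
+      // then shifts right by `ByteEncodingShift` (3) to recover byte index 2.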
+      return static_cast<size_t>(llvm::countl_zero(llvm::byteswap(bits_)));
+    }
+  }
+
+  BitsT bits_ = 0;
+};
+
+// This is like `BitIndex`, but allows iterating through all of the matches.
+//
+// A key requirement for efficient iteration is that all of the matches are
+// represented with a single bit and there are no other bits set. For example,
+// with byte-encoded bit indices, exactly the high bit and no other bit of each
+// matching byte must be set. This is a stricter constraint than what `BitIndex`
+// alone would impose on any one of the matches.
+template <typename BitIndexT>
+class BitIndexRange : public Printable<BitIndexRange<BitIndexT>> {
+ public:
+  using BitsT = BitIndexT::BitsT;
+
+  class Iterator
+      : public llvm::iterator_facade_base<Iterator, std::forward_iterator_tag,
+                                          ssize_t, ssize_t> {
+   public:
+    Iterator() = default;
+    explicit Iterator(BitsT bits) : bits_(bits) {}
+
+    auto operator==(const Iterator& rhs) const -> bool {
+      return bits_ == rhs.bits_;
+    }
+
+    auto operator*() -> ssize_t& {
+      CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
+      __builtin_assume(bits_ != 0);
+      index_ = BitIndexT(bits_).index();
+      // Note that we store the index in a member so we can return a reference
+      // to it here as required to be a forward iterator.
+      return index_;
+    }
+
+    template <typename T>
+    auto index_ptr(T* pointer) -> T* {
+      return BitIndexT(bits_).index_ptr(pointer);
+    }
+
+    auto operator++() -> Iterator& {
+      CARBON_DCHECK(bits_ != 0) << "Must not increment past the end!";
+      __builtin_assume(bits_ != 0);
+      // Clears the least significant set bit, effectively stepping to the next
+      // match.
+      bits_ &= (bits_ - 1);
+      return *this;
+    }
+
+   private:
+    ssize_t index_;
+    BitsT bits_ = 0;
+  };
+
+  BitIndexRange() = default;
+  explicit BitIndexRange(BitsT bits) : bits_(bits) {}
+
+  explicit operator bool() const { return !empty(); }
+  auto empty() const -> bool { return BitIndexT(bits_).empty(); }
+
+  auto begin() const -> Iterator { return Iterator(bits_); }
+  auto end() const -> Iterator { return Iterator(); }
+
+  friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool {
+    return lhs.bits_ == rhs.bits_;
+  }
+
+  auto Print(llvm::raw_ostream& out) const -> void {
+    out << llvm::formatv("{0:x}", bits_);
+  }
+
+  explicit operator BitsT() const { return bits_; }
+  explicit operator BitIndexT() const { return BitIndexT(bits_); }
+
+ private:
+  BitsT bits_ = 0;
+};
+
+// A group of metadata bytes that can be manipulated together.
+//
+// The metadata bytes used in Carbon's hashtable implementation are designed to
+// support being manipulated as groups, either using architecture-specific SIMD
+// code sequences or using portable SIMD-in-an-integer-register code sequences.
+// These operations are unusually performance sensitive, sometimes in
+// surprising ways.
+// optimize the particular usages in Carbon's hashtable and should not be
+// expected to be reusable in any other context.
+//
+// Throughout the functions operating on this type we try to use patterns with a
+// fallback portable implementation which can be directly used in the absence of
+// a SIMD implementation, but is also used (with the same code) to check that
+// any SIMD implementation produces the same result as the portable one. These
+// patterns help minimize un-compiled or un-tested paths through either portable
+// or SIMD code, regardless of which path is actually *used* on a particular
+// platform. To illustrate a common version of this pattern, we might have code
+// like:
+//
+// ```cpp
+// auto MetadataGroup::Operation(...) -> ... {
+//   ... portable_result;
+//   ... simd_result;
+//   if constexpr (!UseSIMD || DebugSIMD) {
+//     portable_result = PortableOperation(...);
+//   }
+//   if constexpr (UseSIMD || DebugSIMD) {
+//     simd_result = SIMDOperation(...);
+//     CARBON_DCHECK(simd_result == portable_result) << ...;
+//   }
+//   return UseSIMD ? simd_result : portable_result;
+// }
+// ```
+class MetadataGroup : public Printable<MetadataGroup> {
+ public:
+  static constexpr ssize_t Size =
+#if CARBON_X86_SIMD_SUPPORT
+      16;
+#else
+      8;
+#endif
+  static_assert(Size >= 8);
+  static_assert(Size % 8 == 0);
+  static_assert(Size <= MaxGroupSize);
+  static_assert(MaxGroupSize % Size == 0);
+  static_assert(llvm::isPowerOf2_64(Size),
+                "The group size must be a constant power of two so dividing by "
+                "it is a simple shift.");
+  static constexpr ssize_t Mask = Size - 1;
+
+  // Each control byte can have special values. All special values have the
+  // most significant bit cleared to distinguish them from the seven hash bits
+  // stored when the control byte represents a full bucket.
+  //
+  // Otherwise, their values are chosen primarily to provide efficient SIMD
+  // implementations of the common operations on an entire control group.
+  static constexpr uint8_t Empty = 0;
+  static constexpr uint8_t Deleted = 1;
+
+  static constexpr uint8_t PresentMask = 0b1000'0000;
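+  // A present byte therefore has the form `0b1ttt'tttt`, where the `t` bits
+  // are the 7 tag bits taken from the key's hash.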
+
+  // Some architectures make it much more efficient to build the match indices
+  // in a byte-encoded form rather than a bit-encoded form. This encoding
+  // changes verification and other aspects of our algorithms.
+  static constexpr bool ByteEncoding =
+#if CARBON_X86_SIMD_SUPPORT
+      false;
+#else
+      true;
+#endif
+  static_assert(!ByteEncoding || Size == 8,
+                "We can only support byte encoding with a group size of 8.");
+
+  // We need to indicate to users of the metadata group when they can
+  // efficiently hold a group value in a "register" (local variable) across
+  // clearing of individual bytes in the group. If the entire group can fit in
+  // an integer
+  // register, this works well and clients of the group should work to use the
+  // already-loaded value when clearing bytes. But when we have a larger group
+  // size, clearing the byte will typically require storing a byte to memory and
+  // re-loading the group. The usage patterns that need to clear bytes can in
+  // those cases avoid clearing a loaded group, and clear the byte directly in
+  // the larger metadata array.
+  static constexpr bool FastByteClear = Size == 8;
+
+  using MatchIndex =
+      BitIndex<std::conditional_t<ByteEncoding, uint64_t, uint32_t>,
+               ByteEncoding,
+               /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>;
+  using MatchRange = BitIndexRange<MatchIndex>;
+
+  union {
+    uint8_t metadata_bytes[Size];
+    uint64_t metadata_ints[Size / 8];
+#if CARBON_NEON_SIMD_SUPPORT
+    uint8x8_t metadata_vec = {};
+    static_assert(sizeof(metadata_vec) == Size);
+#elif CARBON_X86_SIMD_SUPPORT
+    __m128i metadata_vec = {};
+    static_assert(sizeof(metadata_vec) == Size);
+#endif
+  };
+
+  auto Print(llvm::raw_ostream& out) const -> void;
+
+  friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool {
+    return CompareEqual(lhs, rhs);
+  }
+
+  // The main API for this class. This API will switch between a portable and
+  // SIMD implementation based on what is most efficient, but in debug builds
+  // will cross check that the implementations do not diverge.
+
+  // Load and return a group of metadata bytes out of the main metadata array at
+  // a particular `index`. The index must be a multiple of `GroupSize`. This
+  // will arrange for the load to place the group into the correct structure for
+  // efficient register-based processing.
+  static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
+
+  // Store this metadata group into the main metadata array at the provided
+  // `index`. The index must be a multiple of `GroupSize`.
+  auto Store(uint8_t* metadata, ssize_t index) const -> void;
+
+  // Clear a byte of this group's metadata at the provided `byte_index` to the
+  // empty value. Note that this must only be called when `FastByteClear` is
+  // true -- in all other cases users of this class should arrange to clear
+  // individual bytes in the underlying array rather than using the group API.
+  auto ClearByte(ssize_t byte_index) -> void;
+
+  // Clear all of this group's metadata bytes that indicate a deleted slot to
+  // the empty value.
+  auto ClearDeleted() -> void;
+
+  // Find all of the bytes of metadata in this group that are present and whose
+  // low 7 bits match the provided `tag`. The `tag` byte must have a clear high
+  // bit, only 7 bits of tag are used. Note that this means the provided tag is
+  // *not* the actual present metadata byte -- this function is responsible for
+  // mapping the tag into that form as it can do so more efficiently in some
+  // cases. A range over all of the byte indices which matched is returned.
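+  //
+  // A hypothetical sketch of how a caller might consume the returned range in
+  // a probe loop (assuming an `entries` array stored in parallel with the
+  // metadata; the names here are illustrative rather than part of this API):
+  //
+  // ```cpp
+  // MetadataGroup g = MetadataGroup::Load(metadata, group_index);
+  // for (ssize_t byte_index : g.Match(tag)) {
+  //   if (key_eq(entries[group_index + byte_index].key, lookup_key)) {
+  //     // Found the matching entry.
+  //   }
+  // }
+  // ```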
+  auto Match(uint8_t tag) const -> MatchRange;
+
+  // Find all of the present bytes of metadata in this group. A range over all
+  // of the byte indices which are present is returned.
+  auto MatchPresent() const -> MatchRange;
+
+  // Find the first byte of the metadata group that is empty and return that
+  // index. There is no required order or position for which of the bytes of
+  // metadata is considered "first"; any model that makes it efficient to
+  // produce the matching index will do. Must return an empty match index if no
+  // bytes match the empty metadata.
+  auto MatchEmpty() const -> MatchIndex;
+
+  // Find the first byte of the metadata group that is deleted and return that
+  // index. There is no required order or position for which of the bytes of
+  // metadata is considered "first"; any model that makes it efficient to
+  // produce the matching index will do. Must return an empty match index if no
+  // bytes match the deleted metadata.
+  auto MatchDeleted() const -> MatchIndex;
+
+ private:
+  // Two classes only defined in the benchmark code are allowed to directly call
+  // the portable and SIMD implementations for benchmarking purposes.
+  friend class BenchmarkPortableMetadataGroup;
+  friend class BenchmarkSIMDMetadataGroup;
+
+  // Whether to use a SIMD implementation. Even when we *support* a SIMD
+  // implementation, we do not always have to use it in the event that it is
+  // less efficient than the portable version.
+  static constexpr bool UseSIMD =
+#if CARBON_X86_SIMD_SUPPORT
+      true;
+#else
+      false;
+#endif
+
+  // All SIMD variants that we have an implementation for should be enabled for
+  // debugging. This lets us maintain a SIMD implementation even if it is not
+  // used due to performance reasons, and easily re-enable it if the performance
+  // changes.
+  static constexpr bool DebugSIMD =
+#if !defined(NDEBUG) && (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT)
+      true;
+#else
+      false;
+#endif
+
+  // Masks with the most and least significant bits of every byte set.
+  static constexpr uint64_t MSBs = 0x8080'8080'8080'8080ULL;
+  static constexpr uint64_t LSBs = 0x0101'0101'0101'0101ULL;
+
+  using MatchBitsT = MatchIndex::BitsT;
+
+  static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;
+
+  // Functions for validating that the returned matches agree with what is
+  // predicted by the `byte_match` function. These either `CHECK`-fail or
+  // return true. To pass validation, the `*_bits` argument must have `0x80`
+  // for those bytes where `byte_match` returns true, and `0` for the rest.
+
+  // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only
+  // promise to return accurate information up to the first match.
+  auto VerifyIndexBits(
+      MatchBitsT index_bits,
+      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;
+  // `VerifyRangeBits` is for functions that return `MatchRange`, and so it
+  // validates all the bytes of `range_bits`.
+  auto VerifyRangeBits(
+      MatchBitsT range_bits,
+      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;
+
+  // Portable implementations of each operation. These are used on platforms
+  // without SIMD support or where the portable implementation is faster than
+  // SIMD. They are heavily optimized even though they are not SIMD because we
+  // expect there to be platforms where the portable implementation can
+  // outperform SIMD. Their behavior and semantics exactly match the
+  // documentation for the un-prefixed functions.
+  //
+  // In debug builds, these also directly verify their results to help establish
+  // baseline functionality.
+  static auto PortableLoad(const uint8_t* metadata, ssize_t index)
+      -> MetadataGroup;
+  auto PortableStore(uint8_t* metadata, ssize_t index) const -> void;
+
+  auto PortableClearDeleted() -> void;
+
+  auto PortableMatch(uint8_t tag) const -> MatchRange;
+  auto PortableMatchPresent() const -> MatchRange;
+
+  auto PortableMatchEmpty() const -> MatchIndex;
+  auto PortableMatchDeleted() const -> MatchIndex;
+
+  static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs)
+      -> bool;
+
+  // SIMD implementations of each operation. We minimize platform-specific APIs
+  // to reduce the scope of errors that can only be discovered building on one
+  // platform, so the bodies of these contain the platform-specific code. Their
+  // behavior and semantics exactly match the documentation for the un-prefixed
+  // functions.
+  //
+  // These routines don't directly verify their results as we can build simpler
+  // debug checks by comparing them against the verified portable results.
+  static auto SIMDLoad(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
+  auto SIMDStore(uint8_t* metadata, ssize_t index) const -> void;
+
+  auto SIMDClearDeleted() -> void;
+
+  auto SIMDMatch(uint8_t tag) const -> MatchRange;
+  auto SIMDMatchPresent() const -> MatchRange;
+
+  auto SIMDMatchEmpty() const -> MatchIndex;
+  auto SIMDMatchDeleted() const -> MatchIndex;
+
+  static auto SIMDCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;
+
+#if CARBON_X86_SIMD_SUPPORT
+  // A common routine for x86 SIMD matching that can be used for matching
+  // present, empty, and deleted bytes with equal efficiency.
+  auto X86SIMDMatch(uint8_t match_byte) const -> MatchRange;
+#endif
+};
+
+// Promote the size and mask to top-level constants as we'll need to operate on
+// the grouped structure outside of the metadata bytes.
+inline constexpr ssize_t GroupSize = MetadataGroup::Size;
+inline constexpr ssize_t GroupMask = MetadataGroup::Mask;
+
+inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index)
+    -> MetadataGroup {
+  MetadataGroup portable_g;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_g = PortableLoad(metadata, index);
+    if constexpr (!UseSIMD) {
+      return portable_g;
+    }
+  }
+  MetadataGroup g = SIMDLoad(metadata, index);
+  CARBON_DCHECK(g == portable_g);
+  return g;
+}
+
+inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const
+    -> void {
+  if constexpr (!UseSIMD) {
+    std::memcpy(metadata + index, &metadata_bytes, Size);
+  } else {
+    SIMDStore(metadata, index);
+  }
+  CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size));
+}
+
+inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void {
+  CARBON_DCHECK(FastByteClear) << "Only use byte clearing when fast!";
+  CARBON_DCHECK(Size == 8)
+      << "The clear implementation assumes an 8-byte group.";
+
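+  // For example, `byte_index` 2 computes the mask `~0x0000'0000'00ff'0000` and
+  // so clears bits 16 through 23, i.e. the third byte of the group.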
+  metadata_ints[0] &= ~(static_cast<uint64_t>(0xff) << (byte_index * 8));
+}
+
+inline auto MetadataGroup::ClearDeleted() -> void {
+  MetadataGroup portable_g = *this;
+  MetadataGroup simd_g = *this;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_g.PortableClearDeleted();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_g.SIMDClearDeleted();
+    CARBON_DCHECK(simd_g == portable_g)
+        << "SIMD cleared group '" << simd_g
+        << "' doesn't match portable cleared group '" << portable_g << "'";
+  }
+  *this = UseSIMD ? simd_g : portable_g;
+}
+
+inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange {
+  // The caller should provide the 7-bit tag derived from the hash and must not
+  // set the present bit on it; this layer manages tagging the high bit of a
+  // present byte.
+  CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag);
+
+  MatchRange portable_result;
+  MatchRange simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatch(tag);
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatch(tag);
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::MatchPresent() const -> MatchRange {
+  MatchRange portable_result;
+  MatchRange simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatchPresent();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatchPresent();
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::MatchEmpty() const -> MatchIndex {
+  MatchIndex portable_result;
+  MatchIndex simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatchEmpty();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatchEmpty();
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::MatchDeleted() const -> MatchIndex {
+  MatchIndex portable_result;
+  MatchIndex simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableMatchDeleted();
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDMatchDeleted();
+    CARBON_DCHECK(simd_result == portable_result)
+        << "SIMD result '" << simd_result << "' doesn't match portable result '"
+        << portable_result << "'";
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs)
+    -> bool {
+  bool portable_result;
+  bool simd_result;
+  if constexpr (!UseSIMD || DebugSIMD) {
+    portable_result = PortableCompareEqual(lhs, rhs);
+  }
+  if constexpr (UseSIMD || DebugSIMD) {
+    simd_result = SIMDCompareEqual(lhs, rhs);
+    CARBON_DCHECK(simd_result == portable_result);
+  }
+  return UseSIMD ? simd_result : portable_result;
+}
+
+inline auto MetadataGroup::VerifyIndexBits(
+    MatchBitsT index_bits,
+    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
+  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
+    if constexpr (!ByteEncoding) {
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK(((index_bits >> byte_index) & 1) == 1)
+            << "Bit not set at matching byte index: " << byte_index;
+        // Only the first match is needed, so stop scanning once found.
+        break;
+      }
+
+      CARBON_CHECK(((index_bits >> byte_index) & 1) == 0)
+          << "Bit set at non-matching byte index: " << byte_index;
+    } else {
+      // `index_bits` is byte-encoded rather than bit encoded, so extract a
+      // byte.
+      uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF;
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK((index_byte & 0x80) == 0x80)
+            << "Should have the high bit set for a matching byte, found: "
+            << llvm::formatv("{0:x}", index_byte);
+        // Only the first match is needed so stop scanning once found.
+        break;
+      }
+
+      CARBON_CHECK(index_byte == 0)
+          << "Should have no bits set for an unmatched byte, found: "
+          << llvm::formatv("{0:x}", index_byte);
+    }
+  }
+  return true;
+}
+
+inline auto MetadataGroup::VerifyRangeBits(
+    MatchBitsT range_bits,
+    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
+  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
+    if constexpr (!ByteEncoding) {
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK(((range_bits >> byte_index) & 1) == 1)
+            << "Bit not set at matching byte index: " << byte_index;
+      } else {
+        CARBON_CHECK(((range_bits >> byte_index) & 1) == 0)
+            << "Bit set at non-matching byte index: " << byte_index;
+      }
+    } else {
+      // `range_bits` is byte-encoded rather than bit encoded, so extract a
+      // byte.
+      uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF;
+      if (byte_match(metadata_bytes[byte_index])) {
+        CARBON_CHECK(range_byte == 0x80)
+            << "Should just have the high bit set for a matching byte, found: "
+            << llvm::formatv("{0:x}", range_byte);
+      } else {
+        CARBON_CHECK(range_byte == 0)
+            << "Should have no bits set for an unmatched byte, found: "
+            << llvm::formatv("{0:x}", range_byte);
+      }
+    }
+  }
+  return true;
+}
+
+inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index)
+    -> MetadataGroup {
+  MetadataGroup g;
+  static_assert(sizeof(g) == Size);
+  std::memcpy(&g.metadata_bytes, metadata + index, Size);
+  return g;
+}
+
+inline auto MetadataGroup::PortableStore(uint8_t* metadata, ssize_t index) const
+    -> void {
+  std::memcpy(metadata + index, &metadata_bytes, Size);
+}
+
+inline auto MetadataGroup::PortableClearDeleted() -> void {
+  for (uint64_t& metadata_int : metadata_ints) {
+    // Deleted bytes have only the least significant bits set, so to clear them
+    // we only need to clear the least significant bit. And empty bytes already
+    // have a clear least significant bit, so the only least significant bits we
+    // need to preserve are those of present bytes. The most significant bit of
+    // every present byte is set, so we take the most significant bit of each
+    // byte, shift it into the least significant bit position, and bit-or it
+    // with the complement of `LSBs`. This will have ones for every bit but the
+    // least significant bits, and ones for the least significant bits of every
+    // present byte.
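+    // For example, per byte: a present byte such as 0xaa produces a mask byte
+    // of 0xff and is preserved; a deleted byte 0x01 produces a mask byte of
+    // 0xfe and becomes 0x00 (empty); an empty byte 0x00 is already 0 and stays
+    // unchanged.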
+    metadata_int &= (~LSBs | metadata_int >> 7);
+  }
+}
+
+inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange {
+  // The caller should provide the 7-bit tag derived from the hash and must not
+  // set the present bit on it; this layer manages tagging the high bit of a
+  // present byte.
+  CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag);
+
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t match_bits = 0;
+    uint32_t bit = 1;
+    uint8_t present_byte = tag | PresentMask;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] == present_byte) {
+        match_bits |= bit;
+      }
+      bit <<= 1;
+    }
+    return MatchRange(match_bits);
+  }
+
+  // This algorithm only works for matching *present* bytes. We leverage the
+  // set high bit in the present case as part of the algorithm. The whole
+  // algorithm has a critical path height of 4 operations, and does 6
+  // operations total on AArch64. The operation dependency graph is:
+  //
+  //          group | MSBs        LSBs * match_byte + MSBs
+  //                 \                /
+  //                 match_bits ^ broadcast
+  //                            |
+  //   group & MSBs        MSBs - match_bits
+  //          \                /
+  //        group_MSBs & match_bits
+  //
+  // This diagram and the operation count are specific to AArch64 where we have
+  // a fused *integer* multiply-add operation.
+  //
+  // While it is superficially similar to the "find zero bytes in a word" bit
+  // math trick, it is different because this is designed to have no false
+  // positives and perfectly produce 0x80 for matching bytes and 0x00 for
+  // non-matching bytes. This is doable because we constrain ourselves to only
+  // handle present matches, which only require testing 7 bits and have a
+  // particular layout.
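+  //
+  // As a concrete per-byte illustration, take tag 0x2a, so a matching present
+  // byte is 0xaa:
+  // - Matching present byte 0xaa: (0xaa | 0x80) ^ (0x2a + 0x80) == 0x00, then
+  //   0x80 - 0x00 == 0x80, and 0x80 & (0xaa & 0x80) == 0x80.
+  // - Non-matching present byte 0xb3: 0xb3 ^ 0xaa == 0x19, then
+  //   0x80 - 0x19 == 0x67, and 0x67 & 0x80 == 0x00.
+  // - Empty byte 0x00: (0x00 | 0x80) ^ 0xaa == 0x2a, then 0x80 - 0x2a == 0x56,
+  //   and the final mask with (0x00 & 0x80) clears it regardless.
+  // The byte-wise subtraction cannot borrow across bytes because every byte of
+  // `match_bits` has its high bit clear after the xor.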
+
+  // Set the high bit of every byte to `1`. Any matching byte is a present byte
+  // and so always has this bit set as well, which means the xor below, in
+  // addition to zeroing the low 7 bits of any byte that matches the tag, also
+  // clears the high bit of every byte.
+  uint64_t match_bits = metadata_ints[0] | MSBs;
+  // Broadcast the match byte to all bytes, and mask in the present bits in the
+  // MSBs of each byte. We structure this as a multiply and an add because we
+  // know that the add cannot carry, and this way it can be lowered using
+  // combined multiply-add instructions if available.
+  uint64_t broadcast = LSBs * tag + MSBs;
+  CARBON_DCHECK(broadcast == (LSBs * tag | MSBs))
+      << "Unexpected carry from addition!";
+
+  // Xor the broadcast byte pattern. This makes bytes with matches become 0, and
+  // clears the high-bits of non-matches. Note that if we are looking for a tag
+  // with the same value as `Empty` or `Deleted`, those bytes will be zero as
+  // well.
+  match_bits = match_bits ^ broadcast;
+  // Subtract each byte of `match_bits` from `0x80` bytes. After this, the high
+  // bit will be set only for those bytes that were zero.
+  match_bits = MSBs - match_bits;
+  // Zero everything but the high bits, and also zero the high bits of any bytes
+  // for "not present" slots in the original group. This avoids false positives
+  // for `Empty` and `Deleted` bytes in the metadata.
+  match_bits &= (metadata_ints[0] & MSBs);
+
+  // At this point, `match_bits` has the high bit set for bytes where the
+  // original group byte equals `tag` plus the high bit.
+  CARBON_DCHECK(VerifyRangeBits(
+      match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); }));
+  return MatchRange(match_bits);
+}
+
+inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange {
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t match_bits = 0;
+    uint32_t bit = 1;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] & PresentMask) {
+        match_bits |= bit;
+      }
+      bit <<= 1;
+    }
+    return MatchRange(match_bits);
+  }
+
+  // Want to keep the high bit of each byte, which indicates whether that byte
+  // represents a present slot.
+  uint64_t match_bits = metadata_ints[0] & MSBs;
+
+  CARBON_DCHECK(VerifyRangeBits(
+      match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; }));
+  return MatchRange(match_bits);
+}
+
+inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex {
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t bit = 1;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] == Empty) {
+        return MatchIndex(bit);
+      }
+      bit <<= 1;
+    }
+    return MatchIndex(0);
+  }
+
+  // This sets the high bit of every byte in `match_bits` unless the
+  // corresponding metadata byte is 0. We take advantage of the fact that
+  // the metadata bytes are non-zero only if they are either:
+  // - present: in which case the high bit of the byte will already be set; or
+  // - deleted: in which case the byte will be 1, and shifting it left by 7 will
+  //   cause the high bit to be set.
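+  // Per byte, only the high bit matters after the mask below: present bytes
+  // (for example 0xaa) already have it set, a deleted byte 0x01 gets it set
+  // from its shifted low bit, and only an empty byte 0x00 leaves it clear.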
+  uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7);
+  // This inverts the high bits of the bytes, and clears the remaining bits.
+  match_bits = ~match_bits & MSBs;
+
+  // The high bits of the bytes of `match_bits` are set if the corresponding
+  // metadata byte is `Empty`.
+  CARBON_DCHECK(
+      VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; }));
+  return MatchIndex(match_bits);
+}
+
+inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex {
+  // Use a simple fallback approach for sizes beyond 8.
+  // TODO: Instead of a simple fallback, we should generalize the below
+  // algorithm for sizes above 8, even if to just exercise the same code on
+  // more platforms.
+  if constexpr (Size > 8) {
+    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
+    uint32_t bit = 1;
+    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
+      if (metadata_bytes[i] == Deleted) {
+        return MatchIndex(bit);
+      }
+      bit <<= 1;
+    }
+    return MatchIndex(0);
+  }
+
+  // This sets the high bit of every byte in `match_bits` unless the
+  // corresponding metadata byte is 1. We take advantage of the fact that the
+  // metadata bytes are not 1 only if they are either:
+  // - present: in which case the high bit of the byte will already be set; or
+  // - empty: in which case the byte will be 0, so inverting it and shifting
+  //   left by 7 will set the high bit.
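+  // Per byte, only the high bit matters after the mask below: present bytes
+  // already have it set, an empty byte 0x00 gets it set from its inverted and
+  // shifted low bit, and only a deleted byte 0x01 leaves it clear.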
+  uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7);
+  // This inverts the high bits of the bytes, and clears the remaining bits.
+  match_bits = ~match_bits & MSBs;
+
+  // The high bits of the bytes of `match_bits` are set if the corresponding
+  // metadata byte is `Deleted`.
+  CARBON_DCHECK(VerifyIndexBits(match_bits,
+                                [](uint8_t byte) { return byte == Deleted; }));
+  return MatchIndex(match_bits);
+}
+
+inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs,
+                                                MetadataGroup rhs) -> bool {
+  return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes);
+}
+
+inline auto MetadataGroup::SIMDLoad(const uint8_t* metadata, ssize_t index)
+    -> MetadataGroup {
+  MetadataGroup g;
+#if CARBON_NEON_SIMD_SUPPORT
+  g.metadata_vec = vld1_u8(metadata + index);
+#elif CARBON_X86_SIMD_SUPPORT
+  g.metadata_vec =
+      _mm_load_si128(reinterpret_cast<const __m128i*>(metadata + index));
+#else
+  static_assert(!UseSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(metadata);
+  static_cast<void>(index);
+#endif
+  return g;
+}
+
+inline auto MetadataGroup::SIMDStore(uint8_t* metadata, ssize_t index) const
+    -> void {
+#if CARBON_NEON_SIMD_SUPPORT
+  vst1_u8(metadata + index, metadata_vec);
+#elif CARBON_X86_SIMD_SUPPORT
+  _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec);
+#else
+  static_assert(!UseSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(metadata);
+  static_cast<void>(index);
+#endif
+}
+
+inline auto MetadataGroup::SIMDClearDeleted() -> void {
+#if CARBON_NEON_SIMD_SUPPORT
+  // There is no good Neon operation to implement this, so do it using integer
+  // code. This is reasonably fast, but unfortunate because it forces the group
+  // out of a SIMD register and into a general purpose register, which can have
+  // high latency.
+  metadata_ints[0] &= (~LSBs | metadata_ints[0] >> 7);
+#elif CARBON_X86_SIMD_SUPPORT
+  // For each byte, use `metadata_vec` if the byte's high bit is set (indicating
+  // it is present), otherwise (it is empty or deleted) replace it with zero
+  // (representing empty).
+  metadata_vec =
+      _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec);
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+}
+
+inline auto MetadataGroup::SIMDMatch(uint8_t tag) const -> MatchRange {
+  MatchRange result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Broadcast byte we want to match to every byte in the vector.
+  auto match_byte_vec = vdup_n_u8(tag | PresentMask);
+  // Result bytes have all bits set for the bytes that match, so we have to
+  // clear everything but MSBs next.
+  auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec);
+  uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchRange(match_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  result = X86SIMDMatch(tag | PresentMask);
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(tag);
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDMatchPresent() const -> MatchRange {
+  MatchRange result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Just extract the metadata directly.
+  uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchRange(match_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  // We arranged the byte vector so that present bytes have the high bit set,
+  // which this instruction extracts.
+  result = MatchRange(_mm_movemask_epi8(metadata_vec));
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDMatchEmpty() const -> MatchIndex {
+  MatchIndex result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Compare all bytes with zero, as that is the empty byte value. Result will
+  // have all bits set for any input zero byte, so we zero all but the high bits
+  // below.
+  auto cmp_vec = vceqz_u8(metadata_vec);
+  uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchIndex(metadata_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  // Even though we only need the first match rather than all matches, we don't
+  // have a more efficient way to compute this on x86 and so we reuse the
+  // general match infrastructure that computes all matches in a bit-encoding.
+  // We then convert it into a `MatchIndex` that just finds the first one.
+  result = static_cast<MatchIndex>(X86SIMDMatch(Empty));
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDMatchDeleted() const -> MatchIndex {
+  MatchIndex result;
+#if CARBON_NEON_SIMD_SUPPORT
+  // Broadcast the `Deleted` byte across the vector and compare the bytes of
+  // that with the metadata vector. The result will have all bits set for any
+  // input zero byte, so we zero all but the high bits below.
+  auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted));
+  uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0];
+  // The matched range is likely to be tested for zero by the caller, and that
+  // test can often be folded into masking the bits with `MSBs` when we do that
+  // mask in the scalar domain rather than the SIMD domain. So we do the mask
+  // here rather than above prior to extracting the match bits.
+  result = MatchIndex(match_bits & MSBs);
+#elif CARBON_X86_SIMD_SUPPORT
+  // Even though we only need the first match rather than all matches, we don't
+  // have a more efficient way to compute this on x86 and so we reuse the
+  // general match infrastructure that computes all matches in a bit-encoding.
+  // We then convert it into a `MatchIndex` that just finds the first one.
+  result = static_cast<MatchIndex>(X86SIMDMatch(Deleted));
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+#endif
+  return result;
+}
+
+inline auto MetadataGroup::SIMDCompareEqual(MetadataGroup lhs,
+                                            MetadataGroup rhs) -> bool {
+#if CARBON_NEON_SIMD_SUPPORT
+  return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] ==
+         static_cast<uint64_t>(-1LL);
+#elif CARBON_X86_SIMD_SUPPORT
+  // Different x86 SIMD extensions provide different comparison functionality.
+#if __SSE4_2__
+  // With SSE 4.2, we can directly test and branch in the SIMD domain on whether
+  // the two metadata vectors are equal.
+  return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec),
+                         _mm_set1_epi8(0xff)) == 1;
+#else
+  // With older versions of SSE we have to extract the result of the comparison,
+  // much like we do when matching. That will have the usual bitmask
+  // representing equal bytes, and test for that exact bitmask in scalar code.
+  return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec,
+                                          rhs.metadata_vec)) == 0x0000'ffffU;
+#endif
+#else
+  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
+  static_cast<void>(lhs);
+  static_cast<void>(rhs);
+  return false;
+#endif
+}
+
+#if CARBON_X86_SIMD_SUPPORT
+inline auto MetadataGroup::X86SIMDMatch(uint8_t match_byte) const
+    -> MatchRange {
+  // Broadcast the byte we're matching against to all bytes in a vector, and
+  // compare those bytes with the metadata vector bytes.
+  auto match_byte_vec = _mm_set1_epi8(match_byte);
+  auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec);
+  // Extract the result of each byte-wise comparison into the low bits of an
+  // integer.
+  uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec);
+  return MatchRange(match_bits);
+}
+#endif
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_

+ 328 - 0
common/raw_hashtable_metadata_group_benchmark.cpp

@@ -0,0 +1,328 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <benchmark/benchmark.h>
+
+#include <algorithm>
+
+#include "absl/random/random.h"
+#include "common/raw_hashtable_metadata_group.h"
+
+namespace Carbon::RawHashtable {
+
+// If we have any SIMD support, create dedicated benchmark utilities for the
+// portable and SIMD implementation so we can directly benchmark both.
+#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
+// Override the core API with explicit use of the portable API.
+class BenchmarkPortableMetadataGroup : public MetadataGroup {
+ public:
+  explicit BenchmarkPortableMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}
+
+  static auto Load(uint8_t* metadata, ssize_t index)
+      -> BenchmarkPortableMetadataGroup {
+    return BenchmarkPortableMetadataGroup(PortableLoad(metadata, index));
+  }
+  auto Store(uint8_t* metadata, ssize_t index) const -> void {
+    PortableStore(metadata, index);
+  }
+
+  auto ClearDeleted() -> void { PortableClearDeleted(); }
+
+  auto Match(uint8_t present_byte) const -> MatchRange {
+    return PortableMatch(present_byte);
+  }
+  auto MatchPresent() const -> MatchRange { return PortableMatchPresent(); }
+
+  auto MatchEmpty() const -> MatchIndex { return PortableMatchEmpty(); }
+  auto MatchDeleted() const -> MatchIndex { return PortableMatchDeleted(); }
+};
+
+// Override the core API with explicit use of the SIMD API.
+class BenchmarkSIMDMetadataGroup : public MetadataGroup {
+ public:
+  explicit BenchmarkSIMDMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}
+
+  static auto Load(uint8_t* metadata, ssize_t index)
+      -> BenchmarkSIMDMetadataGroup {
+    return BenchmarkSIMDMetadataGroup(SIMDLoad(metadata, index));
+  }
+  auto Store(uint8_t* metadata, ssize_t index) const -> void {
+    SIMDStore(metadata, index);
+  }
+
+  auto ClearDeleted() -> void { SIMDClearDeleted(); }
+
+  auto Match(uint8_t present_byte) const -> MatchRange {
+    return SIMDMatch(present_byte);
+  }
+  auto MatchPresent() const -> MatchRange { return SIMDMatchPresent(); }
+
+  auto MatchEmpty() const -> MatchIndex { return SIMDMatchEmpty(); }
+  auto MatchDeleted() const -> MatchIndex { return SIMDMatchDeleted(); }
+};
+#endif
+
+namespace {
+
+// The number of metadata groups we use when benchmarking a particular scenario
+// of matching within a group.
+constexpr ssize_t BenchSize = 256;
+
+#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
+using PortableGroup = BenchmarkPortableMetadataGroup;
+using SIMDGroup = BenchmarkSIMDMetadataGroup;
+#endif
+
+struct BenchMetadata {
+  // The metadata for benchmarking, arranged in `BenchSize` groups, each one
+  // `GroupSize` in length. As a consequence, the size of this array will always
+  // be `BenchSize * GroupSize`.
+  llvm::MutableArrayRef<uint8_t> metadata;
+
+  // For benchmarking random matches in the metadata, each byte here is the tag
+  // that should be matched against the corresponding group of the metadata.
+  // Because this array parallels the *groups* of the metadata array, its size
+  // will be `BenchSize`. For other kinds, this is empty.
+  llvm::ArrayRef<uint8_t> bytes;
+};
+
+enum class BenchKind : uint8_t {
+  Random,
+  Empty,
+  Deleted,
+};
+
+// This routine should only be called once per `BenchKind` as the initializer of
+// a global variable below. It returns an `ArrayRef` pointing into
+// function-local static storage that provides our benchmark metadata.
+//
+// The returned array will have exactly `GroupSize` elements, each of type
+// `BenchMetadata`. For the `BenchMetadata` at index `i`, there will be `i+1`
+// matches of that kind within each group of the metadata. This lets us
+// benchmark each of the possible match-counts for a group.
+template <BenchKind Kind = BenchKind::Random>
+static auto BuildBenchMetadata() -> llvm::ArrayRef<BenchMetadata> {
+  // We build `GroupSize` elements of `BenchMetadata` below, and so we need
+  // `GroupSize` copies of each of these arrays to serve as inputs to it.
+  //
+  // The first storage is of `BenchSize` groups of metadata.
+  static uint8_t metadata_storage[GroupSize][BenchSize * GroupSize];
+  // When `Kind` is `Random`, each group above will have a *different* byte that
+  // matches in that group. This array stores those bytes for the benchmark to
+  // match against the group.
+  static uint8_t bytes_storage[GroupSize][BenchSize];
+
+  // The backing storage for the returned `ArrayRef`.
+  static BenchMetadata bm_storage[GroupSize];
+
+  absl::BitGen gen;
+  for (auto [bm_index, bm] : llvm::enumerate(bm_storage)) {
+    int match_count = bm_index + 1;
+
+    for (ssize_t g_index : llvm::seq<ssize_t>(0, BenchSize)) {
+      // Start by filling the group with random bytes.
+      auto group_bytes = llvm::MutableArrayRef(
+          &metadata_storage[bm_index][g_index * GroupSize], GroupSize);
+      for (uint8_t& b : group_bytes) {
+        b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
+      }
+
+      // Now we need up to `match_count` random indices into the group where
+      // we'll put a matching byte.
+      std::array<ssize_t, GroupSize> group_indices;
+      std::iota(group_indices.begin(), group_indices.end(), 0);
+      std::shuffle(group_indices.begin(), group_indices.end(), gen);
+
+      // Now set the byte at the first match index to the desired value.
+      ssize_t match_index = *group_indices.begin();
+      uint8_t& match_b = group_bytes[match_index];
+      switch (Kind) {
+        case BenchKind::Random: {
+          // Already a random value, but we need to ensure it isn't one that
+          // repeats elsewhere in the group.
+          while (llvm::count(group_bytes, match_b) > 1) {
+            match_b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
+          }
+          // Store this as the byte to search for in this group, but without the
+          // present bit to simulate where we start when using a 7-bit tag
+          // from a hash.
+          bytes_storage[bm_index][g_index] =
+              match_b & ~MetadataGroup::PresentMask;
+          break;
+        }
+        case BenchKind::Empty: {
+          match_b = MetadataGroup::Empty;
+          break;
+        }
+        case BenchKind::Deleted: {
+          match_b = MetadataGroup::Deleted;
+          break;
+        }
+      }
+
+      // Replicate the match byte in each of the other matching indices.
+      for (ssize_t m_index : llvm::ArrayRef(group_indices)
+                                 .drop_front()
+                                 .take_front(match_count - 1)) {
+        group_bytes[m_index] = match_b;
+      }
+    }
+
+    // Now that the storage is set up, record these in our struct.
+    bm.metadata = metadata_storage[bm_index];
+    if constexpr (Kind == BenchKind::Random) {
+      bm.bytes = bytes_storage[bm_index];
+    }
+  }
+  return bm_storage;
+}
+
+template <BenchKind Kind>
+// NOLINTNEXTLINE(google-readability-casting): False positive clang-tidy bug.
+const auto bench_metadata = BuildBenchMetadata<Kind>();
+
+// Benchmark that simulates the dynamic execution pattern when we match exactly
+// one entry in the group, typically then using the index of the matching byte
+// to index into an element of a group of entries. But notably, the *first*
+// match is sufficient, and we never have to find the *next* match within the
+// group.
+template <BenchKind Kind, typename GroupT = MetadataGroup>
+static void BM_LoadMatch(benchmark::State& s) {
+  BenchMetadata bm = bench_metadata<Kind>[0];
+
+  // We want to make the index used by the next iteration of the benchmark have
+  // a data dependency on the result of matching. A match produces an index into
+  // the group of metadata. To consume this match in a way that is
+  // representative of how it will be used in a hashtable (indexing into an
+  // array of entries), while establishing that dependence, we keep a
+  // group-sized array of the value `1` in memory that we can index into to
+  // increment to the next step of the loop. We do have to hide the contents of
+  // the loop from the optimizer by clobbering the memory.
+  ssize_t all_ones[GroupSize];
+  for (ssize_t& n : all_ones) {
+    n = 1;
+  }
+  benchmark::ClobberMemory();
+
+  // We don't want the optimizer to peel iterations off of this loop, so hide
+  // the starting index.
+  ssize_t i = 0;
+  benchmark::DoNotOptimize(i);
+
+  // This loop looks *really* attractive for the compiler to unroll. However,
+  // that can easily overlap some of the memory operations and generally makes
+  // it harder to analyze the exact operation sequence we care about.
+#pragma clang loop unroll(disable)
+  for (auto _ : s) {
+    auto g = GroupT::Load(bm.metadata.data(), i * GroupSize);
+    typename GroupT::MatchIndex matches;
+    if constexpr (Kind == BenchKind::Empty) {
+      matches = g.MatchEmpty();
+    } else if constexpr (Kind == BenchKind::Deleted) {
+      matches = g.MatchDeleted();
+    } else {
+      static_assert(Kind == BenchKind::Random);
+      matches = static_cast<MetadataGroup::MatchIndex>(g.Match(bm.bytes[i]));
+    }
+    // Despite not being a DCHECK, this is fine for benchmarking. In an actual
+    // hashtable, we expect a test of whether the match is empty prior to using
+    // it to index an array, and that test is expected to be strongly predicted.
+    // That exactly matches how the `CARBON_CHECK` macro works, and so this
+    // serves as both a good correctness test and replication of hashtable usage
+    // of a match.
+    CARBON_CHECK(matches);
+
+    // Now do the data-dependent increment by indexing our "all ones" array. The
+    // index into `all_ones` is analogous to the index into a group of hashtable
+    // entries.
+    i = (i + all_ones[matches.index()]) & (BenchSize - 1);
+  }
+}
+BENCHMARK(BM_LoadMatch<BenchKind::Random>);
+BENCHMARK(BM_LoadMatch<BenchKind::Empty>);
+BENCHMARK(BM_LoadMatch<BenchKind::Deleted>);
+#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
+BENCHMARK(BM_LoadMatch<BenchKind::Random, PortableGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Empty, PortableGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Deleted, PortableGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Random, SIMDGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Empty, SIMDGroup>);
+BENCHMARK(BM_LoadMatch<BenchKind::Deleted, SIMDGroup>);
+#endif
+
+// Benchmark that measures the speed of a match that is only found after at
+// least one miss. Because the first match doesn't work, this covers
+// incrementing to the next match, with a number of increments taken from the
+// `Steps` template parameter.
+template <BenchKind Kind, ssize_t Steps>
+static void BM_LoadMatchMissSteps(benchmark::State& s) {
+  static_assert(Steps > 0);
+  static_assert(Steps <= GroupSize);
+
+  // We pick the benchmark metadata at index `Steps - 1`, which will have
+  // `Steps` matches within each group.
+  BenchMetadata bm = bench_metadata<Kind>[Steps - 1];
+
+  // We want to make the index used by the next iteration of the benchmark have
+  // a data dependency on the result of matching. A match produces an index into
+  // the group of metadata. To consume this match in a way that is
+  // representative of how it will be used in a hashtable (indexing into an
+  // array of entries), while establishing that dependence, we keep a
+  // group-sized array of the value `1` in memory that we can index into to
+  // increment to the next step of the loop. We do have to hide the contents of
+  // the loop from the optimizer by clobbering the memory.
+  ssize_t all_ones[GroupSize];
+  for (ssize_t& n : all_ones) {
+    n = 1;
+  }
+  benchmark::ClobberMemory();
+
+  // We don't want the optimizer to peel iterations off of this loop, so hide
+  // the starting index.
+  ssize_t i = 0;
+  benchmark::DoNotOptimize(i);
+
+  // This loop looks *really* attractive for the compiler to unroll. However,
+  // that can easily overlap some of the memory operations and generally makes
+  // it harder to analyze the exact operation sequence we care about.
+#pragma clang loop unroll(disable)
+  for (auto _ : s) {
+    auto g = MetadataGroup::Load(bm.metadata.data(), i * GroupSize);
+    auto matched_range = g.Match(bm.bytes[i]);
+
+    // We don't use a `CARBON_CHECK` here as the loop below will test the range
+    // to see if the loop should be skipped, replicating the test that we also
+    // expect in hashtable usage.
+
+    // We want to simulate the code sequence a hashtable would produce when
+    // matching indices are "misses" in the hashtable, but only the aspects of
+    // those that reflect on the specific *match* implementation's generated
+    // code and performance. For each index in the match, we locate it in the
+    // `matched_range`, extract it as an index, and use that to index a
+    // group-sized array. We read memory from that array to increment `indices`,
+    // establishing data dependencies on each match index. This loop will run
+    // exactly `Steps` times.
+    ssize_t indices = 0;
+    for (ssize_t index : matched_range) {
+      indices += all_ones[index];
+    }
+
+    // We want to propagate the data dependencies accumulated into `indices`
+    // into the next value of `i`, and we know exactly how many increments were
+    // done in the loop, so subtract that constant and add one to arrive back at
+    // an increment of 1.
+    i = (i + (indices - Steps + 1)) & (BenchSize - 1);
+  }
+}
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 1>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 2>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 4>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 8>);
+#if CARBON_X86_SIMD_SUPPORT
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 12>);
+BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 16>);
+#endif
+
+}  // namespace
+}  // namespace Carbon::RawHashtable

+ 11 - 0
common/raw_hashtable_metadata_group_benchmark_test.sh

@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+#
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/raw_hashtable_metadata_group_benchmark"
+
+exec "$BENCHMARK" \
+  --benchmark_counters_tabular=true \
+  --benchmark_min_time=1x

+ 115 - 0
common/raw_hashtable_test_helpers.h

@@ -0,0 +1,115 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_
+#define CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_
+
+#include <compare>
+
+#include "common/check.h"
+#include "common/hashing.h"
+#include "common/hashtable_key_context.h"
+#include "common/ostream.h"
+
+namespace Carbon::RawHashtable {
+
+// Non-trivial type for testing.
+struct TestData : Printable<TestData> {
+  int value;
+
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  TestData(int v) : value(v) { CARBON_CHECK(value >= 0); }
+  ~TestData() {
+    CARBON_CHECK(value >= 0);
+    value = -1;
+  }
+  TestData(const TestData& other) : TestData(other.value) {}
+  TestData(TestData&& other) noexcept : TestData(other.value) {
+    other.value = 0;
+  }
+  auto Print(llvm::raw_ostream& out) const -> void { out << value; }
+
+  friend auto operator==(TestData lhs, TestData rhs) -> bool {
+    return lhs.value == rhs.value;
+  }
+
+  friend auto operator<=>(TestData lhs, TestData rhs) -> std::strong_ordering {
+    return lhs.value <=> rhs.value;
+  }
+
+  friend auto CarbonHashValue(TestData data, uint64_t seed) -> HashCode {
+    return Carbon::HashValue(data.value, seed);
+  }
+};
+
+// Test stateless key context that produces different hashes from normal.
+// Changing the hash values should result in test failures if the context ever
+// fails to be used.
+struct TestKeyContext : DefaultKeyContext {
+  template <typename KeyT>
+  auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode {
+    Hasher hash(seed);
+    // Inject some other data to the hash.
+    hash.Hash(42);
+    hash.Hash(HashValue(key));
+    return static_cast<HashCode>(hash);
+  }
+};
+
+// Hostile fixed hashing key context used for stress testing. Allows control
+// over which parts of the hash will be forced to collide, and the values they
+// are coerced to. Note that this relies on implementation details and internals
+// of `HashCode`.
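+// For example, a hypothetical instantiation that forces every key's index bits
+// to a single fixed value while leaving the 7-bit tag intact could be written
+// as `FixedHashKeyContext</*TagBits=*/7, /*FixIndexBits=*/true,
+// /*FixTagBits=*/false, /*FixedVal=*/0x1234>`.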
+template <int TagBits, bool FixIndexBits, bool FixTagBits, uint64_t FixedVal>
+struct FixedHashKeyContext : DefaultKeyContext {
+  template <typename KeyT>
+  auto HashKey(const KeyT& key, uint64_t seed) const -> HashCode {
+    HashCode original_hash = HashValue(key, seed);
+    auto raw_hash = static_cast<uint64_t>(original_hash);
+
+    constexpr uint64_t TagMask = (1U << TagBits) - 1;
+    if (FixIndexBits) {
+      raw_hash &= TagMask;
+      raw_hash |= FixedVal << TagBits;
+      CARBON_DCHECK(HashCode(raw_hash).ExtractIndexAndTag<TagBits>().first ==
+                    (FixedVal & (~static_cast<uint64_t>(0) >> TagBits)));
+    }
+    if (FixTagBits) {
+      raw_hash &= ~TagMask;
+      raw_hash |= FixedVal & TagMask;
+      CARBON_DCHECK(HashCode(raw_hash).ExtractIndexAndTag<TagBits>().second ==
+                    (FixedVal & TagMask));
+    }
+    return HashCode(raw_hash);
+  }
+};
+
+template <typename T>
+class IndexKeyContext {
+ public:
+  explicit IndexKeyContext(llvm::ArrayRef<T> array) : array_(array) {}
+
+  auto HashKey(const T& value, uint64_t seed) const -> HashCode {
+    return HashValue(value, seed);
+  }
+  auto HashKey(ssize_t index, uint64_t seed) const -> HashCode {
+    return HashKey(array_[index], seed);
+  }
+
+  auto KeyEq(const T& lhs, ssize_t rhs_index) const -> bool {
+    return lhs == array_[rhs_index];
+  }
+  auto KeyEq(ssize_t lhs_index, ssize_t rhs_index) const -> bool {
+    // No need to compare the elements; if the indices are equal, the values
+    // must be.
+    return lhs_index == rhs_index;
+  }
+
+ private:
+  llvm::ArrayRef<T> array_;
+};
+
+}  // namespace Carbon::RawHashtable
+
+#endif  // CARBON_COMMON_RAW_HASHTABLE_TEST_HELPERS_H_

+ 358 - 0
common/set.h

@@ -0,0 +1,358 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_COMMON_SET_H_
+#define CARBON_COMMON_SET_H_
+
+#include <concepts>
+
+#include "common/check.h"
+#include "common/hashtable_key_context.h"
+#include "common/raw_hashtable.h"
+#include "llvm/Support/Compiler.h"
+
+namespace Carbon {
+
+// Forward declarations to resolve cyclic references.
+template <typename KeyT, typename KeyContextT>
+class SetView;
+template <typename KeyT, typename KeyContextT>
+class SetBase;
+template <typename KeyT, ssize_t SmallSize, typename KeyContextT>
+class Set;
+
+// A read-only view type for a set of keys.
+//
+// This view is a cheap-to-copy type that should be passed by value, but
+// provides view or read-only reference semantics to the underlying set data
+// structure.
+//
+// This should always be preferred to a `const`-ref parameter for the `SetBase`
+// or `Set` type as it provides more flexibility and a cleaner API.
+//
+// Note that while this type is a read-only view, that applies to the underlying
+// *set* data structure, not the individual entries stored within it. Those can
+// be mutated freely as long as both the hashes and equality of the keys are
+// preserved. If we applied a deep-`const` design here, it would prevent using
+// this type in situations where the keys carry state (unhashed and not part of
+// equality) that is mutated while the associative container is not. A view of
+// immutable data can always be obtained by using `SetView<const T>`, and we
+// enable conversions to more-const views. This mirrors the semantics of views
+// like `std::span`.
+//
+// A specific `KeyContextT` type can optionally be provided to configure how
+// keys will be hashed and compared. The default is `DefaultKeyContext` which is
+// stateless and will hash using `Carbon::HashValue` and compare using
+// `operator==`. Every method accepting a lookup key or operating on the keys in
+// the table will also accept an instance of this type. For stateless context
+// types, including the default, an instance will be default constructed if not
+// provided to these methods. However, stateful contexts should be constructed
+// and passed in explicitly. The context type should be small and reasonable to
+// pass by value, often a wrapper or pointer to the relevant context needed for
+// hashing and comparing keys. For more details about the key context, see
+// `hashtable_key_context.h`.
+template <typename InputKeyT, typename InputKeyContextT = DefaultKeyContext>
+class SetView : RawHashtable::ViewImpl<InputKeyT, void, InputKeyContextT> {
+  using ImplT = RawHashtable::ViewImpl<InputKeyT, void, InputKeyContextT>;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using KeyContextT = typename ImplT::KeyContextT;
+
+  // This type represents the result of lookup operations. It indicates
+  // whether the lookup succeeded and, if so, provides access to the key.
+  class LookupResult {
+   public:
+    LookupResult() = default;
+    explicit LookupResult(KeyT& key) : key_(&key) {}
+
+    explicit operator bool() const { return key_ != nullptr; }
+
+    auto key() const -> KeyT& { return *key_; }
+
+   private:
+    KeyT* key_ = nullptr;
+  };
+
+  // Enable implicit conversions that add `const`-ness to the key type.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  SetView(SetView<std::remove_const_t<KeyT>, KeyContextT> other_view)
+    requires(!std::same_as<KeyT, std::remove_const_t<KeyT>>)
+      : ImplT(other_view) {}
+
+  // Tests whether a key is present in the set.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool;
+
+  // Look up a key in the set.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupResult;
+
+  // Run the provided callback for every key in the set.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&>);
+
+  // This routine is relatively inefficient and only intended for use in
+  // benchmarking or logging of performance anomalies. The returned count has
+  // no specific guarantees beyond being informative in benchmarks. It counts
+  // how many of the keys in the hashtable required probing beyond their
+  // initial group of slots.
+  //
+  // TODO: Replace with a more general metrics routine that covers other
+  // important aspects such as load factor, and average probe *distance*.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t {
+    return ImplT::CountProbedKeys(key_context);
+  }
+
+ private:
+  template <typename SetKeyT, ssize_t SmallSize, typename KeyContextT>
+  friend class Set;
+  friend class SetBase<KeyT, KeyContextT>;
+  friend class SetView<const KeyT, KeyContextT>;
+
+  using EntryT = typename ImplT::EntryT;
+
+  SetView() = default;
+  // NOLINTNEXTLINE(google-explicit-constructor): Implicit by design.
+  SetView(ImplT base) : ImplT(base) {}
+  SetView(ssize_t size, RawHashtable::Storage* storage)
+      : ImplT(size, storage) {}
+};
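+
+// A minimal usage sketch for `SetView` (assuming code within the `Carbon`
+// namespace and the `Set` container defined later in this file; `ProcessKey`
+// stands in for arbitrary user code):
+// ```cpp
+//   Set<int> ints;
+//   ints.Insert(1);
+//   ints.Insert(2);
+//
+//   SetView<int> view = ints;
+//   CARBON_CHECK(view.Contains(1));
+//   if (auto result = view.Lookup(2)) {
+//     ProcessKey(result.key());
+//   }
+//   view.ForEach([](int& key) { ProcessKey(key); });
+// ```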
+
+// A base class for a `Set` type that remains mutable while type-erasing the
+// `SmallSize` (SSO) template parameter.
+//
+// A pointer or reference to this type is the preferred way to pass a mutable
+// handle to a `Set` type across API boundaries as it avoids encoding specific
+// SSO sizing information while providing a near-complete mutable API.
+template <typename InputKeyT, typename InputKeyContextT>
+class SetBase
+    : protected RawHashtable::BaseImpl<InputKeyT, void, InputKeyContextT> {
+ protected:
+  using ImplT = RawHashtable::BaseImpl<InputKeyT, void, InputKeyContextT>;
+
+ public:
+  using KeyT = typename ImplT::KeyT;
+  using KeyContextT = typename ImplT::KeyContextT;
+  using ViewT = SetView<KeyT, KeyContextT>;
+  using LookupResult = typename ViewT::LookupResult;
+
+  // The result type for insertion operations both indicates whether an insert
+  // was needed (as opposed to the key already being in the set) and provides
+  // access to the key.
+  class InsertResult {
+   public:
+    InsertResult() = default;
+    explicit InsertResult(bool inserted, KeyT& key)
+        : key_(&key), inserted_(inserted) {}
+
+    auto is_inserted() const -> bool { return inserted_; }
+
+    auto key() const -> KeyT& { return *key_; }
+
+   private:
+    KeyT* key_;
+    bool inserted_;
+  };
+
+  // Implicitly convertible to the relevant view type.
+  //
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator ViewT() const { return this->view_impl(); }
+
+  // We can't chain the above conversion with the conversions on `ViewT` to add
+  // const, so explicitly support adding const to produce a view here.
+  //
+  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
+  operator SetView<const KeyT, KeyContextT>() const { return ViewT(*this); }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Contains(LookupKeyT lookup_key,
+                KeyContextT key_context = KeyContextT()) const -> bool {
+    return ViewT(*this).Contains(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename LookupKeyT>
+  auto Lookup(LookupKeyT lookup_key,
+              KeyContextT key_context = KeyContextT()) const -> LookupResult {
+    return ViewT(*this).Lookup(lookup_key, key_context);
+  }
+
+  // Convenience forwarder to the view type.
+  template <typename CallbackT>
+  void ForEach(CallbackT callback)
+    requires(std::invocable<CallbackT, KeyT&>)
+  {
+    return ViewT(*this).ForEach(callback);
+  }
+
+  // Convenience forwarder to the view type.
+  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const
+      -> ssize_t {
+    return ViewT(*this).CountProbedKeys(key_context);
+  }
+
+  // Insert a key into the set. If the key is already present, no insertion is
+  // performed and the existing key is available in the result. Otherwise, a new
+  // key is constructed from the argument, inserted, and made available in the
+  // result.
+  template <typename LookupKeyT>
+  auto Insert(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT())
+      -> InsertResult;
+
+  // Insert a key into the set, calling the provided callback to construct the
+  // key in place if it is not already present. The lookup key is passed
+  // through to the callback so it needn't be captured and can be kept in a
+  // register throughout.
+  //
+  // Example:
+  // ```cpp
+  //   m.Insert("widget", [](MyStringViewType lookup_key, void* key_storage) {
+  //     new (key_storage) MyStringType(lookup_key);
+  //   });
+  // ```
+  template <typename LookupKeyT, typename InsertCallbackT>
+  auto Insert(LookupKeyT lookup_key, InsertCallbackT insert_cb,
+              KeyContextT key_context = KeyContextT()) -> InsertResult
+    requires std::invocable<InsertCallbackT, LookupKeyT, void*>;
+
+  // Erase a key from the set.
+  template <typename LookupKeyT>
+  auto Erase(LookupKeyT lookup_key, KeyContextT key_context = KeyContextT())
+      -> bool;
+
+  // Clear all keys from the set but leave the underlying hashtable allocated
+  // and in place.
+  void Clear();
+
+ protected:
+  using ImplT::ImplT;
+};
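+
+// A minimal sketch of passing a set across an API boundary via `SetBase`
+// (assuming code within the `Carbon` namespace; the key context argument is
+// spelled out explicitly, and `RecordId` is just an illustrative name):
+// ```cpp
+//   auto RecordId(SetBase<int, DefaultKeyContext>& ids, int id) -> bool {
+//     return ids.Insert(id).is_inserted();
+//   }
+//
+//   Set<int, 16> ids;
+//   bool newly_seen = RecordId(ids, 42);
+// ```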
+
+// A data structure for a set of keys.
+//
+// This set supports small size optimization (or "SSO"). The provided
+// `SmallSize` template parameter indicates the size of an embedded buffer for
+// storing sets small enough to fit. The default is zero, which always allocates
+// a heap buffer on construction. When non-zero, it must be a multiple of
+// `MaxGroupSize`, which is currently 16. The library checks that the size is
+// valid and produces a compile-time error if not. We don't automatically
+// select the next multiple or otherwise fit the size to the constraints, so
+// that the code makes clear how much memory is used by the SSO buffer.
+//
+// This data structure optimizes heavily for small key types that are cheap to
+// move and even copy. Using large keys or keys that are expensive to copy may
+// create surprising performance bottlenecks. A `std::string` key should be fine
+// with generally small strings, but if some or many strings are large heap
+// allocations, the performance of hashtable routines may be unacceptably bad,
+// and another data structure or key design is likely preferable.
+//
+// Note that this type should typically not appear on API boundaries; either
+// `SetBase` or `SetView` should be used instead.
+template <typename InputKeyT, ssize_t SmallSize = 0,
+          typename InputKeyContextT = DefaultKeyContext>
+class Set : public RawHashtable::TableImpl<SetBase<InputKeyT, InputKeyContextT>,
+                                           SmallSize> {
+  using BaseT = SetBase<InputKeyT, InputKeyContextT>;
+  using ImplT = RawHashtable::TableImpl<BaseT, SmallSize>;
+
+ public:
+  using KeyT = typename BaseT::KeyT;
+
+  Set() = default;
+  Set(const Set& arg) = default;
+  Set(Set&& arg) noexcept = default;
+
+  // Reset the entire state of the hashtable to what it was when constructed,
+  // throwing away any intervening allocations.
+  void Reset();
+};
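+
+// A minimal sketch of the SSO parameter (assuming code within the `Carbon`
+// namespace):
+// ```cpp
+//   // Uses a 16-entry inline buffer for small sets; `SmallSize` must be a
+//   // multiple of `MaxGroupSize` (currently 16).
+//   Set<int, 16> small_ids;
+//   small_ids.Insert(7);
+//
+//   // The default `SmallSize` of zero always allocates a heap buffer.
+//   Set<int> heap_ids;
+// ```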
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetView<InputKeyT, InputKeyContextT>::Contains(
+    LookupKeyT lookup_key, KeyContextT key_context) const -> bool {
+  return this->LookupEntry(lookup_key, key_context) != nullptr;
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetView<InputKeyT, InputKeyContextT>::Lookup(LookupKeyT lookup_key,
+                                                  KeyContextT key_context) const
+    -> LookupResult {
+  EntryT* entry = this->LookupEntry(lookup_key, key_context);
+  if (!entry) {
+    return LookupResult();
+  }
+
+  return LookupResult(entry->key());
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename CallbackT>
+void SetView<InputKeyT, InputKeyContextT>::ForEach(CallbackT callback)
+  requires(std::invocable<CallbackT, KeyT&>)
+{
+  this->ForEachEntry([callback](EntryT& entry) { callback(entry.key()); },
+                     [](auto...) {});
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetBase<InputKeyT, InputKeyContextT>::Insert(LookupKeyT lookup_key,
+                                                  KeyContextT key_context)
+    -> InsertResult {
+  return Insert(
+      lookup_key,
+      [](LookupKeyT lookup_key, void* key_storage) {
+        new (key_storage) KeyT(std::move(lookup_key));
+      },
+      key_context);
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT, typename InsertCallbackT>
+auto SetBase<InputKeyT, InputKeyContextT>::Insert(LookupKeyT lookup_key,
+                                                  InsertCallbackT insert_cb,
+                                                  KeyContextT key_context)
+    -> InsertResult
+  requires std::invocable<InsertCallbackT, LookupKeyT, void*>
+{
+  auto [entry, inserted] = this->InsertImpl(lookup_key, key_context);
+  CARBON_DCHECK(entry) << "Should always result in a valid entry.";
+
+  if (LLVM_LIKELY(!inserted)) {
+    return InsertResult(false, entry->key());
+  }
+
+  insert_cb(lookup_key, static_cast<void*>(&entry->key_storage));
+  return InsertResult(true, entry->key());
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+template <typename LookupKeyT>
+auto SetBase<InputKeyT, InputKeyContextT>::Erase(LookupKeyT lookup_key,
+                                                 KeyContextT key_context)
+    -> bool {
+  return this->EraseImpl(lookup_key, key_context);
+}
+
+template <typename InputKeyT, typename InputKeyContextT>
+void SetBase<InputKeyT, InputKeyContextT>::Clear() {
+  this->ClearImpl();
+}
+
+template <typename InputKeyT, ssize_t SmallSize, typename InputKeyContextT>
+void Set<InputKeyT, SmallSize, InputKeyContextT>::Reset() {
+  this->ResetImpl();
+}
+
+}  // namespace Carbon
+
+#endif  // CARBON_COMMON_SET_H_

+ 382 - 0
common/set_benchmark.cpp

@@ -0,0 +1,382 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <benchmark/benchmark.h>
+
+#include "absl/container/flat_hash_set.h"
+#include "common/raw_hashtable_benchmark_helpers.h"
+#include "common/set.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace Carbon {
+namespace {
+
+using RawHashtable::CarbonHashDI;
+using RawHashtable::GetKeysAndHitKeys;
+using RawHashtable::GetKeysAndMissKeys;
+using RawHashtable::HitArgs;
+using RawHashtable::SizeArgs;
+using RawHashtable::ValueToBool;
+
+template <typename SetT>
+struct IsCarbonSetImpl : std::false_type {};
+template <typename KT, int MinSmallSize>
+struct IsCarbonSetImpl<Set<KT, MinSmallSize>> : std::true_type {};
+
+template <typename SetT>
+static constexpr bool IsCarbonSet = IsCarbonSetImpl<SetT>::value;
+
+// A wrapper around various set types that we specialize to implement a common
+// API used in the benchmarks across set data structures that support different
+// APIs. The primary template assumes a roughly `std::unordered_set` API
+// design, and types with a different API design are supported through
+// specializations.
+template <typename SetT>
+struct SetWrapperImpl {
+  using KeyT = typename SetT::key_type;
+
+  SetT s;
+
+  auto BenchContains(KeyT k) -> bool { return s.find(k) != s.end(); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto it = s.find(k);
+    if (it == s.end()) {
+      return false;
+    }
+    // We expect keys to always convert to `true` so directly return that here.
+    return ValueToBool(*it);
+  }
+
+  auto BenchInsert(KeyT k) -> bool {
+    auto result = s.insert(k);
+    return result.second;
+  }
+
+  auto BenchErase(KeyT k) -> bool { return s.erase(k) != 0; }
+};
+
+// Explicit (partial) specialization for the Carbon set type that uses its
+// different API design.
+template <typename KT, int MinSmallSize>
+struct SetWrapperImpl<Set<KT, MinSmallSize>> {
+  using SetT = Set<KT, MinSmallSize>;
+  using KeyT = KT;
+
+  SetT s;
+
+  auto BenchContains(KeyT k) -> bool { return s.Contains(k); }
+
+  auto BenchLookup(KeyT k) -> bool {
+    auto result = s.Lookup(k);
+    if (!result) {
+      return false;
+    }
+    return ValueToBool(result.key());
+  }
+
+  auto BenchInsert(KeyT k) -> bool {
+    auto result = s.Insert(k);
+    return result.is_inserted();
+  }
+
+  auto BenchErase(KeyT k) -> bool { return s.Erase(k); }
+};
+
+// Provide a way to override the Carbon Set specific benchmark runs with another
+// hashtable implementation. When building, you can use one of these enum names
+// in a macro define such as `-DCARBON_SET_BENCH_OVERRIDE=Name` in order to
+// trigger a specific override for the `Set` type benchmarks. This is used to
+// get before/after runs that compare the performance of Carbon's Set versus
+// other implementations.
+enum class SetOverride : uint8_t {
+  Abseil,
+  LLVM,
+  LLVMAndCarbonHash,
+};
+template <typename SetT, SetOverride Override>
+struct SetWrapperOverride : SetWrapperImpl<SetT> {};
+
+template <typename KeyT, int MinSmallSize>
+struct SetWrapperOverride<Set<KeyT, MinSmallSize>, SetOverride::Abseil>
+    : SetWrapperImpl<absl::flat_hash_set<KeyT>> {};
+
+template <typename KeyT, int MinSmallSize>
+struct SetWrapperOverride<Set<KeyT, MinSmallSize>, SetOverride::LLVM>
+    : SetWrapperImpl<llvm::DenseSet<KeyT>> {};
+
+template <typename KeyT, int MinSmallSize>
+struct SetWrapperOverride<Set<KeyT, MinSmallSize>,
+                          SetOverride::LLVMAndCarbonHash>
+    : SetWrapperImpl<llvm::DenseSet<KeyT, CarbonHashDI<KeyT>>> {};
+
+#ifndef CARBON_SET_BENCH_OVERRIDE
+template <typename SetT>
+using SetWrapper = SetWrapperImpl<SetT>;
+#else
+template <typename SetT>
+using SetWrapper =
+    SetWrapperOverride<SetT, SetOverride::CARBON_SET_BENCH_OVERRIDE>;
+#endif
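+
+// For example, one plausible way to produce such a comparison run against the
+// Abseil implementation (exact Bazel flags depend on the local setup):
+//
+//   bazel run -c opt //common:set_benchmark
+//   bazel run -c opt --copt=-DCARBON_SET_BENCH_OVERRIDE=Abseil \
+//     //common:set_benchmark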
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT)        \
+  BENCHMARK(NAME<Set<KT>>)->Apply(APPLY);                 \
+  BENCHMARK(NAME<absl::flat_hash_set<KT>>)->Apply(APPLY); \
+  BENCHMARK(NAME<llvm::DenseSet<KT>>)->Apply(APPLY);      \
+  BENCHMARK(NAME<llvm::DenseSet<KT, CarbonHashDI<KT>>>)->Apply(APPLY)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_ONE_OP(NAME, APPLY)       \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int);  \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int*); \
+  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, llvm::StringRef)
+
+// Benchmark the "latency" of testing for a key in a set. This always tests with
+// a key that is found.
+//
+// However, because the key is always found and because the test ultimately
+// involves conditional control flow that can be predicted, we expect modern
+// CPUs to perfectly predict the control flow here and turn the measurement from
+// one iteration to the next into a throughput measurement rather than a real
+// latency measurement.
+//
+// However, this does represent a particularly common way in which a set data
+// structure is accessed. The numbers should just be carefully interpreted in
+// the context of being more a reflection of reciprocal throughput than actual
+// latency. See the `Lookup` benchmarks for a genuine latency measure with its
+// own caveats.
+//
+// However, this does still show some interesting caching effects when querying
+// large fractions of large tables, and can give a sense of the inescapable
+// magnitude of these effects even when there is a great deal of prediction and
+// speculative execution to hide memory access latency.
+template <typename SetT>
+static void BM_SetContainsHitPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      // We block optimizing `i` as that has proven both more effective at
+      // blocking the loop from being optimized away and avoiding disruption of
+      // the generated code that we're benchmarking.
+      benchmark::DoNotOptimize(i);
+
+      bool result = s.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      // We use the lookup success to step through keys, establishing a
+      // dependency between each lookup. This doesn't fully allow us to measure
+      // latency rather than throughput, as noted above.
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetContainsHitPtr, HitArgs);
+
+// Benchmark the "latency" (but more likely the reciprocal throughput, see
+// comment above) of testing for a key in the set that is *not* present.
+template <typename SetT>
+static void BM_SetContainsMissPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] = GetKeysAndMissKeys<KT>(state.range(0));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = s.BenchContains(lookup_keys[i]);
+      CARBON_DCHECK(!result);
+      i += static_cast<ssize_t>(!result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetContainsMissPtr, SizeArgs);
+
+// A somewhat contrived latency test for the lookup code path.
+//
+// While lookups into a set are often (but not always) simply used to influence
+// control flow, that style of access produces difficult-to-evaluate benchmark
+// results (see the comments on the `Contains` benchmarks above).
+//
+// So here we actually access the key in the set and convert that key's value to
+// a boolean on the critical path of each iteration. This lets us have a genuine
+// latency benchmark of looking up a key in the set, at the expense of being
+// somewhat contrived. That said, for usage where the key object is queried or
+// operated on in some way once looked up in the set, this will be fairly
+// representative of the latency cost from the data structure.
+template <typename SetT>
+static void BM_SetLookupHitPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      bool result = s.BenchLookup(lookup_keys[i]);
+      CARBON_DCHECK(result);
+      i += static_cast<ssize_t>(result);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetLookupHitPtr, HitArgs);
+
+// First erase and then insert the key. The code path will always be the same
+// here and so we expect this to largely be a throughput benchmark because of
+// branch prediction and speculative execution.
+//
+// We don't expect erase followed by insertion to be a common user code
+// sequence, but we don't have a good way of benchmarking either erase or insert
+// in isolation -- each would change the size of the table and thus the next
+// iteration's benchmark. And if we try to correct the table size outside of the
+// timed region, we end up trying to exclude too fine-grained a region from the
+// timers to get good measurement data.
+//
+// Our solution is to benchmark both erase and insertion back to back. We can
+// then get a good profile of the code sequence of each, and at least measure
+// the sum cost of these reliably. Careful profiling can help attribute that
+// cost between erase and insert in order to understand which of the two
+// operations is contributing most to any performance artifacts observed.
+template <typename SetT>
+static void BM_SetEraseInsertHitPtr(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  SetWrapperT s;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
+  for (auto k : keys) {
+    s.BenchInsert(k);
+  }
+  ssize_t lookup_keys_size = lookup_keys.size();
+
+  while (state.KeepRunningBatch(lookup_keys_size)) {
+    for (ssize_t i = 0; i < lookup_keys_size;) {
+      benchmark::DoNotOptimize(i);
+
+      s.BenchErase(lookup_keys[i]);
+      benchmark::ClobberMemory();
+
+      bool inserted = s.BenchInsert(lookup_keys[i]);
+      CARBON_DCHECK(inserted);
+      i += static_cast<ssize_t>(inserted);
+    }
+  }
+}
+MAP_BENCHMARK_ONE_OP(BM_SetEraseInsertHitPtr, HitArgs);
+
+// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
+#define MAP_BENCHMARK_OP_SEQ_SIZE(NAME, KT)                  \
+  BENCHMARK(NAME<Set<KT>>)->Apply(SizeArgs);                 \
+  BENCHMARK(NAME<absl::flat_hash_set<KT>>)->Apply(SizeArgs); \
+  BENCHMARK(NAME<llvm::DenseSet<KT>>)->Apply(SizeArgs);      \
+  BENCHMARK(NAME<llvm::DenseSet<KT, CarbonHashDI<KT>>>)->Apply(SizeArgs)
+// NOLINTEND(bugprone-macro-parentheses)
+
+#define MAP_BENCHMARK_OP_SEQ(NAME)       \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int);  \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int*); \
+  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, llvm::StringRef)
+
+// This is an interesting, somewhat specialized benchmark that measures the cost
+// of inserting a sequence of keys into a set up to some size, then inserting a
+// colliding key, and finally throwing away the set.
+//
+// This is an especially important usage pattern for sets as a large number of
+// algorithms essentially look like this, such as collision detection, cycle
+// detection, de-duplication, etc.
+//
+// It also covers both the insert-into-an-empty-slot code path that isn't
+// covered elsewhere, and the code path for growing a table to a larger size.
+//
+// This is the second most important aspect of expected set usage after testing
+// for presence. It also nicely lends itself to a single benchmark that covers
+// the total cost of this usage pattern.
+//
+// Because this benchmark operates on whole sets, we also compute the number of
+// probed keys for Carbon's set as that is both a general reflection of the
+// efficacy of the underlying hash function, and a direct factor that drives the
+// cost of these operations.
+template <typename SetT>
+static void BM_SetInsertSeq(benchmark::State& state) {
+  using SetWrapperT = SetWrapper<SetT>;
+  using KT = typename SetWrapperT::KeyT;
+  constexpr ssize_t LookupKeysSize = 1 << 8;
+  auto [keys, lookup_keys] =
+      GetKeysAndHitKeys<KT>(state.range(0), LookupKeysSize);
+
+  // The shuffled `lookup_keys` (which contain duplicates) are rotated through
+  // below for the final colliding insert at the end of each iteration.
+  ssize_t i = 0;
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(i);
+
+    SetWrapperT s;
+    for (auto k : keys) {
+      bool inserted = s.BenchInsert(k);
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // Now insert a final key that is already present, drawn from the shuffled
+    // lookup keys.
+    bool inserted = s.BenchInsert(lookup_keys[i]);
+    CARBON_DCHECK(!inserted) << "Must already be in the set!";
+
+    // Rotate through the shuffled keys.
+    i = (i + static_cast<ssize_t>(!inserted)) & (LookupKeysSize - 1);
+  }
+
+  // It can be easier in some cases to think of this as a key-throughput rate of
+  // insertion rather than the latency of inserting N keys, so construct the
+  // rate counter as well.
+  state.counters["KeyRate"] = benchmark::Counter(
+      keys.size(), benchmark::Counter::kIsIterationInvariantRate);
+
+  // Report some extra statistics about the Carbon type.
+  if constexpr (IsCarbonSet<SetT>) {
+    // Re-build a set outside of the timing loop to look at the statistics
+    // rather than the timing.
+    SetT s;
+    for (auto k : keys) {
+      bool inserted = s.Insert(k).is_inserted();
+      CARBON_DCHECK(inserted) << "Must be a successful insert!";
+    }
+
+    // While this count is "iteration invariant" (it should be exactly the same
+    // for every iteration as the set of keys is the same), we don't use that
+    // because it will scale this by the number of iterations. We want to
+    // display the probe count of this benchmark *parameter*, not the probe
+    // count that resulted from the number of iterations. That means we use the
+    // normal counter API without flags.
+    state.counters["Probed"] = s.CountProbedKeys();
+
+    // Uncomment this call to print out statistics about the index-collisions
+    // among these keys for debugging:
+    //
+    // RawHashtable::DumpHashStatistics(keys);
+  }
+}
+MAP_BENCHMARK_OP_SEQ(BM_SetInsertSeq);
+
+}  // namespace
+}  // namespace Carbon

+ 12 - 0
common/set_benchmark_test.sh

@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+#
+# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+# Exceptions. See /LICENSE for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+BENCHMARK="$TEST_SRCDIR/$TEST_WORKSPACE/common/set_benchmark"
+
+exec "$BENCHMARK" \
+  --benchmark_counters_tabular=true \
+  --benchmark_min_time=1x \
+  --benchmark_filter='^[^/]*/[1-9][0-9]{0,3}(/[0-9]+)?$'

+ 218 - 0
common/set_test.cpp

@@ -0,0 +1,218 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "common/set.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <initializer_list>
+#include <type_traits>
+#include <vector>
+
+#include "common/raw_hashtable_test_helpers.h"
+
+namespace Carbon {
+namespace {
+
+using RawHashtable::IndexKeyContext;
+using RawHashtable::TestData;
+using ::testing::UnorderedElementsAreArray;
+
+template <typename SetT, typename MatcherRangeT>
+void ExpectSetElementsAre(SetT&& s, MatcherRangeT element_matchers) {
+  // Collect the elements into a container.
+  using KeyT = typename std::remove_reference<SetT>::type::KeyT;
+  std::vector<KeyT> entries;
+  s.ForEach([&entries](KeyT& k) { entries.push_back(k); });
+
+  // Use the GoogleMock unordered container matcher to validate and show errors
+  // on wrong elements.
+  EXPECT_THAT(entries, UnorderedElementsAreArray(element_matchers));
+}
+
+// Allow directly using an initializer list.
+template <typename SetT, typename MatcherT>
+void ExpectSetElementsAre(SetT&& s,
+                          std::initializer_list<MatcherT> element_matchers) {
+  std::vector<MatcherT> element_matchers_storage = element_matchers;
+  ExpectSetElementsAre(s, element_matchers_storage);
+}
+
+template <typename RangeT, typename... RangeTs>
+auto MakeElements(RangeT&& range, RangeTs&&... ranges) {
+  std::vector<typename RangeT::value_type> elements;
+  auto add_range = [&elements](auto&& r) {
+    for (const auto& e : r) {
+      elements.push_back(e);
+    }
+  };
+  add_range(std::forward<RangeT>(range));
+  (add_range(std::forward<RangeTs>(ranges)), ...);
+
+  return elements;
+}
+
+template <typename SetT>
+class SetTest : public ::testing::Test {};
+
+using Types = ::testing::Types<Set<int>, Set<int, 16>, Set<int, 128>,
+                               Set<TestData>, Set<TestData, 16>>;
+TYPED_TEST_SUITE(SetTest, Types);
+
+TYPED_TEST(SetTest, Basic) {
+  using SetT = TypeParam;
+  SetT s;
+
+  EXPECT_FALSE(s.Contains(42));
+  EXPECT_TRUE(s.Insert(1).is_inserted());
+  EXPECT_TRUE(s.Contains(1));
+  auto result = s.Lookup(1);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  auto i_result = s.Insert(1);
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_TRUE(s.Contains(1));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, {1});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Insert(i).is_inserted());
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Contains(i));
+    EXPECT_FALSE(s.Insert(i).is_inserted());
+  }
+  EXPECT_FALSE(s.Contains(513));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, MakeElements(llvm::seq(1, 512)));
+}
+
+TYPED_TEST(SetTest, FactoryAPI) {
+  using SetT = TypeParam;
+  SetT s;
+  EXPECT_TRUE(s.Insert(1, [](int k, void* key_storage) {
+                 return new (key_storage) int(k);
+               }).is_inserted());
+  ASSERT_TRUE(s.Contains(1));
+  // Reinsertion doesn't invoke the callback.
+  EXPECT_FALSE(s.Insert(1, [](int, void*) -> int* {
+                  llvm_unreachable("Should never be called!");
+                }).is_inserted());
+}
+
+TYPED_TEST(SetTest, Copy) {
+  using SetT = TypeParam;
+
+  SetT s;
+  // Make sure we exceed the small size for some of the set types, but not all
+  // of them, so we cover all the combinations of copying between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(s.Insert(i).is_inserted());
+  }
+
+  SetT other_s1 = s;
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24)));
+
+  // Add some more elements to the original.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(s.Insert(i).is_inserted());
+  }
+
+  // The first copy doesn't change.
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24)));
+
+  // A new copy does.
+  SetT other_s2 = s;
+  ExpectSetElementsAre(other_s2, MakeElements(llvm::seq(1, 32)));
+}
+
+TYPED_TEST(SetTest, Move) {
+  using SetT = TypeParam;
+
+  SetT s;
+  // Make sure we exceed the small size for some of the set types, but not all
+  // of them, so we cover all the combinations of copying between small and
+  // large.
+  for (int i : llvm::seq(1, 24)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(s.Insert(i).is_inserted());
+  }
+
+  SetT other_s1 = std::move(s);
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 24)));
+
+  // Add some more elements.
+  for (int i : llvm::seq(24, 32)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    ASSERT_TRUE(other_s1.Insert(i).is_inserted());
+  }
+  ExpectSetElementsAre(other_s1, MakeElements(llvm::seq(1, 32)));
+}
+
+TYPED_TEST(SetTest, Conversions) {
+  using SetT = TypeParam;
+  using KeyT = SetT::KeyT;
+  SetT s;
+  ASSERT_TRUE(s.Insert(1).is_inserted());
+  ASSERT_TRUE(s.Insert(2).is_inserted());
+  ASSERT_TRUE(s.Insert(3).is_inserted());
+  ASSERT_TRUE(s.Insert(4).is_inserted());
+
+  SetView<KeyT> sv = s;
+  SetView<const KeyT> csv = sv;
+  SetView<const KeyT> csv2 = s;
+  EXPECT_TRUE(sv.Contains(1));
+  EXPECT_TRUE(csv.Contains(2));
+  EXPECT_TRUE(csv2.Contains(3));
+}
+
+TEST(SetContextTest, Basic) {
+  llvm::SmallVector<TestData> keys;
+  for (int i : llvm::seq(0, 513)) {
+    keys.push_back(i * 100);
+  }
+  IndexKeyContext<TestData> key_context(keys);
+  Set<ssize_t, 0, IndexKeyContext<TestData>> s;
+
+  EXPECT_FALSE(s.Contains(42, key_context));
+  EXPECT_TRUE(s.Insert(1, key_context).is_inserted());
+  EXPECT_TRUE(s.Contains(1, key_context));
+  auto result = s.Lookup(TestData(100), key_context);
+  EXPECT_TRUE(result);
+  EXPECT_EQ(1, result.key());
+  auto i_result = s.Insert(1, IndexKeyContext<TestData>(keys));
+  EXPECT_FALSE(i_result.is_inserted());
+  EXPECT_TRUE(s.Contains(1, key_context));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, {1});
+
+  // Fill up a bunch to ensure we trigger growth a few times.
+  for (int i : llvm::seq(2, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Insert(i, key_context).is_inserted());
+  }
+  for (int i : llvm::seq(1, 512)) {
+    SCOPED_TRACE(llvm::formatv("Key: {0}", i).str());
+    EXPECT_TRUE(s.Contains(i, key_context));
+    EXPECT_FALSE(s.Insert(i, key_context).is_inserted());
+  }
+  EXPECT_FALSE(s.Contains(0, key_context));
+  EXPECT_FALSE(s.Contains(512, key_context));
+
+  // Verify all the elements.
+  ExpectSetElementsAre(s, MakeElements(llvm::seq(1, 512)));
+}
+
+}  // namespace
+}  // namespace Carbon