// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_COMMON_RAW_HASHTABLE_H_
#define CARBON_COMMON_RAW_HASHTABLE_H_

#include <algorithm>
#include <concepts>
#include <cstddef>
#include <cstring>
#include <new>
#include <type_traits>
#include <utility>

#include "common/check.h"
#include "common/hashing.h"
#include "common/hashtable_key_context.h"
#include "common/raw_hashtable_metadata_group.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MathExtras.h"

// A namespace collecting a set of low-level utilities for building hashtable
// data structures. These should only be used as implementation details of
// higher-level data-structure APIs.
//
// The utilities here use the `KeyContext` provided by
// `hashtable_key_context.h` to support the necessary hashtable operations on
// keys: hashing and comparison. This also serves as the customization point
// for hashtables built on this infrastructure for those operations. See that
// header file for details.
//
// These utilities support hashtables following a *specific* API design
// pattern, and using Small-Size Optimization, or "SSO", when desired. We
// expect there to be three layers to any hashtable design:
//
// - A *view* type: a read-only view of the hashtable contents. This type
//   should be a value type and is expected to be passed by-value in APIs.
//   However, it will have `const`-reference semantics, much like a
//   `std::string_view`. Note that the *entries* will continue to be mutable;
//   it is only the *table* that is read-only.
//
// - A *base* type: a base class type of the actual hashtable, which allows
//   almost all mutable operations but erases any specific SSO buffer size.
//   Because this is a base of the actual hashtable, it is designed to be
//   passed as a non-`const` reference or pointer.
//
// - A *table* type: the actual hashtable, which derives from the base type
//   and adds any desired SSO storage buffer. Beyond the physical storage, it
//   also allows resetting the table to its initial state & allocated size, as
//   well as copying and moving the table.
//
// For complete examples of the API design, see `set.h` for a hashtable-based
// set data structure, and `map.h` for a hashtable-based map data structure. A
// short sketch of the pattern also follows this comment block.
//
// The hashtable design implemented here has several key invariants and design
// elements that are essential to all three of the types above and the
// functionality they provide.
//
// - The underlying hashtable uses [open addressing], a power-of-two table
//   size, and quadratic probing rather than closed addressing and chaining.
//
//   [open addressing]: https://en.wikipedia.org/wiki/Open_addressing
//
// - Each _slot_ in the table corresponds to a key, a value, and one byte of
//   metadata. Each _entry_ is a key and value. The key and value for an entry
//   are stored together.
//
// - The allocated storage is organized into an array of metadata bytes
//   followed by an array of entry storage.
//
// - The metadata byte corresponding to each slot marks that slot as either
//   empty, deleted, or present. When present, the byte also stores a 7-bit
//   tag taken from the hash of the entry's key.
//
// - The storage for an entry is an internal type that should not be exposed
//   to users; only the underlying keys and values should be.
//
// - The hash addressing and probing occurs over *groups* of slots rather than
//   individual entries. When inserting a new entry, it can be added to the
//   group it hashes to as long as that group is not full, and can even
//   replace a slot with a tombstone indicating a previously deleted entry.
//   Only when the group is full will it look at the next group in the probe
//   sequence. As a result, there may be entries in a group where a different
//   group is the start of that entry's probe sequence. Also, when performing
//   a lookup, every group in the probe sequence must be inspected for the
//   lookup key until it is found or a group has an empty slot.
//
// - Groups are scanned rapidly using the one-byte metadata for each entry in
//   the group and CPU instructions that allow comparing all of the metadata
//   for a group in parallel. For more details on the metadata group encoding
//   and scanning, see `raw_hashtable_metadata_group.h`.
//
// - `GroupSize` is a platform-specific, relatively small power of two that
//   fits in some hardware register. However, `MaxGroupSize` is provided as a
//   portable max that is also a power of two. The table storage, whether
//   provided by an SSO buffer or allocated, is required to be a multiple of
//   `MaxGroupSize` to keep the requirement portable but sufficient for all
//   platforms.
//
// - There is *always* an allocated table of some multiple of `MaxGroupSize`.
//   This allows accesses to be branchless. When heap allocated, we
//   pro-actively allocate at least a minimum heap size table. When there is a
//   small-size optimization (SSO) buffer, that provides the initial
//   allocation.
//
// - The table performs a minimal amount of bookkeeping that limits the APIs
//   it can support:
//   - `alloc_size` is the size of the table *allocated* (not *used*), and is
//     always a power of 2 at least as big as `MinAllocatedSize`.
//   - `storage` is a pointer to the storage for the `alloc_size` slots of the
//     table, and never null.
//   - `small_alloc_size` is the maximum `alloc_size` where the table is
//     stored in the object itself instead of separately on the heap. In this
//     case, `storage` points to `small_storage_`.
//   - `growth_budget` is the number of entries that may be added before the
//     table allocation is doubled. It is always
//     `GrowthThresholdForAllocSize(alloc_size)` minus the number of non-empty
//     (filled or deleted) slots. If it ever falls to 0, the table is grown to
//     keep it greater than 0.
//
//   There is also a "moved-from" state, in which `alloc_size` is 0 and
//   `storage` is null, where the table may only be reinitialized or
//   destroyed. Since the table doesn't track the exact number of filled
//   entries, it doesn't support a container-style `size` API.
//
// - There is no direct iterator support because of the complexity of
//   embedding the group-based metadata scanning into an iterator model.
//   Instead, there is just a for-each method that is passed a lambda to
//   observe all entries. The order of this observation is also not
//   guaranteed.
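//
// As a purely illustrative sketch of the three-layer pattern (hypothetical
// names; key-context plumbing and most APIs elided), a set of `int`s could be
// assembled roughly as:
//
//   class MySetBase : protected RawHashtable::BaseImpl<int> {
//    public:
//     auto Insert(int key) -> bool {
//       // `InsertImpl` returns the entry and whether an insert happened.
//       return InsertImpl(key, KeyContextT()).second;
//     }
//
//    protected:
//     using BaseImpl::BaseImpl;
//   };
//
//   template <ssize_t SmallSize>
//   class MySet : public RawHashtable::TableImpl<MySetBase, SmallSize> {};
//
// A corresponding view type would privately derive from `ViewImpl<int>` in
// the same way. The real containers in `set.h` and `map.h` flesh this
// skeleton out.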
namespace Carbon::RawHashtable {

// If allocating storage, allocate a minimum of one cacheline of group metadata
// or a minimum of one group, whichever is larger.
constexpr ssize_t MinAllocatedSize = std::max<ssize_t>(64, MaxGroupSize);

// An entry in the hashtable storage of a `KeyT` and `ValueT` object.
//
// Allows manual construction, destruction, and access to these values so we
// can create arrays of the entries prior to populating them with actual keys
// and values.
template <typename KeyT, typename ValueT>
struct StorageEntry {
  static constexpr bool IsTriviallyDestructible =
      std::is_trivially_destructible_v<KeyT> &&
      std::is_trivially_destructible_v<ValueT>;

  static constexpr bool IsTriviallyRelocatable =
      IsTriviallyDestructible && std::is_trivially_move_constructible_v<KeyT> &&
      std::is_trivially_move_constructible_v<ValueT>;

  auto key() const -> const KeyT& {
    // Ensure we don't need more alignment than available. Inside a method
    // body to apply to the complete type.
    static_assert(
        alignof(StorageEntry) <= MinAllocatedSize,
        "The minimum allocated size turns into the alignment of our array of "
        "storage entries as they follow the metadata byte array.");
    return *std::launder(reinterpret_cast<const KeyT*>(&key_storage));
  }
  auto key() -> KeyT& {
    return const_cast<KeyT&>(const_cast<const StorageEntry*>(this)->key());
  }

  auto value() const -> const ValueT& {
    return *std::launder(reinterpret_cast<const ValueT*>(&value_storage));
  }
  auto value() -> ValueT& {
    return const_cast<ValueT&>(const_cast<const StorageEntry*>(this)->value());
  }

  // We handle destruction and move manually as we only want to expose
  // distinct `KeyT` and `ValueT` subobjects to user code that may need to do
  // in-place construction. As a consequence, this struct only provides the
  // storage and we have to manually manage the construction, move, and
  // destruction of the objects.
  auto Destroy() -> void {
    static_assert(!IsTriviallyDestructible,
                  "Should never instantiate when trivial!");
    key().~KeyT();
    value().~ValueT();
  }

  auto CopyFrom(const StorageEntry& entry) -> void {
    if constexpr (IsTriviallyRelocatable) {
      memcpy(this, &entry, sizeof(StorageEntry));
    } else {
      new (&key_storage) KeyT(entry.key());
      new (&value_storage) ValueT(entry.value());
    }
  }

  // Move from an expiring entry and destroy that entry's key and value.
  // Optimizes to directly use `memcpy` when correct.
  auto MoveFrom(StorageEntry&& entry) -> void {
    if constexpr (IsTriviallyRelocatable) {
      memcpy(this, &entry, sizeof(StorageEntry));
    } else {
      new (&key_storage) KeyT(std::move(entry.key()));
      entry.key().~KeyT();
      new (&value_storage) ValueT(std::move(entry.value()));
      entry.value().~ValueT();
    }
  }

  alignas(KeyT) std::byte key_storage[sizeof(KeyT)];
  alignas(ValueT) std::byte value_storage[sizeof(ValueT)];
};
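
// Illustrative sketch of the manual lifecycle this type requires (the local
// `entry` here is hypothetical; real entries live inside table storage):
//
//   StorageEntry<std::string, int> entry;
//   new (&entry.key_storage) std::string("k");  // placement-construct the key
//   new (&entry.value_storage) int(42);         // ...and the value
//   CARBON_CHECK(entry.key() == "k" && entry.value() == 42);
//   entry.Destroy();  // No destructor runs unless called manually.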

// A specialization of the storage entry for sets without a distinct value
// type. Somewhat duplicative with the key-value version, but C++
// specialization makes doing better difficult.
template <typename KeyT>
struct StorageEntry<KeyT, void> {
  static constexpr bool IsTriviallyDestructible =
      std::is_trivially_destructible_v<KeyT>;

  static constexpr bool IsTriviallyRelocatable =
      IsTriviallyDestructible && std::is_trivially_move_constructible_v<KeyT>;

  auto key() const -> const KeyT& {
    // Ensure we don't need more alignment than available.
    static_assert(
        alignof(StorageEntry) <= MinAllocatedSize,
        "The minimum allocated size turns into the alignment of our array of "
        "storage entries as they follow the metadata byte array.");
    return *std::launder(reinterpret_cast<const KeyT*>(&key_storage));
  }
  auto key() -> KeyT& {
    return const_cast<KeyT&>(const_cast<const StorageEntry*>(this)->key());
  }

  auto Destroy() -> void {
    static_assert(!IsTriviallyDestructible,
                  "Should never instantiate when trivial!");
    key().~KeyT();
  }

  auto CopyFrom(const StorageEntry& entry) -> void {
    if constexpr (IsTriviallyRelocatable) {
      memcpy(this, &entry, sizeof(StorageEntry));
    } else {
      new (&key_storage) KeyT(entry.key());
    }
  }

  auto MoveFrom(StorageEntry&& entry) -> void {
    if constexpr (IsTriviallyRelocatable) {
      memcpy(this, &entry, sizeof(StorageEntry));
    } else {
      new (&key_storage) KeyT(std::move(entry.key()));
      entry.key().~KeyT();
    }
  }

  alignas(KeyT) std::byte key_storage[sizeof(KeyT)];
};

// A placeholder empty type used to model pointers to the allocated buffer of
// storage.
//
// The allocated storage doesn't have a meaningful static layout -- it consists
// of an array of metadata groups followed by an array of storage entries.
// However, we want to be able to mark pointers to this and so use pointers to
// this placeholder type as that signifier.
//
// This is a complete, empty type so that it can be used as a base class of a
// specific concrete storage type for compile-time sized storage.
struct Storage {};
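
// For orientation, an illustrative layout example (the sizes are assumed, not
// normative): with `alloc_size == 64` and `sizeof(EntryT) == 16`, the
// allocation holds the 64 metadata bytes at byte offsets [0, 64), followed by
// the entry array starting at byte offset 64 (see `EntriesOffset` below,
// which evaluates to exactly `alloc_size`), for a total of 64 + 64 * 16
// bytes.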

// Forward declaration to support friending, see the definition below.
template <typename KeyT, typename ValueT = void,
          typename InputKeyContextT = DefaultKeyContext>
class BaseImpl;

// Implementation helper for defining a read-only view type for a hashtable.
//
// A specific user-facing hashtable view type should derive privately from
// this type, and forward the implementation of its interface to functions in
// this type.
//
// The methods available to user-facing hashtable types are `protected`, and
// where they are expected to directly map to a public API, they are named
// with an `Impl` suffix. The suffix naming ensures types don't `using` in
// these low-level APIs but declare their own and implement them by forwarding
// to these APIs. We don't want users to have to read these implementation
// details to understand their container's API, so none of these methods
// should be `using`-ed into the user-facing types.
//
// Some of the types are just convenience aliases and aren't important to
// surface as part of the user-facing type API for readers, and so those are
// reasonable to add via a `using`.
//
// Some methods are used by other parts of the raw hashtable implementation.
// Those are kept `private` and where necessary the other components of the
// raw hashtable implementation are friended to give access to them.
template <typename InputKeyT, typename InputValueT = void,
          typename InputKeyContextT = DefaultKeyContext>
class ViewImpl {
 protected:
  using KeyT = InputKeyT;
  using ValueT = InputValueT;
  using KeyContextT = InputKeyContextT;
  using EntryT = StorageEntry<KeyT, ValueT>;

  friend class BaseImpl<KeyT, ValueT, KeyContextT>;

  // Make more-`const` types friends to enable conversions that add `const`.
  friend class ViewImpl<const KeyT, ValueT, KeyContextT>;
  friend class ViewImpl<KeyT, const ValueT, KeyContextT>;
  friend class ViewImpl<const KeyT, const ValueT, KeyContextT>;

  ViewImpl() = default;

  // Support adding `const` to either key or value type of some other view.
  template <typename OtherKeyT, typename OtherValueT>
  // NOLINTNEXTLINE(google-explicit-constructor)
  ViewImpl(ViewImpl<OtherKeyT, OtherValueT, KeyContextT> other_view)
    requires(std::same_as<KeyT, OtherKeyT> ||
             std::same_as<KeyT, const OtherKeyT>) &&
            (std::same_as<ValueT, OtherValueT> ||
             std::same_as<ValueT, const OtherValueT>)
      : alloc_size_(other_view.alloc_size_), storage_(other_view.storage_) {}

  // Looks up an entry in the hashtable and returns its address or null if not
  // present.
  template <typename LookupKeyT>
  auto LookupEntry(LookupKeyT lookup_key, KeyContextT key_context) const
      -> EntryT*;

  // Calls `entry_callback` for each entry in the hashtable. All the entries
  // within a specific group are visited first, and then `group_callback` is
  // called on the group itself. The `group_callback` is typically only used
  // by the internals of the hashtable.
  template <typename EntryCallbackT, typename GroupCallbackT>
  auto ForEachEntry(EntryCallbackT entry_callback,
                    GroupCallbackT group_callback) const -> void;

  // Counts the number of keys in the hashtable that required probing beyond
  // the initial group.
  auto CountProbedKeys(KeyContextT key_context) const -> ssize_t;

 private:
  ViewImpl(ssize_t alloc_size, Storage* storage)
      : alloc_size_(alloc_size), storage_(storage) {}

  // Computes the offset from the metadata array to the entries array for a
  // given size. This is trivial, but we use this routine to enforce
  // invariants on the sizes.
  static constexpr auto EntriesOffset(ssize_t alloc_size) -> ssize_t {
    CARBON_DCHECK(llvm::isPowerOf2_64(alloc_size))
        << "Size must be a power of two for a hashed buffer!";
    // The size is always a power of two. We prevent any too-small sizes so it
    // being a power of two provides the needed alignment. As a result, the
    // offset is exactly the size. We validate this here to catch alignment
    // bugs early.
    CARBON_DCHECK(static_cast<uint64_t>(alloc_size) ==
                  llvm::alignTo<alignof(EntryT)>(alloc_size));
    return alloc_size;
  }

  auto metadata() const -> uint8_t* {
    return reinterpret_cast<uint8_t*>(storage_);
  }
  auto entries() const -> EntryT* {
    return reinterpret_cast<EntryT*>(reinterpret_cast<std::byte*>(storage_) +
                                     EntriesOffset(alloc_size_));
  }

  ssize_t alloc_size_;
  Storage* storage_;
};

// Implementation helper for defining a read-write base type for a hashtable
// that type-erases any SSO buffer.
//
// A specific user-facing hashtable base type should derive using
// *`protected`* inheritance from this type, and forward the implementation of
// its interface to functions in this type.
//
// Other than the use of `protected` inheritance, the patterns for this type,
// and how to build user-facing hashtable base types from it, mirror those of
// `ViewImpl`. See its documentation for more details.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
class BaseImpl {
 protected:
  using KeyT = InputKeyT;
  using ValueT = InputValueT;
  using KeyContextT = InputKeyContextT;
  using ViewImplT = ViewImpl<KeyT, ValueT, KeyContextT>;
  using EntryT = typename ViewImplT::EntryT;

  BaseImpl(int small_alloc_size, Storage* small_storage)
      : small_alloc_size_(small_alloc_size) {
    CARBON_CHECK(small_alloc_size >= 0);
    Construct(small_storage);
  }
  // Only used for copying and moving, and leaves storage uninitialized.
  BaseImpl(ssize_t alloc_size, int growth_budget, int small_alloc_size)
      : view_impl_(alloc_size, nullptr),
        growth_budget_(growth_budget),
        small_alloc_size_(small_alloc_size) {}

  ~BaseImpl();

  // NOLINTNEXTLINE(google-explicit-constructor): Designed to implicitly decay.
  operator ViewImplT() const { return view_impl(); }

  auto view_impl() const -> ViewImplT { return view_impl_; }

  // Looks up the provided key in the hashtable. If found, returns a pointer
  // to that entry and `false`.
  //
  // If not found, will locate an empty entry for inserting into, set the
  // metadata for that entry, and return a pointer to the entry and `true`.
  // When necessary, this will grow the hashtable to cause there to be
  // sufficient empty entries.
  template <typename LookupKeyT>
  auto InsertImpl(LookupKeyT lookup_key, KeyContextT key_context)
      -> std::pair<EntryT*, bool>;

  // Looks up the entry in the hashtable, and if found destroys the entry and
  // returns `true`. If not found, returns `false`.
  //
  // Does not release any memory, just leaves a tombstone behind so this entry
  // cannot be found and the slot can in theory be re-used.
  template <typename LookupKeyT>
  auto EraseImpl(LookupKeyT lookup_key, KeyContextT key_context) -> bool;

  // Erases all entries in the hashtable but leaves the allocated storage.
  auto ClearImpl() -> void;

 private:
  template <typename InputBaseT, ssize_t SmallSize>
  friend class TableImpl;

  static constexpr ssize_t Alignment = std::max<ssize_t>(
      {alignof(MetadataGroup), alignof(StorageEntry<KeyT, ValueT>)});

  // Implementation of inline small storage for the provided key type, value
  // type, and small size. Specialized for a zero small size to be an empty
  // struct.
  template <ssize_t SmallSize>
  struct SmallStorage : Storage {
    alignas(Alignment) uint8_t metadata[SmallSize];
    mutable StorageEntry<KeyT, ValueT> entries[SmallSize];
  };
  // Specialized storage with no inline buffer to avoid any extra alignment.
  template <>
  struct SmallStorage<0> {};

  static constexpr auto AllocByteSize(ssize_t alloc_size) -> ssize_t {
    return ViewImplT::EntriesOffset(alloc_size) + sizeof(EntryT) * alloc_size;
  }

  static auto Allocate(ssize_t alloc_size) -> Storage*;
  static auto Deallocate(Storage* storage, ssize_t alloc_size) -> void;

  auto growth_budget() const -> ssize_t { return growth_budget_; }
  auto alloc_size() const -> ssize_t { return view_impl_.alloc_size_; }
  auto alloc_size() -> ssize_t& { return view_impl_.alloc_size_; }
  auto storage() const -> Storage* { return view_impl_.storage_; }
  auto storage() -> Storage*& { return view_impl_.storage_; }
  auto metadata() const -> uint8_t* { return view_impl_.metadata(); }
  auto entries() const -> EntryT* { return view_impl_.entries(); }
  auto small_alloc_size() const -> ssize_t {
    return static_cast<unsigned>(small_alloc_size_);
  }
  auto is_small() const -> bool { return alloc_size() <= small_alloc_size(); }

  auto Construct(Storage* small_storage) -> void;
  auto Destroy() -> void;

  template <typename LookupKeyT>
  auto InsertIntoEmpty(LookupKeyT lookup_key, KeyContextT key_context)
      -> EntryT*;

  static auto ComputeNextAllocSize(ssize_t old_alloc_size) -> ssize_t;
  static auto GrowthThresholdForAllocSize(ssize_t alloc_size) -> ssize_t;

  template <typename LookupKeyT>
  auto GrowAndInsert(LookupKeyT lookup_key, KeyContextT key_context)
      -> EntryT*;

  ViewImplT view_impl_;
  int growth_budget_;
  int small_alloc_size_;
};

// Implementation helper for defining a hashtable type with an SSO buffer.
//
// A specific user-facing hashtable should derive privately from this type,
// and forward the implementation of its interface to functions in this type.
// It should provide the corresponding user-facing hashtable base type as the
// `InputBaseT` type parameter (rather than a key/value pair), and this type
// will in turn derive from that provided base type. This allows
// derived-to-base conversion from the user-facing hashtable type to the
// user-facing hashtable base type. And it does so keeping the inheritance
// linear. The resulting linear inheritance hierarchy for a `Map<K, T>` type
// will look like:
//
//   Map<K, T>
//    ↓
//   TableImpl<MapBase<K, T>>
//    ↓
//   MapBase<K, T>
//    ↓
//   BaseImpl<K, T>
//
// Other than this inheritance technique, the patterns for this type, and how
// to build user-facing hashtable types from it, mirror those of `ViewImpl`.
// See its documentation for more details.
template <typename InputBaseT, ssize_t SmallSize>
class TableImpl : public InputBaseT {
 protected:
  using BaseT = InputBaseT;

  TableImpl() : BaseT(SmallSize, small_storage()) {}
  TableImpl(const TableImpl& arg);
  TableImpl(TableImpl&& arg) noexcept;

  // Resets the hashtable to its initial state, clearing all entries and
  // releasing all memory. If the hashtable had an SSO buffer, that is
  // restored as the storage. Otherwise, a minimum sized table storage is
  // allocated.
  auto ResetImpl() -> void;

 private:
  using KeyT = BaseT::KeyT;
  using ValueT = BaseT::ValueT;
  using EntryT = BaseT::EntryT;
  using SmallStorage = BaseT::template SmallStorage<SmallSize>;

  auto small_storage() const -> Storage*;

  [[no_unique_address]] mutable SmallStorage small_storage_;
};

////////////////////////////////////////////////////////////////////////////////
//
// Only implementation details below this point.
//
////////////////////////////////////////////////////////////////////////////////

// Computes a seed that provides a small amount of entropy from ASLR where
// available with minimal cost. The priority is speed, and this computes the
// entropy in a way that doesn't require loading from memory, merely accessing
// entropy already available without accessing memory.
inline auto ComputeSeed() -> uint64_t {
  // A global variable whose address is used as a seed. This allows ASLR to
  // introduce some variation in hashtable ordering when enabled via the code
  // model for globals.
  extern volatile std::byte global_addr_seed;

  return reinterpret_cast<uint64_t>(&global_addr_seed);
}

inline auto ComputeProbeMaskFromSize(ssize_t size) -> size_t {
  CARBON_DCHECK(llvm::isPowerOf2_64(size))
      << "Size must be a power of two for a hashed buffer!";
  // Since `size` is a power of two, we can make sure the probes are less
  // than `size` by making the mask `size - 1`. We also mask off the low
  // bits so the probes are a multiple of the size of the groups of entries.
  return (size - 1) & ~GroupMask;
}
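
// For example (illustrative only, and assuming `GroupSize` is 16 so that
// `GroupMask` is 15): with `size == 64` the mask is
// `(64 - 1) & ~15 == 0b110000`, so masked probe indices can only be 0, 16,
// 32, or 48 -- the group-aligned starting offsets within the table.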

// This class handles building a sequence of probe indices from a given
// starting point, including both the quadratic growth and masking the index
// to stay within the bucket array size. The starting point doesn't need to be
// clamped to the size ahead of time (or even be positive), we will do it
// internally.
//
// For reference on quadratic probing:
// https://en.wikipedia.org/wiki/Quadratic_probing
//
// We compute the quadratic probe index incrementally, but we can also compute
// it mathematically and will check that the incremental result matches our
// mathematical expectation. We use the quadratic probing formula of:
//
//   p(start, step) = (start + (step + step^2) / 2) (mod size / GroupSize)
//
// However, we compute it incrementally and scale all the variables by the
// group size so it can be used as an index without an additional
// multiplication.
class ProbeSequence {
 public:
  ProbeSequence(ssize_t start, ssize_t size) {
    mask_ = ComputeProbeMaskFromSize(size);
    p_ = start & mask_;
#ifndef NDEBUG
    start_ = start & mask_;
    size_ = size;
#endif
  }

  void Next() {
    step_ += GroupSize;
    p_ = (p_ + step_) & mask_;
#ifndef NDEBUG
    // Verify against the quadratic formula we expect to be following by
    // scaling everything down by `GroupSize`.
    CARBON_DCHECK(
        (p_ / GroupSize) ==
        ((start_ / GroupSize +
          (step_ / GroupSize + (step_ / GroupSize) * (step_ / GroupSize)) /
              2) %
         (size_ / GroupSize)))
        << "Index in probe sequence does not match the expected formula.";
    CARBON_DCHECK(step_ < size_) << "We necessarily visit all groups, so we "
                                    "can't have more probe steps than groups.";
#endif
  }

  auto index() const -> ssize_t { return p_; }

 private:
  ssize_t step_ = 0;
  size_t mask_;
  ssize_t p_;
#ifndef NDEBUG
  ssize_t start_;
  ssize_t size_;
#endif
};
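
// As a worked example (illustrative only, again assuming `GroupSize` is 16):
// for `size == 64` and `start == 5`, the initial index is
// `5 & 0b110000 == 0`. Each `Next()` grows the step by one group, so the
// sequence visits group 16 (step 16), group 48 (step 32), and then
// `(48 + 48) & 0b110000 == 32` (step 48), touching each of the four groups
// exactly once before the debug check on `step_ < size_` would fire.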

// TODO: Evaluate keeping this outlined to see if macro benchmarks observe the
// same perf hit as micro benchmarks.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
template <typename LookupKeyT>
auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::LookupEntry(
    LookupKeyT lookup_key, KeyContextT key_context) const -> EntryT* {
  // Prefetch with a "low" temporal locality as we're primarily expecting a
  // brief use of the storage and then to return to application code.
  __builtin_prefetch(storage_, /*read*/ 0, /*low-locality*/ 1);

  ssize_t local_size = alloc_size_;
  CARBON_DCHECK(local_size > 0);

  uint8_t* local_metadata = metadata();
  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();

  EntryT* local_entries = entries();

  // Walk through groups of entries using a quadratic probe starting from
  // `hash_index`.
  ProbeSequence s(hash_index, local_size);
  do {
    ssize_t group_index = s.index();

    // For each group, match the tag against the metadata to extract the
    // potentially matching entries within the group.
    MetadataGroup g = MetadataGroup::Load(local_metadata, group_index);
    auto metadata_matched_range = g.Match(tag);
    if (LLVM_LIKELY(metadata_matched_range)) {
      // If any entries in this group potentially match based on their
      // metadata, walk each candidate and compare its key to see if we have
      // definitively found a match.
      EntryT* group_entries = &local_entries[group_index];
      auto byte_it = metadata_matched_range.begin();
      auto byte_end = metadata_matched_range.end();
      do {
        EntryT* entry = byte_it.index_ptr(group_entries);
        if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) {
          __builtin_assume(entry != nullptr);
          return entry;
        }
        ++byte_it;
      } while (LLVM_UNLIKELY(byte_it != byte_end));
    }

    // We failed to find a matching entry in this bucket, so check if there
    // are empty slots as that indicates we're done probing -- no later probed
    // index could have a match.
    auto empty_byte_matched_range = g.MatchEmpty();
    if (LLVM_LIKELY(empty_byte_matched_range)) {
      return nullptr;
    }

    s.Next();

    // We use a weird construct of an "unlikely" condition of `true`. The goal
    // is to get the compiler to not prioritize the back edge of the loop for
    // code layout, and in at least some tests this seems to be an effective
    // construct for achieving this.
  } while (LLVM_UNLIKELY(true));
}

// Note that we force inlining here because we expect to be called with
// lambdas that will in turn be inlined to form the loop body. We don't want
// function boundaries within the loop for performance, and the degree of
// simplification from inlining these callbacks may be difficult for the
// compiler to recognize automatically.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
template <typename EntryCallbackT, typename GroupCallbackT>
[[clang::always_inline]] auto
ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::ForEachEntry(
    EntryCallbackT entry_callback, GroupCallbackT group_callback) const
    -> void {
  uint8_t* local_metadata = metadata();
  EntryT* local_entries = entries();

  ssize_t local_size = alloc_size_;
  for (ssize_t group_index = 0; group_index < local_size;
       group_index += GroupSize) {
    auto g = MetadataGroup::Load(local_metadata, group_index);
    auto present_matched_range = g.MatchPresent();
    if (!present_matched_range) {
      continue;
    }
    for (ssize_t byte_index : present_matched_range) {
      entry_callback(local_entries[group_index + byte_index]);
    }

    group_callback(&local_metadata[group_index]);
  }
}

template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::CountProbedKeys(
    KeyContextT key_context) const -> ssize_t {
  uint8_t* local_metadata = metadata();
  EntryT* local_entries = entries();
  ssize_t local_size = alloc_size_;
  ssize_t count = 0;
  for (ssize_t group_index = 0; group_index < local_size;
       group_index += GroupSize) {
    auto g = MetadataGroup::Load(local_metadata, group_index);
    auto present_matched_range = g.MatchPresent();
    for (ssize_t byte_index : present_matched_range) {
      ssize_t index = group_index + byte_index;
      HashCode hash =
          key_context.HashKey(local_entries[index].key(), ComputeSeed());
      ssize_t hash_index = hash.ExtractIndexAndTag<7>().first &
                           ComputeProbeMaskFromSize(local_size);
      count += static_cast<ssize_t>(hash_index != group_index);
    }
  }
  return count;
}

template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::~BaseImpl() {
  Destroy();
}

// TODO: Evaluate whether it is worth forcing this out-of-line given the
// reasonable ABI boundary it forms and large volume of code necessary to
// implement it.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
template <typename LookupKeyT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::InsertImpl(
    LookupKeyT lookup_key, KeyContextT key_context)
    -> std::pair<EntryT*, bool> {
  CARBON_DCHECK(alloc_size() > 0);

  uint8_t* local_metadata = metadata();

  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();

  // The index of the first group with a deleted slot along the probe
  // sequence, if any, along with its match. Tracked so we can insert over a
  // tombstone rather than extending the probe chain.
  ssize_t group_with_deleted_index;
  MetadataGroup::MatchIndex deleted_match = {};

  EntryT* local_entries = entries();

  auto return_insert_at_index =
      [&](ssize_t index) -> std::pair<EntryT*, bool> {
    // We'll need to insert at this index so set the control group byte to the
    // proper value.
    local_metadata[index] = tag | MetadataGroup::PresentMask;
    return {&local_entries[index], true};
  };

  for (ProbeSequence s(hash_index, alloc_size());; s.Next()) {
    ssize_t group_index = s.index();
    auto g = MetadataGroup::Load(local_metadata, group_index);

    auto control_byte_matched_range = g.Match(tag);
    if (control_byte_matched_range) {
      EntryT* group_entries = &local_entries[group_index];
      auto byte_it = control_byte_matched_range.begin();
      auto byte_end = control_byte_matched_range.end();
      do {
        EntryT* entry = byte_it.index_ptr(group_entries);
        if (LLVM_LIKELY(key_context.KeyEq(lookup_key, entry->key()))) {
          return {entry, false};
        }
        ++byte_it;
      } while (LLVM_UNLIKELY(byte_it != byte_end));
    }

    // Track the first group with a deleted entry that we could insert over.
    if (!deleted_match) {
      deleted_match = g.MatchDeleted();
      group_with_deleted_index = group_index;
    }

    // We failed to find a matching entry in this bucket, so check if there
    // are no empty slots. In that case, we'll continue probing.
    auto empty_match = g.MatchEmpty();
    if (!empty_match) {
      continue;
    }

    // Ok, we've finished probing without finding anything and need to insert
    // instead.

    // If we found a deleted slot, we don't need the probe sequence to insert
    // so just bail. We want to ensure building up a table is fast so we
    // de-prioritize this a bit. In practice this doesn't have too much of an
    // effect.
    if (LLVM_UNLIKELY(deleted_match)) {
      return return_insert_at_index(group_with_deleted_index +
                                    deleted_match.index());
    }

    // We're going to need to grow by inserting into an empty slot. Check that
    // we have the budget for that before we compute the exact index of the
    // empty slot. Without the growth budget we'll have to completely rehash
    // and so we can just bail here.
    if (LLVM_UNLIKELY(growth_budget_ == 0)) {
      return {GrowAndInsert(lookup_key, key_context), true};
    }

    --growth_budget_;
    CARBON_DCHECK(growth_budget() >= 0)
        << "Growth budget shouldn't have gone negative!";
    return return_insert_at_index(group_index + empty_match.index());
  }

  CARBON_FATAL() << "We should never finish probing without finding the entry "
                    "or an empty slot.";
}

template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
template <typename LookupKeyT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::EraseImpl(
    LookupKeyT lookup_key, KeyContextT key_context) -> bool {
  EntryT* entry = view_impl_.LookupEntry(lookup_key, key_context);
  if (!entry) {
    return false;
  }

  // If there are empty slots in this group then nothing will probe past this
  // group looking for an entry so we can simply set this slot to empty as
  // well. However, if every slot in this group is full, it might be part of
  // a long probe chain that we can't disrupt. In that case we mark the slot's
  // metadata as deleted to keep probes continuing past it.
  //
  // If we mark the slot as empty, we'll also need to increase the growth
  // budget.
  uint8_t* local_metadata = metadata();
  EntryT* local_entries = entries();
  ssize_t index = entry - local_entries;
  ssize_t group_index = index & ~GroupMask;
  auto g = MetadataGroup::Load(local_metadata, group_index);
  auto empty_matched_range = g.MatchEmpty();
  if (empty_matched_range) {
    local_metadata[index] = MetadataGroup::Empty;
    ++growth_budget_;
  } else {
    local_metadata[index] = MetadataGroup::Deleted;
  }

  if constexpr (!EntryT::IsTriviallyDestructible) {
    entry->Destroy();
  }

  return true;
}

template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::ClearImpl() -> void {
  view_impl_.ForEachEntry(
      [](EntryT& entry) {
        if constexpr (!EntryT::IsTriviallyDestructible) {
          entry.Destroy();
        }
      },
      [](uint8_t* metadata_group) {
        // Clear the group.
        std::memset(metadata_group, 0, GroupSize);
      });
  growth_budget_ = GrowthThresholdForAllocSize(alloc_size());
}

// Allocates the appropriate memory layout for a table of the given
// `alloc_size`, with space both for the metadata array and entries.
//
// The returned pointer *must* be deallocated by calling the below `Deallocate`
// function with the same `alloc_size` as used here.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Allocate(
    ssize_t alloc_size) -> Storage* {
  return reinterpret_cast<Storage*>(__builtin_operator_new(
      AllocByteSize(alloc_size), static_cast<std::align_val_t>(Alignment),
      std::nothrow_t()));
}

// Deallocates a table's storage that was allocated with the `Allocate`
// function.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Deallocate(
    Storage* storage, ssize_t alloc_size) -> void {
  ssize_t allocated_size = AllocByteSize(alloc_size);
  // We don't need the size, but make sure it always compiles.
  static_cast<void>(allocated_size);
  __builtin_operator_delete(storage,
#if __cpp_sized_deallocation
                            allocated_size,
#endif
                            static_cast<std::align_val_t>(Alignment));
}

// Construct a table using the provided small storage if `small_alloc_size_`
// is non-zero. If `small_alloc_size_` is zero, then `small_storage` won't be
// used and can be null. Regardless, after this the storage pointer is
// non-null and the size is non-zero so that we can directly begin inserting
// or querying the table.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Construct(
    Storage* small_storage) -> void {
  if (small_alloc_size_ > 0) {
    alloc_size() = small_alloc_size_;
    storage() = small_storage;
  } else {
    // Directly allocate the initial buffer so that the hashtable is never in
    // an empty state.
    alloc_size() = MinAllocatedSize;
    storage() = Allocate(MinAllocatedSize);
  }
  std::memset(metadata(), 0, alloc_size());
  growth_budget_ = GrowthThresholdForAllocSize(alloc_size());
}

// Destroy the current table, releasing any memory used.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Destroy() -> void {
  // Check for a moved-from state and don't do anything. Only a moved-from
  // table has a zero size.
  if (alloc_size() == 0) {
    return;
  }

  // Destroy all the entries.
  if constexpr (!EntryT::IsTriviallyDestructible) {
    view_impl_.ForEachEntry([](EntryT& entry) { entry.Destroy(); },
                            [](auto...) {});
  }

  // If small, nothing to deallocate.
  if (is_small()) {
    return;
  }

  // Just deallocate the storage without updating anything when destroying the
  // object.
  Deallocate(storage(), alloc_size());
}

// Optimized routine to insert a key into a table when that key *definitely*
// isn't present in the table and the table *definitely* has a viable empty
// slot (and growth space) to insert into before any deleted slots. When both
// of these are true, typically just after growth, we can dramatically
// simplify the insert position search.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
template <typename LookupKeyT>
[[clang::noinline]] auto
BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::InsertIntoEmpty(
    LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* {
  HashCode hash = key_context.HashKey(lookup_key, ComputeSeed());
  auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
  uint8_t* local_metadata = metadata();
  EntryT* local_entries = entries();

  for (ProbeSequence s(hash_index, alloc_size());; s.Next()) {
    ssize_t group_index = s.index();
    auto g = MetadataGroup::Load(local_metadata, group_index);

    if (auto empty_match = g.MatchEmpty()) {
      ssize_t index = group_index + empty_match.index();
      local_metadata[index] = tag | MetadataGroup::PresentMask;
      return &local_entries[index];
    }

    // Otherwise we continue probing.
  }
}

// Apply our doubling growth strategy and (re-)check invariants around table
// size.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::ComputeNextAllocSize(
    ssize_t old_alloc_size) -> ssize_t {
  CARBON_DCHECK(llvm::isPowerOf2_64(old_alloc_size))
      << "Expected a power of two!";
  ssize_t new_alloc_size;
  bool overflow = __builtin_mul_overflow(old_alloc_size, 2, &new_alloc_size);
  CARBON_CHECK(!overflow) << "Computing the new size overflowed `ssize_t`!";
  return new_alloc_size;
}

// Compute the growth threshold for a given size.
template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
auto BaseImpl<InputKeyT, InputValueT,
              InputKeyContextT>::GrowthThresholdForAllocSize(ssize_t alloc_size)
    -> ssize_t {
  // We use a 7/8ths load factor to trigger growth.
  return alloc_size - alloc_size / 8;
}
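
// For instance (purely illustrative): a table with `alloc_size == 64` gets a
// threshold of `64 - 64 / 8 == 56`, so at most 56 of its 64 slots may be
// filled or tombstoned before `InsertImpl` exhausts `growth_budget_` and
// doubles the table via `GrowAndInsert`.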
  892. // Grow the hashtable to create space and then insert into it. Returns the
  893. // selected insertion entry. Never returns null. In addition to growing and
  894. // selecting the insertion entry, this routine updates the metadata array so
  895. // that this function can be directly called and the result returned from
  896. // `InsertImpl`.
  897. template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
  898. template <typename LookupKeyT>
  899. [[clang::noinline]] auto
  900. BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::GrowAndInsert(
  901. LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* {
  902. // We collect the probed elements in a small vector for re-insertion. It is
  903. // tempting to reuse the already allocated storage, but doing so appears to
  904. // be a (very slight) performance regression. These are relatively rare and
  905. // storing them into the existing storage creates stores to the same regions
  906. // of memory we're reading. Moreover, it requires moving both the key and the
  907. // value twice, and doing the `memcpy` widening for relocatable types before
  908. // the group walk rather than after the group walk. In practice, between the
  909. // statistical rareness and using a large small size buffer here on the stack,
  910. // we can handle this most efficiently with temporary, additional storage.
  911. llvm::SmallVector<ssize_t, 128> probed_indices;
  912. // We grow into a new `MapBase` so that both the new and old maps are
  913. // fully functional until all the entries are moved over. However, we directly
  914. // manipulate the internals to short circuit many aspects of the growth.
  915. ssize_t old_size = alloc_size();
  916. CARBON_DCHECK(old_size > 0);
  917. CARBON_DCHECK(growth_budget_ == 0);
  918. bool old_small = is_small();
  919. Storage* old_storage = storage();
  920. uint8_t* old_metadata = metadata();
  921. EntryT* old_entries = entries();
  922. #ifndef NDEBUG
  923. // Count how many of the old table slots will end up being empty after we grow
  924. // the table. This is both the currently empty slots, but also the deleted
  925. // slots because we clear them to empty and re-insert everything that had any
  926. // probing.
  927. ssize_t debug_empty_count =
  928. llvm::count(llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Empty);
  929. ssize_t debug_deleted_count = llvm::count(
  930. llvm::ArrayRef(old_metadata, old_size), MetadataGroup::Deleted);
  931. CARBON_DCHECK(debug_empty_count >=
  932. (old_size - GrowthThresholdForAllocSize(old_size)))
  933. << "debug_empty_count: " << debug_empty_count
  934. << ", debug_deleted_count: " << debug_deleted_count
  935. << ", size: " << old_size;
  936. #endif
  937. // Compute the new size and grow the storage in place (if possible).
  938. ssize_t new_size = ComputeNextAllocSize(old_size);
  939. alloc_size() = new_size;
  940. storage() = Allocate(new_size);
  941. growth_budget_ = GrowthThresholdForAllocSize(new_size);
  942. // Now extract the new components of the table.
  943. uint8_t* new_metadata = metadata();
  944. EntryT* new_entries = entries();
  945. // We always double the size when we grow. This allows an important
  946. // optimization -- we're adding exactly one more high bit to the hash-computed
  947. // index for each entry. This in turn means we can classify every entry in the
  948. // table into three cases:
  949. //
  950. // 1) The new high bit is zero, the entry is at the same index in the new
  951. // table as the old.
  952. //
  953. // 2) The new high bit is one, the entry is at the old index plus the old
  954. // size.
  955. //
  956. // 3) The entry's current index doesn't match the initial hash index because
  957. // it required some amount of probing to find an empty slot.
  958. //
  959. // The design of the hash table tries to minimize how many entries fall into
  960. // case (3), so we expect the vast majority of entries to be in (1) or (2).
  961. // This lets us model growth notionally as duplicating the hash table,
  962. // clearing out the empty slots, and inserting any probed elements.
  963. ssize_t count = 0;
  964. for (ssize_t group_index = 0; group_index < old_size;
  965. group_index += GroupSize) {
  966. auto low_g = MetadataGroup::Load(old_metadata, group_index);
  967. // Make sure to match present elements first to enable pipelining with
  968. // clearing.
  969. auto present_matched_range = low_g.MatchPresent();
  970. low_g.ClearDeleted();
  971. MetadataGroup high_g;
  972. if constexpr (MetadataGroup::FastByteClear) {
  973. // When we have a fast byte clear, we can update the metadata for the
  974. // growth in-register and store at the end.
  975. high_g = low_g;
  976. } else {
  977. // If we don't have a fast byte clear, we can store the metadata group
  978. // eagerly here and overwrite bytes with a byte store below instead of
  979. // clearing the byte in-register.
  980. low_g.Store(new_metadata, group_index);
  981. low_g.Store(new_metadata, group_index | old_size);
  982. }
  983. for (ssize_t byte_index : present_matched_range) {
  984. ++count;
  985. ssize_t old_index = group_index + byte_index;
  986. if constexpr (!MetadataGroup::FastByteClear) {
  987. CARBON_DCHECK(new_metadata[old_index] == old_metadata[old_index]);
  988. CARBON_DCHECK(new_metadata[old_index | old_size] ==
  989. old_metadata[old_index]);
  990. }
  991. HashCode hash =
  992. key_context.HashKey(old_entries[old_index].key(), ComputeSeed());
  993. ssize_t old_hash_index = hash.ExtractIndexAndTag<7>().first &
  994. ComputeProbeMaskFromSize(old_size);
  995. if (LLVM_UNLIKELY(old_hash_index != group_index)) {
  996. probed_indices.push_back(old_index);
  997. if constexpr (MetadataGroup::FastByteClear) {
  998. low_g.ClearByte(byte_index);
  999. high_g.ClearByte(byte_index);
  1000. } else {
  1001. new_metadata[old_index] = MetadataGroup::Empty;
  1002. new_metadata[old_index | old_size] = MetadataGroup::Empty;
  1003. }
  1004. continue;
  1005. }
  1006. ssize_t new_index = hash.ExtractIndexAndTag<7>().first &
  1007. ComputeProbeMaskFromSize(new_size);
  1008. CARBON_DCHECK(new_index == old_hash_index ||
  1009. new_index == (old_hash_index | old_size));
  1010. // Toggle the newly added bit of the index to get to the other possible
  1011. // target index.
      if constexpr (MetadataGroup::FastByteClear) {
        (new_index == old_hash_index ? high_g : low_g).ClearByte(byte_index);
        new_index += byte_index;
      } else {
        new_index += byte_index;
        new_metadata[new_index ^ old_size] = MetadataGroup::Empty;
      }

      // If we need to explicitly move (and destroy) the key or value, do so
      // here where we already know its target.
      if constexpr (!EntryT::IsTriviallyRelocatable) {
        new_entries[new_index].MoveFrom(std::move(old_entries[old_index]));
      }
    }
    if constexpr (MetadataGroup::FastByteClear) {
      low_g.Store(new_metadata, group_index);
      high_g.Store(new_metadata, (group_index | old_size));
    }
  }
  CARBON_DCHECK((count - static_cast<ssize_t>(probed_indices.size())) ==
                (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
                                        MetadataGroup::Empty)));
#ifndef NDEBUG
  CARBON_DCHECK((debug_empty_count + debug_deleted_count) ==
                (old_size - count));
  CARBON_DCHECK(llvm::count(llvm::ArrayRef(new_metadata, new_size),
                            MetadataGroup::Empty) ==
                debug_empty_count + debug_deleted_count +
                    static_cast<ssize_t>(probed_indices.size()) + old_size);
#endif
  // If the keys or values are trivially relocatable, we do a bulk memcpy of
  // them into place. This will copy them into both possible locations, which
  // is fine. One will be empty, and either clobbered when reused or simply
  // ignored; the other will be the one used. This might seem to require that
  // making two copies is valid, but it doesn't: it produces the exact same
  // storage as copying the entry into the wrong location first, and then
  // again into the correct location. Only one is live and only one is
  // destroyed.
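  //
  // For example (an illustrative sketch; a trivially relocatable entry type
  // and old_size == 4 are assumptions): if slot 2 holds an entry, the two
  // memcpys below leave byte-identical bytes at new indices 2 and 6 (2 | 4).
  // Exactly one of those slots is marked present in the new metadata, so only
  // that copy is ever treated as a live entry; the other is dead bytes that a
  // later insertion is free to clobber.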
  if constexpr (EntryT::IsTriviallyRelocatable) {
    memcpy(new_entries, old_entries, old_size * sizeof(EntryT));
    memcpy(new_entries + old_size, old_entries, old_size * sizeof(EntryT));
  }

  // We then need to do a normal insertion for anything that was probed before
  // growth, but we know we'll find an empty slot, so leverage that.
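  // (An explanatory note, not original source text: both duplicated positions
  // of each probed entry were cleared to empty above, so this insertion
  // probes from the entry's true hash group and, with the table freshly
  // doubled, is guaranteed to reach an empty slot rather than a tombstone.)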
  for (ssize_t old_index : probed_indices) {
    EntryT* new_entry =
        InsertIntoEmpty(old_entries[old_index].key(), key_context);
    new_entry->MoveFrom(std::move(old_entries[old_index]));
  }
  CARBON_DCHECK(count ==
                (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
                                        MetadataGroup::Empty)));
  growth_budget_ -= count;
  CARBON_DCHECK(growth_budget_ ==
                (GrowthThresholdForAllocSize(new_size) -
                 (new_size - llvm::count(llvm::ArrayRef(new_metadata, new_size),
                                         MetadataGroup::Empty))));
  CARBON_DCHECK(growth_budget_ > 0 &&
                "Must still have a growth budget after rehash!");

  if (!old_small) {
    // Old isn't a small buffer, so we need to deallocate it.
    Deallocate(old_storage, old_size);
  }

  // And lastly insert the lookup_key into an index in the newly grown map and
  // return that index for use.
  --growth_budget_;
  return InsertIntoEmpty(lookup_key, key_context);
}

template <typename InputBaseT, ssize_t SmallSize>
TableImpl<InputBaseT, SmallSize>::TableImpl(const TableImpl& arg)
    : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) {
  CARBON_DCHECK(arg.small_alloc_size_ == SmallSize);
  ssize_t local_size = arg.alloc_size();
  if (SmallSize > 0 && arg.is_small()) {
    CARBON_DCHECK(local_size == SmallSize);
    this->storage() = small_storage();
  } else {
    this->storage() = BaseT::Allocate(local_size);
  }

  // Preserve which slot every entry is in, including tombstones in the
  // metadata, in order to copy into the new table's storage without rehashing
  // all of the keys. This is especially important as we don't have an easy way
  // to access the key context needed for rehashing here.
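  //
  // Concretely (an illustrative example, not original source text): if the
  // source table holds a tombstone that some entry once probed past, turning
  // that tombstone into an empty slot in the copy would cut that entry's
  // probe sequence short and break lookups; copying the metadata
  // byte-for-byte preserves every probe sequence without rehashing.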
  uint8_t* local_metadata = this->metadata();
  EntryT* local_entries = this->entries();
  const uint8_t* local_arg_metadata = arg.metadata();
  const EntryT* local_arg_entries = arg.entries();
  memcpy(local_metadata, local_arg_metadata, local_size);
  for (ssize_t group_index = 0; group_index < local_size;
       group_index += GroupSize) {
    auto g = MetadataGroup::Load(local_arg_metadata, group_index);
    for (ssize_t byte_index : g.MatchPresent()) {
      local_entries[group_index + byte_index].CopyFrom(
          local_arg_entries[group_index + byte_index]);
    }
  }
}

// Puts the incoming table into a moved-from state that can be destroyed or
// re-initialized but must not be used otherwise.
template <typename InputBaseT, ssize_t SmallSize>
TableImpl<InputBaseT, SmallSize>::TableImpl(TableImpl&& arg) noexcept
    : BaseT(arg.alloc_size(), arg.growth_budget_, SmallSize) {
  CARBON_DCHECK(arg.small_alloc_size_ == SmallSize);
  ssize_t local_size = arg.alloc_size();
  if (SmallSize > 0 && arg.is_small()) {
    CARBON_DCHECK(local_size == SmallSize);
    this->storage() = small_storage();
    // For small tables, we have to move the entries as we can't move the
    // tables themselves. We do this preserving their slots and even tombstones
    // to avoid rehashing.
    uint8_t* local_metadata = this->metadata();
    EntryT* local_entries = this->entries();
    uint8_t* local_arg_metadata = arg.metadata();
    EntryT* local_arg_entries = arg.entries();
    memcpy(local_metadata, local_arg_metadata, local_size);
    if constexpr (EntryT::IsTriviallyRelocatable) {
      memcpy(local_entries, local_arg_entries, SmallSize * sizeof(EntryT));
    } else {
      for (ssize_t group_index = 0; group_index < local_size;
           group_index += GroupSize) {
        auto g = MetadataGroup::Load(local_arg_metadata, group_index);
        for (ssize_t byte_index : g.MatchPresent()) {
          local_entries[group_index + byte_index].MoveFrom(
              std::move(local_arg_entries[group_index + byte_index]));
        }
      }
    }
  } else {
    // Just point to the allocated storage.
    this->storage() = arg.storage();
  }

  // Finally, put the incoming table into a moved-from state.
  arg.alloc_size() = 0;
  // Replace the pointer with null to ease debugging.
  arg.storage() = nullptr;
}

// Reset a table to its original state, including releasing any allocated
// memory.
template <typename InputBaseT, ssize_t SmallSize>
auto TableImpl<InputBaseT, SmallSize>::ResetImpl() -> void {
  this->Destroy();
  // Re-initialize the whole thing.
  CARBON_DCHECK(this->small_alloc_size() == SmallSize);
  this->Construct(small_storage());
}

template <typename InputBaseT, ssize_t SmallSize>
auto TableImpl<InputBaseT, SmallSize>::small_storage() const -> Storage* {
  if constexpr (SmallSize > 0) {
    // Do a bunch of validation of the small size to establish our invariants
    // when we know we have a non-zero small size.
    static_assert(llvm::isPowerOf2_64(SmallSize),
                  "SmallSize must be a power of two for a hashed buffer!");
    static_assert(
        SmallSize >= MaxGroupSize,
        "We require all small sizes to be multiples of the largest group "
        "size supported to ensure they can be used portably.");
    static_assert(
        (SmallSize % MaxGroupSize) == 0,
        "Small size must be a multiple of the max group size supported "
        "so that we can allocate a whole number of groups.");
    // Implied by the max asserts above.
    static_assert(SmallSize >= GroupSize);
    static_assert((SmallSize % GroupSize) == 0);
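    // As a concrete illustration (GroupSize == MaxGroupSize == 16 is an
    // assumed target configuration, not fixed by this header): SmallSize
    // values of 16, 32, or 64 satisfy the asserts above, while 8 (smaller
    // than MaxGroupSize) and 24 (neither a power of two nor a multiple of 16)
    // would fail to compile.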
    static_assert(SmallSize >= alignof(StorageEntry<KeyT, ValueT>),
                  "Requested a small size that would require padding between "
                  "metadata bytes and correctly aligned key and value types. "
                  "Either a larger small size or a zero small size and heap "
                  "allocation are required for this key and value type.");
    static_assert(offsetof(SmallStorage, entries) == SmallSize,
                  "Offset to entries in small size storage doesn't match "
                  "computed offset!");
    return &small_storage_;
  } else {
    static_assert(
        sizeof(TableImpl) == sizeof(BaseT),
        "Empty small storage caused a size difference and wasted space!");
    return nullptr;
  }
}

}  // namespace Carbon::RawHashtable

#endif  // CARBON_COMMON_RAW_HASHTABLE_H_