Jelajahi Sumber

Collect more detailed metrics on hashtables. (#4046)

Previously we just looked at the raw count of probed keys. Now, we
compute the average and max of both the probe _distance_ measured in the
number of _groups_ probed, and the number of probe _compares_ measured
in the compares required _before_ finding the matching entry.

This lets us understand the relative impact of probe-distance vs. tag
collisions on a given set of benchmark keys. Some of this is motivated
by considering additional optimization techniques similar to those used
in Boost's table and the F14 table from Facebook/Meta.

---------

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Chandler Carruth 1 tahun lalu
induk
melakukan
3be57b71e0
6 mengubah file dengan 176 tambahan dan 56 penghapusan
  1. 10 12
      common/map.h
  2. 21 9
      common/map_benchmark.cpp
  3. 100 16
      common/raw_hashtable.h
  4. 33 0
      common/raw_hashtable_benchmark_helpers.h
  5. 10 12
      common/set.h
  6. 2 7
      common/set_benchmark.cpp

+ 10 - 12
common/map.h

@@ -66,6 +66,7 @@ class MapView
   using KeyT = typename ImplT::KeyT;
   using ValueT = typename ImplT::ValueT;
   using KeyContextT = typename ImplT::KeyContextT;
+  using MetricsT = typename ImplT::MetricsT;
 
   // This type represents the result of lookup operations. It encodes whether
   // the lookup was a success as well as accessors for the key and value.
@@ -117,15 +118,11 @@ class MapView
     requires(std::invocable<CallbackT, KeyT&, ValueT&>);
 
   // This routine is relatively inefficient and only intended for use in
-  // benchmarking or logging of performance anomalies. The specific count
-  // returned has no specific guarantees beyond being informative in benchmarks.
-  // It counts how many of the keys in the hashtable have required probing
-  // beyond their initial group of slots.
-  //
-  // TODO: Replace with a more general metrics routine that covers other
-  // important aspects such as load factor, and average probe *distance*.
-  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t {
-    return ImplT::CountProbedKeys(key_context);
+  // benchmarking or logging of performance anomalies. The specific metrics
+  // returned have no specific guarantees beyond being informative in
+  // benchmarks.
+  auto ComputeMetrics(KeyContextT key_context = KeyContextT()) -> MetricsT {
+    return ImplT::ComputeMetricsImpl(key_context);
   }
 
  private:
@@ -165,6 +162,7 @@ class MapBase : protected RawHashtable::BaseImpl<InputKeyT, InputValueT,
   using KeyContextT = typename ImplT::KeyContextT;
   using ViewT = MapView<KeyT, ValueT, KeyContextT>;
   using LookupKVResult = typename ViewT::LookupKVResult;
+  using MetricsT = typename ImplT::MetricsT;
 
   // The result type for insertion operations both indicates whether an insert
   // was needed (as opposed to finding an existing element), and provides access
@@ -233,9 +231,9 @@ class MapBase : protected RawHashtable::BaseImpl<InputKeyT, InputValueT,
   }
 
   // Convenience forwarder to the view type.
-  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const
-      -> ssize_t {
-    return ViewT(*this).CountProbedKeys(key_context);
+  auto ComputeMetrics(KeyContextT key_context = KeyContextT()) const
+      -> MetricsT {
+    return ViewT(*this).ComputeMetrics(key_context);
   }
 
   // Insert a key and value into the map. If the key is already present, the new

+ 21 - 9
common/map_benchmark.cpp

@@ -18,6 +18,7 @@ using RawHashtable::CarbonHashDI;
 using RawHashtable::GetKeysAndHitKeys;
 using RawHashtable::GetKeysAndMissKeys;
 using RawHashtable::HitArgs;
+using RawHashtable::ReportTableMetrics;
 using RawHashtable::SizeArgs;
 using RawHashtable::ValueToBool;
 
@@ -159,6 +160,15 @@ template <typename MapT>
 using MapWrapper =
     MapWrapperOverride<MapT, MapOverride::CARBON_MAP_BENCH_OVERRIDE>;
 
+template <typename MapT>
+auto ReportMetrics(const MapWrapper<MapT>& m_wrapper, benchmark::State& state)
+    -> void {
+  // Report some extra statistics about the Carbon type.
+  if constexpr (IsCarbonMap<MapT>) {
+    ReportTableMetrics(m_wrapper.m, state);
+  }
+}
+
 // NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
 #define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT, VT)        \
   BENCHMARK(NAME<Map<KT, VT>>)->Apply(APPLY);                 \
@@ -223,6 +233,8 @@ static void BM_MapContainsHit(benchmark::State& state) {
       i += static_cast<ssize_t>(result);
     }
   }
+
+  ReportMetrics(m, state);
 }
 MAP_BENCHMARK_ONE_OP(BM_MapContainsHit, HitArgs);
 
@@ -250,6 +262,8 @@ static void BM_MapContainsMiss(benchmark::State& state) {
       i += static_cast<ssize_t>(!result);
     }
   }
+
+  ReportMetrics(m, state);
 }
 MAP_BENCHMARK_ONE_OP(BM_MapContainsMiss, SizeArgs);
 
@@ -302,6 +316,8 @@ static void BM_MapLookupHit(benchmark::State& state) {
       i += static_cast<ssize_t>(result);
     }
   }
+
+  ReportMetrics(m, state);
 }
 MAP_BENCHMARK_ONE_OP(BM_MapLookupHit, HitArgs);
 
@@ -339,6 +355,8 @@ static void BM_MapUpdateHit(benchmark::State& state) {
       CARBON_DCHECK(!inserted);
     }
   }
+
+  ReportMetrics(m, state);
 }
 MAP_BENCHMARK_ONE_OP(BM_MapUpdateHit, HitArgs);
 
@@ -454,19 +472,13 @@ static void BM_MapInsertSeq(benchmark::State& state) {
   if constexpr (IsCarbonMap<MapT>) {
     // Re-build a map outside of the timing loop to look at the statistics
     // rather than the timing.
-    MapT m;
+    MapWrapperT m;
     for (auto k : keys) {
-      bool inserted = m.Insert(k, MakeValue<VT>()).is_inserted();
+      bool inserted = m.BenchInsert(k, MakeValue<VT>());
       CARBON_DCHECK(inserted) << "Must be a successful insert!";
     }
 
-    // While this count is "iteration invariant" (it should be exactly the same
-    // for every iteration as the set of keys is the same), we don't use that
-    // because it will scale this by the number of iterations. We want to
-    // display the probe count of this benchmark *parameter*, not the probe
-    // count that resulted from the number of iterations. That means we use the
-    // normal counter API without flags.
-    state.counters["Probed"] = m.CountProbedKeys();
+    ReportMetrics(m, state);
 
     // Uncomment this call to print out statistics about the index-collisions
     // among these keys for debugging:

+ 100 - 16
common/raw_hashtable.h

@@ -9,6 +9,7 @@
 #include <concepts>
 #include <cstddef>
 #include <cstring>
+#include <iterator>
 #include <new>
 #include <type_traits>
 #include <utility>
@@ -255,6 +256,38 @@ struct StorageEntry<KeyT, void> {
   alignas(KeyT) std::byte key_storage[sizeof(KeyT)];
 };
 
+struct Metrics {
+  // How many keys are present in the table.
+  ssize_t key_count = 0;
+  // How many slots of the table are reserved due to deleted markers required to
+  // preserve probe sequences.
+  ssize_t deleted_count = 0;
+  // How many bytes of allocated storage are used by the table. Note, does not
+  // include the table object or any small-size buffer.
+  ssize_t storage_bytes = 0;
+
+  // How many keys have required probing beyond the initial group. These are the
+  // keys with a probe distance > 0.
+  ssize_t probed_key_count = 0;
+  // The probe distance averaged over every key. If every key is in its initial
+  // group, this will be zero as no keys will have a larger probe distance. In
+  // general, we want this to be as close to zero as possible.
+  double probe_avg_distance = 0.0;
+  // The maximum probe distance found for a single key in the table.
+  ssize_t probe_max_distance = 0;
+  // The average number of probing comparisons required to locate a specific key
+  // in the table. This is how many comparisons are required *before* the key is
+  // located, or the *failed* comparisons. We always have to do one successful
+  // comparison at the end. This successful comparison isn't counted because
+  // that focuses this metric on the overhead the table is introducing, and
+  // keeps a "perfect" table with an average of `0.0` here, similar to the
+  // perfect average probe distance of `0.0`.
+  double probe_avg_compares = 0.0;
+  // The maximum number of probing comparisons required to locate a specific
+  // key in the table.
+  ssize_t probe_max_compares = 0;
+};
+
 // A placeholder empty type used to model pointers to the allocated buffer of
 // storage.
 //
@@ -301,6 +334,7 @@ class ViewImpl {
   using ValueT = InputValueT;
   using KeyContextT = InputKeyContextT;
   using EntryT = StorageEntry<KeyT, ValueT>;
+  using MetricsT = Metrics;
 
   friend class BaseImpl<KeyT, ValueT, KeyContextT>;
 
@@ -335,9 +369,10 @@ class ViewImpl {
   auto ForEachEntry(EntryCallbackT entry_callback,
                     GroupCallbackT group_callback) const -> void;
 
-  // Counts the number of keys in the hashtable that required probing beyond the
-  // initial group.
-  auto CountProbedKeys(KeyContextT key_context) const -> ssize_t;
+  // Returns a collection of informative metrics on the current state of the
+  // table, useful for performance analysis. These include relatively slow to
+  // compute metrics requiring deep inspection of the table's state.
+  auto ComputeMetricsImpl(KeyContextT key_context) const -> MetricsT;
 
  private:
   ViewImpl(ssize_t alloc_size, Storage* storage)
@@ -358,6 +393,11 @@ class ViewImpl {
     return alloc_size;
   }
 
+  // Compute the allocated table's byte size.
+  static constexpr auto AllocByteSize(ssize_t alloc_size) -> ssize_t {
+    return EntriesOffset(alloc_size) + sizeof(EntryT) * alloc_size;
+  }
+
   auto metadata() const -> uint8_t* {
     return reinterpret_cast<uint8_t*>(storage_);
   }
@@ -388,6 +428,7 @@ class BaseImpl {
   using KeyContextT = InputKeyContextT;
   using ViewImplT = ViewImpl<KeyT, ValueT, KeyContextT>;
   using EntryT = typename ViewImplT::EntryT;
+  using MetricsT = typename ViewImplT::MetricsT;
 
   BaseImpl(int small_alloc_size, Storage* small_storage)
       : small_alloc_size_(small_alloc_size) {
@@ -447,9 +488,6 @@ class BaseImpl {
   template <>
   struct SmallStorage<0> {};
 
-  static constexpr auto AllocByteSize(ssize_t alloc_size) -> ssize_t {
-    return ViewImplT::EntriesOffset(alloc_size) + sizeof(EntryT) * alloc_size;
-  }
   static auto Allocate(ssize_t alloc_size) -> Storage*;
   static auto Deallocate(Storage* storage, ssize_t alloc_size) -> void;
 
@@ -709,26 +747,72 @@ ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::ForEachEntry(
 }
 
 template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
-auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::CountProbedKeys(
-    KeyContextT key_context) const -> ssize_t {
+auto ViewImpl<InputKeyT, InputValueT, InputKeyContextT>::ComputeMetricsImpl(
+    KeyContextT key_context) const -> Metrics {
   uint8_t* local_metadata = metadata();
   EntryT* local_entries = entries();
   ssize_t local_size = alloc_size_;
-  ssize_t count = 0;
+
+  Metrics metrics;
+
+  // Compute the ones we can directly.
+  metrics.deleted_count = llvm::count(
+      llvm::ArrayRef(local_metadata, local_size), MetadataGroup::Deleted);
+  metrics.storage_bytes = AllocByteSize(local_size);
+
+  // We want to process present slots specially to collect metrics on their
+  // probing behavior.
   for (ssize_t group_index = 0; group_index < local_size;
        group_index += GroupSize) {
     auto g = MetadataGroup::Load(local_metadata, group_index);
     auto present_matched_range = g.MatchPresent();
     for (ssize_t byte_index : present_matched_range) {
+      ++metrics.key_count;
       ssize_t index = group_index + byte_index;
       HashCode hash =
           key_context.HashKey(local_entries[index].key(), ComputeSeed());
-      ssize_t hash_index = hash.ExtractIndexAndTag<7>().first &
-                           ComputeProbeMaskFromSize(local_size);
-      count += static_cast<ssize_t>(hash_index != group_index);
+      auto [hash_index, tag] = hash.ExtractIndexAndTag<7>();
+      ProbeSequence s(hash_index, local_size);
+      metrics.probed_key_count +=
+          static_cast<ssize_t>(s.index() != group_index);
+
+      // For each probed key, go through the probe sequence to find both the
+      // probe distance and how many comparisons are required.
+      ssize_t distance = 0;
+      ssize_t compares = 0;
+      for (; s.index() != group_index; s.Next()) {
+        auto probe_g = MetadataGroup::Load(local_metadata, s.index());
+        auto probe_matched_range = probe_g.Match(tag);
+        compares += std::distance(probe_matched_range.begin(),
+                                  probe_matched_range.end());
+        distance += 1;
+      }
+
+      auto probe_g = MetadataGroup::Load(local_metadata, s.index());
+      auto probe_matched_range = probe_g.Match(tag);
+      CARBON_CHECK(!probe_matched_range.empty());
+      for (ssize_t match_index : probe_matched_range) {
+        if (match_index >= byte_index) {
+          // Note we only count the compares that will *fail* as part of
+          // probing. The last successful compare isn't interesting, it is
+          // always needed.
+          break;
+        }
+        compares += 1;
+      }
+      metrics.probe_avg_distance += distance;
+      metrics.probe_max_distance =
+          std::max(metrics.probe_max_distance, distance);
+      metrics.probe_avg_compares += compares;
+      metrics.probe_max_compares =
+          std::max(metrics.probe_max_compares, compares);
     }
   }
-  return count;
+  if (metrics.key_count > 0) {
+    metrics.probe_avg_compares /= metrics.key_count;
+    metrics.probe_avg_distance /= metrics.key_count;
+  }
+  return metrics;
 }
 
 template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
@@ -888,8 +972,8 @@ template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
 auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Allocate(
     ssize_t alloc_size) -> Storage* {
   return reinterpret_cast<Storage*>(__builtin_operator_new(
-      AllocByteSize(alloc_size), static_cast<std::align_val_t>(Alignment),
-      std::nothrow_t()));
+      ViewImplT::AllocByteSize(alloc_size),
+      static_cast<std::align_val_t>(Alignment), std::nothrow_t()));
 }
 
 // Deallocates a table's storage that was allocated with the `Allocate`
@@ -897,7 +981,7 @@ auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Allocate(
 template <typename InputKeyT, typename InputValueT, typename InputKeyContextT>
 auto BaseImpl<InputKeyT, InputValueT, InputKeyContextT>::Deallocate(
     Storage* storage, ssize_t alloc_size) -> void {
-  ssize_t allocated_size = AllocByteSize(alloc_size);
+  ssize_t allocated_size = ViewImplT::AllocByteSize(alloc_size);
   // We don't need the size, but make sure it always compiles.
   static_cast<void>(allocated_size);
   __builtin_operator_delete(storage,

+ 33 - 0
common/raw_hashtable_benchmark_helpers.h

@@ -203,6 +203,39 @@ struct CarbonHashDI<llvm::StringRef> {
   }
 };
 
+template <typename TableT>
+auto ReportTableMetrics(const TableT& table, benchmark::State& state) -> void {
+  // While this count is "iteration invariant" (it should be exactly the same
+  // for every iteration as the set of keys is the same), we don't use that
+  // because it will scale this by the number of iterations. We want to
+  // display the metrics for this benchmark *parameter*, not what resulted
+  // from the number of iterations. That means we use the normal counter API
+  // without flags.
+  auto metrics = table.ComputeMetrics();
+  state.counters["P-compares"] = metrics.probe_avg_compares;
+  state.counters["P-distance"] = metrics.probe_avg_distance;
+  state.counters["P-fraction"] =
+      static_cast<double>(metrics.probed_key_count) / metrics.key_count;
+  state.counters["Pmax-distance"] = metrics.probe_max_distance;
+  state.counters["Pmax-compares"] = metrics.probe_max_compares;
+  state.counters["Probed"] = metrics.probed_key_count;
+
+  state.counters["Storage"] = metrics.storage_bytes;
+
+  // Also compute how 'efficient' the storage is, 1.0 being zero bytes outside
+  // of key and value.
+  ssize_t element_size;
+  if constexpr (requires { TableT::ValueT; }) {
+    element_size =
+        sizeof(typename TableT::KeyT) + sizeof(typename TableT::ValueT);
+  } else {
+    element_size = sizeof(typename TableT::KeyT);
+  }
+  state.counters["Storage eff"] =
+      static_cast<double>(metrics.key_count * element_size) /
+      metrics.storage_bytes;
+}
+
 }  // namespace Carbon::RawHashtable
 
 #endif  // CARBON_COMMON_RAW_HASHTABLE_BENCHMARK_HELPERS_H_

+ 10 - 12
common/set.h

@@ -59,6 +59,7 @@ class SetView : RawHashtable::ViewImpl<InputKeyT, void, InputKeyContextT> {
  public:
   using KeyT = typename ImplT::KeyT;
   using KeyContextT = typename ImplT::KeyContextT;
+  using MetricsT = typename ImplT::MetricsT;
 
   // This type represents the result of lookup operations. It encodes whether
   // the lookup was a success as well as accessors for the key.
@@ -97,15 +98,11 @@ class SetView : RawHashtable::ViewImpl<InputKeyT, void, InputKeyContextT> {
     requires(std::invocable<CallbackT, KeyT&>);
 
   // This routine is relatively inefficient and only intended for use in
-  // benchmarking or logging of performance anomalies. The specific count
-  // returned has no specific guarantees beyond being informative in benchmarks.
-  // It counts how many of the keys in the hashtable have required probing
-  // beyond their initial group of slots.
-  //
-  // TODO: Replace with a more general metrics routine that covers other
-  // important aspects such as load factor, and average probe *distance*.
-  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) -> ssize_t {
-    return ImplT::CountProbedKeys(key_context);
+  // benchmarking or logging of performance anomalies. The specific metrics
+  // returned have no specific guarantees beyond being informative in
+  // benchmarks.
+  auto ComputeMetrics(KeyContextT key_context = KeyContextT()) -> MetricsT {
+    return ImplT::ComputeMetricsImpl(key_context);
   }
 
  private:
@@ -140,6 +137,7 @@ class SetBase
   using KeyContextT = typename ImplT::KeyContextT;
   using ViewT = SetView<KeyT, KeyContextT>;
   using LookupResult = typename ViewT::LookupResult;
+  using MetricsT = typename ImplT::MetricsT;
 
   // The result type for insertion operations both indicates whether an insert
   // was needed (as opposed to the key already being in the set), and provides
@@ -193,9 +191,9 @@ class SetBase
   }
 
   // Convenience forwarder to the view type.
-  auto CountProbedKeys(KeyContextT key_context = KeyContextT()) const
-      -> ssize_t {
-    return ViewT(*this).CountProbedKeys(key_context);
+  auto ComputeMetrics(KeyContextT key_context = KeyContextT()) const
+      -> MetricsT {
+    return ViewT(*this).ComputeMetrics(key_context);
   }
 
   // Insert a key into the set. If the key is already present, no insertion is

+ 2 - 7
common/set_benchmark.cpp

@@ -16,6 +16,7 @@ using RawHashtable::CarbonHashDI;
 using RawHashtable::GetKeysAndHitKeys;
 using RawHashtable::GetKeysAndMissKeys;
 using RawHashtable::HitArgs;
+using RawHashtable::ReportTableMetrics;
 using RawHashtable::SizeArgs;
 using RawHashtable::ValueToBool;
 
@@ -362,13 +363,7 @@ static void BM_SetInsertSeq(benchmark::State& state) {
       CARBON_DCHECK(inserted) << "Must be a successful insert!";
     }
 
-    // While this count is "iteration invariant" (it should be exactly the same
-    // for every iteration as the set of keys is the same), we don't use that
-    // because it will scale this by the number of iterations. We want to
-    // display the probe count of this benchmark *parameter*, not the probe
-    // count that resulted from the number of iterations. That means we use the
-    // normal counter API without flags.
-    state.counters["Probed"] = s.CountProbedKeys();
+    ReportTableMetrics(s, state);
 
     // Uncomment this call to print out statistics about the index-collisions
     // among these keys for debugging: