set_benchmark.cpp

// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>

#include <cstdint>
#include <type_traits>

#include "absl/container/flat_hash_set.h"
#include "common/raw_hashtable_benchmark_helpers.h"
#include "common/set.h"
#include "llvm/ADT/DenseSet.h"
namespace Carbon {
namespace {

using RawHashtable::CarbonHashDI;
using RawHashtable::GetKeysAndHitKeys;
using RawHashtable::GetKeysAndMissKeys;
using RawHashtable::HitArgs;
using RawHashtable::SizeArgs;
using RawHashtable::ValueToBool;

template <typename SetT>
struct IsCarbonSetImpl : std::false_type {};
template <typename KT, int MinSmallSize>
struct IsCarbonSetImpl<Set<KT, MinSmallSize>> : std::true_type {};

template <typename SetT>
static constexpr bool IsCarbonSet = IsCarbonSetImpl<SetT>::value;

// A wrapper around various set types that we specialize to implement a common
// API used in the benchmarks across set data structures with differing native
// APIs. The primary template assumes a roughly `std::unordered_set` API
// design, and types with a different API design are supported through
// specializations.
template <typename SetT>
struct SetWrapperImpl {
  using KeyT = typename SetT::key_type;

  SetT s;

  auto BenchContains(KeyT k) -> bool { return s.find(k) != s.end(); }

  auto BenchLookup(KeyT k) -> bool {
    auto it = s.find(k);
    if (it == s.end()) {
      return false;
    }
    // We expect keys to always convert to `true` so directly return that here.
    return ValueToBool(*it);
  }

  auto BenchInsert(KeyT k) -> bool {
    auto result = s.insert(k);
    return result.second;
  }

  auto BenchErase(KeyT k) -> bool { return s.erase(k) != 0; }
};
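
// An illustrative (hypothetical) smoke test of the wrapper surface; it is not
// part of the benchmark suite. It shows the uniform `Bench*` API that every
// benchmark below is written against, exercised here through the primary
// template with `absl::flat_hash_set`. This assumes `ValueToBool` maps a
// nonzero `int` key to `true`, matching how the benchmarks use it.
[[maybe_unused]] static auto ExampleWrapperUsage() -> bool {
  SetWrapperImpl<absl::flat_hash_set<int>> w;
  bool ok = w.BenchInsert(42);  // Newly inserted -> true.
  ok &= w.BenchContains(42);    // Present -> true.
  ok &= w.BenchLookup(42);      // Found, and the key converts to `true`.
  ok &= w.BenchErase(42);       // Was present -> true.
  return ok;
}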

// Explicit (partial) specialization for the Carbon set type that uses its
// different API design.
template <typename KT, int MinSmallSize>
struct SetWrapperImpl<Set<KT, MinSmallSize>> {
  using SetT = Set<KT, MinSmallSize>;
  using KeyT = KT;

  SetT s;

  auto BenchContains(KeyT k) -> bool { return s.Contains(k); }

  auto BenchLookup(KeyT k) -> bool {
    auto result = s.Lookup(k);
    if (!result) {
      return false;
    }
    return ValueToBool(result.key());
  }

  auto BenchInsert(KeyT k) -> bool {
    auto result = s.Insert(k);
    return result.is_inserted();
  }

  auto BenchErase(KeyT k) -> bool { return s.Erase(k); }
};

// Provide a way to override the Carbon `Set`-specific benchmark runs with
// another hashtable implementation. When building, you can use one of these
// enum names in a macro define such as `-DCARBON_SET_BENCH_OVERRIDE=Name` in
// order to trigger a specific override for the `Set` type benchmarks. This is
// used to get before/after runs that compare the performance of Carbon's
// `Set` versus other implementations.
enum class SetOverride : uint8_t {
  Abseil,
  LLVM,
  LLVMAndCarbonHash,
};

template <typename SetT, SetOverride Override>
struct SetWrapperOverride : SetWrapperImpl<SetT> {};

template <typename KeyT, int MinSmallSize>
struct SetWrapperOverride<Set<KeyT, MinSmallSize>, SetOverride::Abseil>
    : SetWrapperImpl<absl::flat_hash_set<KeyT>> {};

template <typename KeyT, int MinSmallSize>
struct SetWrapperOverride<Set<KeyT, MinSmallSize>, SetOverride::LLVM>
    : SetWrapperImpl<llvm::DenseSet<KeyT>> {};

template <typename KeyT, int MinSmallSize>
struct SetWrapperOverride<Set<KeyT, MinSmallSize>,
                          SetOverride::LLVMAndCarbonHash>
    : SetWrapperImpl<llvm::DenseSet<KeyT, CarbonHashDI<KeyT>>> {};

#ifndef CARBON_SET_BENCH_OVERRIDE
template <typename SetT>
using SetWrapper = SetWrapperImpl<SetT>;
#else
template <typename SetT>
using SetWrapper =
    SetWrapperOverride<SetT, SetOverride::CARBON_SET_BENCH_OVERRIDE>;
#endif
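
// A hedged compile-time illustration of the dispatch above: in a default
// build (no `CARBON_SET_BENCH_OVERRIDE` defined), `SetWrapper` for Carbon's
// `Set` resolves to the Carbon specialization of `SetWrapperImpl`, while a
// build with, for example, `-DCARBON_SET_BENCH_OVERRIDE=Abseil` would make
// the same benchmarks run against `absl::flat_hash_set` instead. The
// assertion is illustrative and only applies to the no-override build.
#ifndef CARBON_SET_BENCH_OVERRIDE
static_assert(std::is_same_v<SetWrapper<Set<int>>, SetWrapperImpl<Set<int>>>,
              "Without an override, `Set` benchmarks use the Carbon wrapper.");
#endif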

// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
#define MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, KT)        \
  BENCHMARK(NAME<Set<KT>>)->Apply(APPLY);                 \
  BENCHMARK(NAME<absl::flat_hash_set<KT>>)->Apply(APPLY); \
  BENCHMARK(NAME<llvm::DenseSet<KT>>)->Apply(APPLY);      \
  BENCHMARK(NAME<llvm::DenseSet<KT, CarbonHashDI<KT>>>)->Apply(APPLY)
// NOLINTEND(bugprone-macro-parentheses)

#define MAP_BENCHMARK_ONE_OP(NAME, APPLY)       \
  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int);  \
  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, int*); \
  MAP_BENCHMARK_ONE_OP_SIZE(NAME, APPLY, llvm::StringRef)

// Benchmark the "latency" of testing for a key in a set. This always tests
// with a key that is found.
//
// However, because the key is always found and because the test ultimately
// involves conditional control flow that can be predicted, we expect modern
// CPUs to perfectly predict the control flow here and turn the measurement
// from one iteration to the next into a throughput measurement rather than a
// real latency measurement.
//
// That said, this does represent a particularly common way in which a set
// data structure is accessed, so the numbers just need to be interpreted as
// more a reflection of reciprocal throughput than of actual latency. See the
// `Lookup` benchmarks for a genuine latency measure with its own caveats.
//
// Even so, this benchmark still shows some interesting caching effects when
// querying large fractions of large tables, and can give a sense of the
// inescapable magnitude of these effects even when there is a great deal of
// prediction and speculative execution to hide memory access latency.
template <typename SetT>
static void BM_SetContainsHitPtr(benchmark::State& state) {
  using SetWrapperT = SetWrapper<SetT>;
  using KT = typename SetWrapperT::KeyT;
  SetWrapperT s;
  auto [keys, lookup_keys] =
      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
  for (auto k : keys) {
    s.BenchInsert(k);
  }

  ssize_t lookup_keys_size = lookup_keys.size();
  while (state.KeepRunningBatch(lookup_keys_size)) {
    for (ssize_t i = 0; i < lookup_keys_size;) {
      // We block optimizing `i` as that has proven both more effective at
      // keeping the loop from being optimized away and less disruptive to the
      // generated code that we're benchmarking.
      benchmark::DoNotOptimize(i);

      bool result = s.BenchContains(lookup_keys[i]);
      CARBON_DCHECK(result);
      // We use the lookup success to step through keys, establishing a
      // dependency between each lookup. This doesn't fully allow us to
      // measure latency rather than throughput, as noted above.
      i += static_cast<ssize_t>(result);
    }
  }
}
MAP_BENCHMARK_ONE_OP(BM_SetContainsHitPtr, HitArgs);
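
// A minimal standalone sketch (a hypothetical helper, not part of the suite)
// of the dependency-chain idiom used in the loop above: the predicate result
// feeds the induction variable, so iteration N+1 cannot compute its index
// until iteration N's result is available, except insofar as branch
// prediction and speculation let the CPU guess it. That escape hatch is why
// the comment above describes the measurement as closer to reciprocal
// throughput. Assumes all `values` are nonnegative, mirroring the always-hit
// keys in the benchmark.
[[maybe_unused]] static auto ExampleDependencyChain(const int* values,
                                                    ssize_t size) -> ssize_t {
  ssize_t steps = 0;
  for (ssize_t i = 0; i < size;) {
    benchmark::DoNotOptimize(i);
    bool advance = values[i] >= 0;  // Always true for nonnegative inputs.
    i += static_cast<ssize_t>(advance);
    ++steps;
  }
  return steps;
}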

// Benchmark the "latency" (but more likely the reciprocal throughput, see the
// comment above) of testing for a key in the set that is *not* present.
template <typename SetT>
static void BM_SetContainsMissPtr(benchmark::State& state) {
  using SetWrapperT = SetWrapper<SetT>;
  using KT = typename SetWrapperT::KeyT;
  SetWrapperT s;
  auto [keys, lookup_keys] = GetKeysAndMissKeys<KT>(state.range(0));
  for (auto k : keys) {
    s.BenchInsert(k);
  }

  ssize_t lookup_keys_size = lookup_keys.size();
  while (state.KeepRunningBatch(lookup_keys_size)) {
    for (ssize_t i = 0; i < lookup_keys_size;) {
      benchmark::DoNotOptimize(i);

      bool result = s.BenchContains(lookup_keys[i]);
      CARBON_DCHECK(!result);
      i += static_cast<ssize_t>(!result);
    }
  }
}
MAP_BENCHMARK_ONE_OP(BM_SetContainsMissPtr, SizeArgs);

// A somewhat contrived latency test for the lookup code path.
//
// While lookups into a set are often (but not always) simply used to
// influence control flow, that style of access produces
// difficult-to-evaluate benchmark results (see the comments on the `Contains`
// benchmarks above).
//
// So here we actually access the key in the set and convert that key's value
// to a boolean on the critical path of each iteration. This lets us have a
// genuine latency benchmark of looking up a key in the set, at the expense of
// being somewhat contrived. That said, for usage where the key object is
// queried or operated on in some way once looked up in the set, this will be
// fairly representative of the latency cost from the data structure.
template <typename SetT>
static void BM_SetLookupHitPtr(benchmark::State& state) {
  using SetWrapperT = SetWrapper<SetT>;
  using KT = typename SetWrapperT::KeyT;
  SetWrapperT s;
  auto [keys, lookup_keys] =
      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
  for (auto k : keys) {
    s.BenchInsert(k);
  }

  ssize_t lookup_keys_size = lookup_keys.size();
  while (state.KeepRunningBatch(lookup_keys_size)) {
    for (ssize_t i = 0; i < lookup_keys_size;) {
      benchmark::DoNotOptimize(i);

      bool result = s.BenchLookup(lookup_keys[i]);
      CARBON_DCHECK(result);
      i += static_cast<ssize_t>(result);
    }
  }
}
MAP_BENCHMARK_ONE_OP(BM_SetLookupHitPtr, HitArgs);
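
// A hedged standalone sketch (a hypothetical helper, not part of the suite)
// of what distinguishes `Lookup` from `Contains` above: here the *loaded
// value* itself feeds the next index, so each iteration's memory access
// latency is serialized into the chain rather than hidden behind a
// predictable branch. Assumes every value is nonzero, mirroring keys that
// always convert to `true`.
[[maybe_unused]] static auto ExampleValueDependencyChain(const int* values,
                                                         ssize_t size)
    -> ssize_t {
  ssize_t sum = 0;
  for (ssize_t i = 0; i < size;) {
    benchmark::DoNotOptimize(i);
    int v = values[i];
    sum += v;
    i += static_cast<ssize_t>(v != 0);  // The load is on the critical path.
  }
  return sum;
}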

// First erase and then insert the key. The code path will always be the same
// here and so we expect this to largely be a throughput benchmark because of
// branch prediction and speculative execution.
//
// We don't expect erase followed by insertion to be a common user code
// sequence, but we don't have a good way of benchmarking either erase or
// insert in isolation -- each would change the size of the table and thus the
// next iteration's benchmark. And if we try to correct the table size outside
// of the timed region, we end up trying to exclude too fine-grained a region
// from the timers to get good measurement data.
//
// Our solution is to benchmark both erase and insertion back to back. We can
// then get a good profile of the code sequence of each, and at least measure
// the sum of their costs reliably. Careful profiling can help attribute that
// cost between erase and insert in order to understand which of the two
// operations is contributing most to any performance artifacts observed.
template <typename SetT>
static void BM_SetEraseInsertHitPtr(benchmark::State& state) {
  using SetWrapperT = SetWrapper<SetT>;
  using KT = typename SetWrapperT::KeyT;
  SetWrapperT s;
  auto [keys, lookup_keys] =
      GetKeysAndHitKeys<KT>(state.range(0), state.range(1));
  for (auto k : keys) {
    s.BenchInsert(k);
  }

  ssize_t lookup_keys_size = lookup_keys.size();
  while (state.KeepRunningBatch(lookup_keys_size)) {
    for (ssize_t i = 0; i < lookup_keys_size;) {
      benchmark::DoNotOptimize(i);

      s.BenchErase(lookup_keys[i]);
      benchmark::ClobberMemory();

      bool inserted = s.BenchInsert(lookup_keys[i]);
      CARBON_DCHECK(inserted);
      i += static_cast<ssize_t>(inserted);
    }
  }
}
MAP_BENCHMARK_ONE_OP(BM_SetEraseInsertHitPtr, HitArgs);
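
// An illustrative (hypothetical) helper showing the paired-operation shape in
// isolation: erasing and re-inserting the same key keeps the table size
// invariant across iterations, which is the property the comment above relies
// on. `ClobberMemory` keeps the compiler from eliding the erase's stores
// before the re-insert.
[[maybe_unused]] static auto ExampleErasePairedWithInsert(
    SetWrapper<Set<int>>& w, int k) -> bool {
  bool erased = w.BenchErase(k);
  benchmark::ClobberMemory();
  bool inserted = w.BenchInsert(k);
  return erased && inserted;
}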

// NOLINTBEGIN(bugprone-macro-parentheses): Parentheses are incorrect here.
#define MAP_BENCHMARK_OP_SEQ_SIZE(NAME, KT)                  \
  BENCHMARK(NAME<Set<KT>>)->Apply(SizeArgs);                 \
  BENCHMARK(NAME<absl::flat_hash_set<KT>>)->Apply(SizeArgs); \
  BENCHMARK(NAME<llvm::DenseSet<KT>>)->Apply(SizeArgs);      \
  BENCHMARK(NAME<llvm::DenseSet<KT, CarbonHashDI<KT>>>)->Apply(SizeArgs)
// NOLINTEND(bugprone-macro-parentheses)

#define MAP_BENCHMARK_OP_SEQ(NAME)       \
  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int);  \
  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, int*); \
  MAP_BENCHMARK_OP_SEQ_SIZE(NAME, llvm::StringRef)

// This is an interesting, somewhat specialized benchmark that measures the
// cost of inserting a sequence of keys into a set up to some size, then
// inserting a colliding key and throwing away the set.
//
// This is an especially important usage pattern for sets as a large number of
// algorithms essentially look like this: collision detection, cycle
// detection, de-duplication, and so on.
//
// It also covers both the insert-into-an-empty-slot code path that isn't
// covered elsewhere, and the code path for growing a table to a larger size.
//
// This is the second most important aspect of expected set usage after
// testing for presence. It also nicely lends itself to a single benchmark
// that covers the total cost of this usage pattern.
//
// Because this benchmark operates on whole sets, we also compute the number
// of probed keys for Carbon's set, as that is both a general reflection of
// the efficacy of the underlying hash function and a direct factor driving
// the cost of these operations.
template <typename SetT>
static void BM_SetInsertSeq(benchmark::State& state) {
  using SetWrapperT = SetWrapper<SetT>;
  using KT = typename SetWrapperT::KeyT;
  constexpr ssize_t LookupKeysSize = 1 << 8;
  // In addition to the keys to insert, this builds a large shuffled
  // collection of keys (with duplicates) used for the final, colliding insert
  // at the end of each iteration.
  auto [keys, lookup_keys] =
      GetKeysAndHitKeys<KT>(state.range(0), LookupKeysSize);

  ssize_t i = 0;
  for (auto _ : state) {
    benchmark::DoNotOptimize(i);

    SetWrapperT s;
    for (auto k : keys) {
      bool inserted = s.BenchInsert(k);
      CARBON_DCHECK(inserted) << "Must be a successful insert!";
    }

    // Now insert a final random repeated key.
    bool inserted = s.BenchInsert(lookup_keys[i]);
    CARBON_DCHECK(!inserted) << "Must already be in the set!";

    // Rotate through the shuffled keys.
    i = (i + static_cast<ssize_t>(!inserted)) & (LookupKeysSize - 1);
  }

  // It can be easier in some cases to think of this as a key-throughput rate
  // of insertion rather than the latency of inserting N keys, so construct
  // the rate counter as well.
  state.counters["KeyRate"] = benchmark::Counter(
      keys.size(), benchmark::Counter::kIsIterationInvariantRate);

  // Report some extra statistics about the Carbon type.
  if constexpr (IsCarbonSet<SetT>) {
    // Re-build a set outside of the timing loop to look at the statistics
    // rather than the timing.
    SetT s;
    for (auto k : keys) {
      bool inserted = s.Insert(k).is_inserted();
      CARBON_DCHECK(inserted) << "Must be a successful insert!";
    }

    // While this count is "iteration invariant" (it should be exactly the
    // same for every iteration as the set of keys is the same), we don't use
    // that flag because it would scale the count by the number of iterations.
    // We want to display the probe count for this benchmark *parameter*, not
    // the probe count accumulated across iterations. That means we use the
    // normal counter API without flags.
    state.counters["Probed"] = s.CountProbedKeys();

    // Uncomment this call to print out statistics about the index-collisions
    // among these keys for debugging:
    //
    // RawHashtable::DumpHashStatistics(raw_keys);
  }
}
MAP_BENCHMARK_OP_SEQ(BM_SetInsertSeq);
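
// A hedged sketch of the user-code shape the benchmark above models (a
// hypothetical helper, not part of the suite): build a set up to some size to
// answer a single question, then throw the whole set away. Collision
// detection, cycle detection, and de-duplication all reduce to roughly this
// loop.
[[maybe_unused]] static auto ExampleHasDuplicate(const int* values,
                                                 ssize_t size) -> bool {
  Set<int> seen;
  for (ssize_t i = 0; i < size; ++i) {
    if (!seen.Insert(values[i]).is_inserted()) {
      return true;  // `Insert` found an existing key: a duplicate.
    }
  }
  return false;  // `seen` is discarded here, as in the benchmark above.
}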

}  // namespace
}  // namespace Carbon