// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <benchmark/benchmark.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <numbers>
#include <utility>

#include "absl/hash/hash.h"
#include "absl/random/random.h"
#include "common/hashing.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
namespace Carbon {
namespace {
  15. // We want the benchmark working set to fit in the L1 cache where possible so
  16. // that the benchmark focuses on the CPU-execution costs and not memory latency.
  17. // For most CPUs we're going to care about, 16k will fit easily, and 32k will
  18. // probably fit. But we also need to include sizes for string benchmarks. This
  19. // targets 8k of entropy with each object up to 8k of size for a total of 16k.
  20. constexpr int EntropySize = 8 << 10;
  21. constexpr int EntropyObjSize = 8 << 10;
  22. // An array of random entropy with `EntropySize` bytes plus 8k. The goal is that
  23. // clients can read `EntropySize` objects of up to 8k size out of this pool by
  24. // starting at different byte offsets.
  25. static const llvm::ArrayRef<std::byte> entropy_bytes =
  26. []() -> llvm::ArrayRef<std::byte> {
  27. static llvm::SmallVector<std::byte> bytes;
  28. // Pad out the entropy for up to 1kb objects.
  29. bytes.resize(EntropySize + EntropyObjSize);
  30. absl::BitGen gen;
  31. for (std::byte& b : bytes) {
  32. b = static_cast<std::byte>(absl::Uniform<uint8_t>(gen));
  33. }
  34. return bytes;
  35. }();
  36. // Based on 16k of entropy above and an L1 cache size often up to 32k, keep each
  37. // array of sizes small at 8k or 1k 8-byte sizes.
  38. constexpr int NumSizes = 1 << 10;
  39. // Selects an array of `NumSizes` sizes, witch each one in the range [0,
  40. // MaxSize). The sizes will be in a random order, but the sum of sizes will
  41. // always be the same.
  42. template <size_t MaxSize>
  43. static const std::array<size_t, NumSizes> rand_sizes = []() {
  44. std::array<size_t, NumSizes> sizes;
  45. // Build an array with a deterministic set of sizes in the
  46. // range [0, MaxSize), using the golden ratio to select well distributed
  47. // points in that range. See https://www.youtube.com/watch?v=lOIP_Z_-0Hs for
  48. // an example of why this is an effective strategy for selecting sizes in the
  49. // range.
  50. static_assert(NumSizes > 128);
  51. constexpr size_t Scale = std::max<size_t>(1, MaxSize / std::numbers::phi);
  52. for (auto [i, size] : llvm::enumerate(sizes)) {
  53. size = (i * Scale) % MaxSize;
  54. }
  55. // Shuffle the sizes randomly so that there isn't any pattern of sizes
  56. // encountered and we get relatively realistic branch prediction behavior
  57. // when branching on the size. We use this approach rather than random
  58. // sizes to ensure we always have the same total size of data processed.
  59. std::shuffle(sizes.begin(), sizes.end(), absl::BitGen());
  60. return sizes;
  61. }();
  62. // A small helper class to synthesize random values out of our entropy pool.
  63. // This is done in a way that depends on an arbitrary input (`x`) to allow us to
  64. // create a benchmark that measures a *dependent* chain of hashes of these
  65. // values.
  66. //
  67. // `T` needs to be default constructable and reasonable to synthesize an
  68. // instance by copying random bytes into its underlying storage.
  69. //
  70. // This helper class also accumulates the number of bytes of data generated in
  71. // order to let us compute throughput measurements as well as latency
  72. // measurements.
  73. //
  74. // This helper class has the same API as the `RandStrings` helpers below so that
  75. // they can all be used as type parameters to a common benchmark routine below.
  76. template <typename T>
  77. struct RandValues {
  78. size_t bytes = 0;
  79. // Get a random value. We don't need to iterate through sizes so `i` is
  80. // ignored, but we use `x` to select our entropy ensuring a dependency on `x`
  81. // for the benchmark.
  82. auto Get(ssize_t /*i*/, uint64_t x) -> T {
  83. static_assert(sizeof(T) <= EntropyObjSize);
  84. bytes += sizeof(T);
  85. T result;
  86. // Clang Tidy complains about this `memcpy` despite this being the canonical
  87. // formulation. Removing the type `T` would also remove warnings for getting
  88. // the size incorrect.
  89. // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
  90. memcpy(&result, &entropy_bytes[x % EntropySize], sizeof(T));
  91. return result;
  92. }
  93. };
  94. // A specialization to help with building pairs of values.
  95. template <typename T, typename U>
  96. struct RandValues<std::pair<T, U>> {
  97. size_t bytes = 0;
  98. auto Get(ssize_t /*i*/, uint64_t x) -> std::pair<T, U> {
  99. static_assert(sizeof(std::pair<T, U>) <= EntropyObjSize);
  100. bytes += sizeof(std::pair<T, U>);
  101. T result0;
  102. U result1;
  103. // Clang Tidy complains about this `memcpy` despite this being the canonical
  104. // formulation. Removing the type `T` would also remove warnings for getting
  105. // the size incorrect.
  106. // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
  107. memcpy(&result0, &entropy_bytes[x % EntropySize], sizeof(T));
  108. // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
  109. memcpy(&result1, &entropy_bytes[x % EntropySize] + sizeof(T), sizeof(U));
  110. return {result0, result1};
  111. }
  112. };
  113. // A helper class similar to `RandValues`, but for building strings rather than
  114. // values. The string content is pulled from the entropy pool. The size can be
  115. // random from [0, MaxSize], or it can be fixed at `MaxSize`. But the `MaxSize`
  116. // cannot be larger than a single byte sequence pulled from the entropy pool
  117. // (`EntropyObjSize`).
  118. template <bool RandSize, size_t MaxSize>
  119. struct RandStrings {
  120. size_t bytes = 0;
  121. // Get a random string. If the sizes are random, we use `i` to select each
  122. // size and require it to be in the range [0, NumSizes). Otherwise `i` is
  123. // ignored. We always use `x` to select the entropy and establish a dependency
  124. // on the input.
  125. auto Get(ssize_t i, uint64_t x) -> llvm::StringRef {
  126. static_assert(MaxSize <= EntropyObjSize);
  127. size_t s = MaxSize;
  128. if constexpr (RandSize) {
  129. // When using random sizes, we leverage `i` which is guaranteed to range
  130. // from [0, NumSizes).
  131. s = rand_sizes<MaxSize>[i];
  132. } else {
  133. // Prevent `s` from being constant folded when we directly use `MaxSize`.
  134. benchmark::DoNotOptimize(s);
  135. }
  136. bytes += s;
  137. return llvm::StringRef(
  138. reinterpret_cast<const char*>(&entropy_bytes[x % EntropySize]), s);
  139. }
  140. };
  141. struct HashBenchBase {
  142. uint64_t seed;
  143. HashBenchBase() {
  144. // The real-world use case we care about is in a hash table where we'll mix
  145. // in some seed state, likely some ASLR address. To simulate this for
  146. // benchmarking, compute a seed from the address of a stack local variable.
  147. volatile char key;
  148. key = 42;
  149. // Rinse this through a volatile variable as well so returning it isn't
  150. // flagged. The whole point is to escape the address of something on the
  151. // stack.
  152. volatile auto key_addr = reinterpret_cast<uint64_t>(&key);
  153. seed = key_addr;
  154. }
  155. };
  156. struct CarbonHashBench : HashBenchBase {
  157. template <typename T>
  158. auto operator()(const T& value) -> uint64_t {
  159. return static_cast<uint64_t>(HashValue(value, seed));
  160. }
  161. };
  162. struct AbseilHashBench : HashBenchBase {
  163. template <typename T>
  164. auto operator()(const T& value) -> uint64_t {
  165. // Manually seed this with an after-the-fact XOR as there isn't a seeded
  166. // version. This matches what Abseil's hash tables do as well.
  167. return absl::HashOf(value) ^ seed;
  168. }
  169. };
  170. struct LLVMHashBench : HashBenchBase {
  171. template <typename T>
  172. auto operator()(const T& value) -> uint64_t {
  173. // Manually seed this with an after-the-fact XOR as there isn't a seeded
  174. // version.
  175. return llvm::hash_value(value) ^ seed;
  176. }
  177. };
  178. template <typename Values, typename Hasher>
  179. auto BM_LatencyHash(benchmark::State& state) -> void {
  180. uint64_t x = 13;
  181. Values v;
  182. Hasher h;
  183. // We run the benchmark in `NumSizes` batches so that when needed we always
  184. // process each of the sizes and we don't randomly end up with a skewed set of
  185. // sizes.
  186. while (state.KeepRunningBatch(NumSizes)) {
  187. for (ssize_t i = 0; i < NumSizes; ++i) {
  188. benchmark::DoNotOptimize(x = h(v.Get(i, x)));
  189. }
  190. }
  191. state.SetBytesProcessed(v.bytes);
  192. }
  193. // Latency benchmarks are grouped by the three different hash functions to
  194. // facilitate comparing their performance for a given value type or string size
  195. // bucket.
  196. #define LATENCY_VALUE_BENCHMARKS(...) \
  197. BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, CarbonHashBench>); \
  198. BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, AbseilHashBench>); \
  199. BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, LLVMHashBench>)
  200. LATENCY_VALUE_BENCHMARKS(uint8_t);
  201. LATENCY_VALUE_BENCHMARKS(uint16_t);
  202. LATENCY_VALUE_BENCHMARKS(std::pair<uint8_t, uint8_t>);
  203. LATENCY_VALUE_BENCHMARKS(uint32_t);
  204. LATENCY_VALUE_BENCHMARKS(std::pair<uint16_t, uint16_t>);
  205. LATENCY_VALUE_BENCHMARKS(uint64_t);
  206. LATENCY_VALUE_BENCHMARKS(int*);
  207. LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint32_t>);
  208. LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint32_t>);
  209. LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint64_t>);
  210. LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint32_t>);
  211. LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, int*>);
  212. LATENCY_VALUE_BENCHMARKS(__uint128_t);
  213. LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint64_t>);
  214. LATENCY_VALUE_BENCHMARKS(std::pair<int*, int*>);
  215. LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, int*>);
  216. LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint64_t>);
  217. #define LATENCY_STRING_BENCHMARKS(MaxSize) \
  218. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, \
  219. CarbonHashBench>); \
  220. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, \
  221. AbseilHashBench>); \
  222. BENCHMARK( \
  223. BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, LLVMHashBench>)
  224. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4);
  225. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8);
  226. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/16);
  227. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/32);
  228. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/64);
  229. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/256);
  230. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/512);
  231. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/1024);
  232. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/2048);
  233. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4096);
  234. LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8192);
  235. // We also want to check for size-specific cliffs, particularly in small sizes
  236. // and sizes around implementation inflection points such as powers of two and
  237. // half-way points between powers of two. Because these benchmarks are looking
  238. // for size-related cliffs, all the runs for particular hash function are kept
  239. // together.
  240. //
  241. // Note: because these use a fixed size, their specific timing isn't terribly
  242. // informative. The branch predictor behavior on a modern CPU will be
  243. // significantly different in this benchmarks from any other and may distort all
  244. // manner of the timings. The results should really only be compared between
  245. // sizes for cliffs, and not directly compared with other numbers.
  246. #define LATENCY_STRING_SIZE_BENCHMARKS(Hash) \
  247. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 0>, Hash>); \
  248. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 1>, Hash>); \
  249. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 2>, Hash>); \
  250. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 3>, Hash>); \
  251. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 4>, Hash>); \
  252. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 5>, Hash>); \
  253. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 6>, Hash>); \
  254. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 7>, Hash>); \
  255. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 8>, Hash>); \
  256. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 9>, Hash>); \
  257. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 15>, Hash>); \
  258. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 16>, Hash>); \
  259. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 17>, Hash>); \
  260. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 23>, Hash>); \
  261. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 24>, Hash>); \
  262. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 25>, Hash>); \
  263. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 31>, Hash>); \
  264. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 32>, Hash>); \
  265. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 33>, Hash>); \
  266. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 47>, Hash>); \
  267. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 48>, Hash>); \
  268. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 49>, Hash>); \
  269. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 63>, Hash>); \
  270. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 64>, Hash>); \
  271. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 65>, Hash>); \
  272. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 91>, Hash>); \
  273. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 92>, Hash>); \
  274. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 93>, Hash>); \
  275. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 127>, Hash>); \
  276. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 128>, Hash>); \
  277. BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 129>, Hash>)
  278. // Because these just look for size-related cliffs in performance, we only do a
  279. // minimal number of benchmarks. There are a lot of sizes so this avoids wasted
  280. // time in benchmark runs and there isn't much value from greater comparative
  281. // coverage here.
  282. LATENCY_STRING_SIZE_BENCHMARKS(CarbonHashBench);
  283. LATENCY_STRING_SIZE_BENCHMARKS(AbseilHashBench);
}  // namespace
}  // namespace Carbon