hashing_benchmark.cpp

// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <numbers>
#include <utility>

#include "absl/hash/hash.h"
#include "absl/random/random.h"
#include "common/hashing.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"

namespace Carbon {
namespace {

// We want the benchmark working set to fit in the L1 cache where possible so
// that the benchmark focuses on the CPU-execution costs and not memory
// latency. For most CPUs we care about, 16k will fit easily and 32k will
// probably fit. But we also need to cover the sizes used by the string
// benchmarks, so this targets 8k of entropy with each object up to 8k in
// size, for a 16k total.
constexpr int EntropySize = 8 << 10;
constexpr int EntropyObjSize = 8 << 10;

// An array of random entropy with `EntropySize` bytes plus `EntropyObjSize`
// bytes of padding. The goal is that clients can read objects of up to
// `EntropyObjSize` bytes out of this pool starting at any of `EntropySize`
// different byte offsets.
static const llvm::ArrayRef<std::byte> entropy_bytes =
    []() -> llvm::ArrayRef<std::byte> {
  static llvm::SmallVector<std::byte> bytes;
  // Pad out the entropy so objects of up to `EntropyObjSize` bytes can start
  // at any of the `EntropySize` offsets.
  bytes.resize(EntropySize + EntropyObjSize);
  absl::BitGen gen;
  for (std::byte& b : bytes) {
    b = static_cast<std::byte>(absl::Uniform<uint8_t>(gen));
  }
  return bytes;
}();

// Based on the 16k of entropy above and an L1 cache size often up to 32k,
// keep each array of sizes small: 1k entries of 8-byte sizes, or 8k in total.
constexpr int NumSizes = 1 << 10;

// Builds an array of `NumSizes` sizes, with each one in the range
// [0, MaxSize). The sizes will be in a random order, but the sum of the sizes
// will always be the same.
template <size_t MaxSize>
static const std::array<size_t, NumSizes> rand_sizes = []() {
  std::array<size_t, NumSizes> sizes;
  // Build an array with a deterministic set of sizes in the range
  // [0, MaxSize), using the golden ratio to select well distributed points in
  // that range. See https://www.youtube.com/watch?v=lOIP_Z_-0Hs for an
  // example of why this is an effective strategy for selecting sizes in the
  // range.
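  //
  // As a concrete illustration (not part of the benchmark itself): with
  // MaxSize = 16, Scale = max(1, 16 / 1.618...) = 9, and since gcd(9, 16) = 1
  // the sequence (i * 9) % 16 = 0, 9, 2, 11, 4, 13, ... visits every value in
  // [0, 16) before repeating, spreading the sizes evenly across the range.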
  static_assert(NumSizes > 128);
  constexpr size_t Scale = std::max<size_t>(1, MaxSize / std::numbers::phi);
  for (auto [i, size] : llvm::enumerate(sizes)) {
    size = (i * Scale) % MaxSize;
  }
  // Shuffle the sizes randomly so that there isn't any pattern of sizes
  // encountered and we get relatively realistic branch prediction behavior
  // when branching on the size. We use this approach rather than random
  // sizes to ensure we always have the same total size of data processed.
  std::shuffle(sizes.begin(), sizes.end(), absl::BitGen());
  return sizes;
}();

// A small helper class to synthesize random values out of our entropy pool.
// This is done in a way that depends on an arbitrary input (`x`) to allow us
// to create a benchmark that measures a *dependent* chain of hashes of these
// values.
//
// `T` needs to be default constructible and reasonable to synthesize an
// instance of by copying random bytes into its underlying storage.
//
// This helper class also accumulates the number of bytes of data generated in
// order to let us compute throughput measurements as well as latency
// measurements.
//
// This helper class has the same API as the `RandStrings` helper below so
// that they can all be used as type parameters to a common benchmark routine
// below.
template <typename T>
struct RandValues {
  size_t bytes = 0;

  // Get a random value. We don't need to iterate through sizes so `i` is
  // ignored, but we use `x` to select our entropy ensuring a dependency on
  // `x` for the benchmark.
  auto Get(ssize_t /*i*/, uint64_t x) -> T {
    static_assert(sizeof(T) <= EntropyObjSize);
    bytes += sizeof(T);
    T result;
    // Clang Tidy complains about this `memcpy` despite this being the
    // canonical formulation. Removing the type `T` would also remove warnings
    // for getting the size incorrect.
    // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
    memcpy(&result, &entropy_bytes[x % EntropySize], sizeof(T));
    return result;
  }
};
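
// A minimal usage sketch (hypothetical values, not part of the benchmark):
//
//   RandValues<uint64_t> v;
//   uint64_t value = v.Get(/*i=*/0, /*x=*/13);  // Copies 8 random bytes.
//   // `v.bytes` is now 8, and `value` depends on the input `x`.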

// A specialization to help with building pairs of values.
template <typename T, typename U>
struct RandValues<std::pair<T, U>> {
  size_t bytes = 0;

  auto Get(ssize_t /*i*/, uint64_t x) -> std::pair<T, U> {
    static_assert(sizeof(std::pair<T, U>) <= EntropyObjSize);
    bytes += sizeof(std::pair<T, U>);
    T result0;
    U result1;
    // Clang Tidy complains about these `memcpy` calls despite this being the
    // canonical formulation. Removing the types would also remove warnings
    // for getting the sizes incorrect.
    // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
    memcpy(&result0, &entropy_bytes[x % EntropySize], sizeof(T));
    // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
    memcpy(&result1, &entropy_bytes[x % EntropySize] + sizeof(T), sizeof(U));
    return {result0, result1};
  }
};

// A helper class similar to `RandValues`, but for building strings rather
// than values. The string content is pulled from the entropy pool. The size
// can be random in the range [0, MaxSize), or it can be fixed at `MaxSize`.
// But `MaxSize` cannot be larger than a single byte sequence pulled from the
// entropy pool (`EntropyObjSize`).
template <bool RandSize, size_t MaxSize>
struct RandStrings {
  size_t bytes = 0;

  // Get a random string. If the sizes are random, we use `i` to select each
  // size and require it to be in the range [0, NumSizes). Otherwise `i` is
  // ignored. We always use `x` to select the entropy and establish a
  // dependency on the input.
  auto Get(ssize_t i, uint64_t x) -> llvm::StringRef {
    static_assert(MaxSize <= EntropyObjSize);
    size_t s = MaxSize;
    if constexpr (RandSize) {
      // When using random sizes, we leverage `i`, which is guaranteed to be
      // in the range [0, NumSizes).
      s = rand_sizes<MaxSize>[i];
    } else {
      // Prevent `s` from being constant folded when we directly use
      // `MaxSize`.
      benchmark::DoNotOptimize(s);
    }
    bytes += s;
    return llvm::StringRef(
        reinterpret_cast<const char*>(&entropy_bytes[x % EntropySize]), s);
  }
};
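
// A minimal usage sketch (hypothetical values, not part of the benchmark):
//
//   RandStrings</*RandSize=*/true, /*MaxSize=*/64> strings;
//   llvm::StringRef s = strings.Get(/*i=*/0, /*x=*/13);
//   // `s.size()` is `rand_sizes<64>[0]`, and `s` points directly into the
//   // entropy pool; no string data is copied.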

struct HashBenchBase {
  uint64_t seed;

  HashBenchBase() {
    // The real-world use case we care about is in a hash table where we'll
    // mix in some seed state, likely some ASLR address. To simulate this for
    // benchmarking, compute a seed from the address of a stack local
    // variable.
    volatile char key;
    key = 42;
    // Rinse this through a volatile variable as well so returning it isn't
    // flagged. The whole point is to escape the address of something on the
    // stack.
    volatile auto key_addr = reinterpret_cast<uint64_t>(&key);
    seed = key_addr;
  }
};

struct CarbonHashBench : HashBenchBase {
  template <typename T>
  auto operator()(const T& value) -> uint64_t {
    return static_cast<uint64_t>(HashValue(value, seed));
  }
};

struct AbseilHashBench : HashBenchBase {
  template <typename T>
  auto operator()(const T& value) -> uint64_t {
    // Manually seed this with an after-the-fact XOR as there isn't a seeded
    // version. This matches what Abseil's hash tables do as well.
    return absl::HashOf(value) ^ seed;
  }
};

struct LLVMHashBench : HashBenchBase {
  template <typename T>
  auto operator()(const T& value) -> uint64_t {
    // Manually seed this with an after-the-fact XOR as there isn't a seeded
    // version.
    return llvm::hash_value(value) ^ seed;
  }
};

template <typename Values, typename Hasher>
void BM_LatencyHash(benchmark::State& state) {
  uint64_t x = 13;
  Values v;
  Hasher h;
  // We run the benchmark in `NumSizes` batches so that when needed we always
  // process each of the sizes and we don't randomly end up with a skewed set
  // of sizes.
  while (state.KeepRunningBatch(NumSizes)) {
    for (ssize_t i = 0; i < NumSizes; ++i) {
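      // Feeding the previous hash result back in as `x` serializes the
      // iterations: each `Get` depends on the prior hash, so the measured
      // time approximates per-hash latency rather than pipelined throughput.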
      benchmark::DoNotOptimize(x = h(v.Get(i, x)));
    }
  }
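  // Reporting the total bytes hashed lets the benchmark framework compute and
  // print a throughput figure (bytes per second) alongside the timings.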
  state.SetBytesProcessed(v.bytes);
}

// Latency benchmarks are grouped by the three different hash functions to
// facilitate comparing their performance for a given value type or string
// size bucket.
#define LATENCY_VALUE_BENCHMARKS(...)                                  \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, CarbonHashBench>); \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, AbseilHashBench>); \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, LLVMHashBench>)

LATENCY_VALUE_BENCHMARKS(uint8_t);
LATENCY_VALUE_BENCHMARKS(uint16_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint8_t, uint8_t>);
LATENCY_VALUE_BENCHMARKS(uint32_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint16_t, uint16_t>);
LATENCY_VALUE_BENCHMARKS(uint64_t);
LATENCY_VALUE_BENCHMARKS(int*);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint64_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, int*>);
LATENCY_VALUE_BENCHMARKS(__uint128_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint64_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, int*>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, int*>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint64_t>);

#define LATENCY_STRING_BENCHMARKS(MaxSize)                            \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>,   \
                           CarbonHashBench>);                         \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>,   \
                           AbseilHashBench>);                         \
  BENCHMARK(                                                          \
      BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, LLVMHashBench>)

LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/16);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/32);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/64);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/256);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/512);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/1024);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/2048);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4096);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8192);

// We also want to check for size-specific cliffs, particularly at small sizes
// and sizes around implementation inflection points such as powers of two and
// half-way points between powers of two. Because these benchmarks are looking
// for size-related cliffs, all the runs for a particular hash function are
// kept together.
//
// Note: because these use a fixed size, their specific timing isn't terribly
// informative. The branch predictor behavior on a modern CPU will be
// significantly different in these benchmarks from any other, and may distort
// the timings in all manner of ways. The results should really only be
// compared between sizes to find cliffs, and not directly compared with other
// numbers.
#define LATENCY_STRING_SIZE_BENCHMARKS(Hash)                             \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 0>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 1>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 2>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 3>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 4>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 5>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 6>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 7>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 8>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 9>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 15>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 16>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 17>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 23>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 24>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 25>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 31>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 32>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 33>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 47>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 48>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 49>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 63>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 64>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 65>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 91>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 92>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 93>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 127>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 128>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 129>, Hash>)

// Because these just look for size-related cliffs in performance, we only run
// a minimal number of benchmarks: there are a lot of sizes, so this avoids
// wasted time in benchmark runs, and there isn't much value in greater
// comparative coverage here.
LATENCY_STRING_SIZE_BENCHMARKS(CarbonHashBench);
LATENCY_STRING_SIZE_BENCHMARKS(AbseilHashBench);

}  // namespace
}  // namespace Carbon