hashing_benchmark.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <benchmark/benchmark.h>
#include <algorithm>
#include <cstddef>
#include <numeric>
#include "absl/hash/hash.h"
#include "absl/random/random.h"
#include "common/hashing.h"
#include "llvm/ADT/Hashing.h"
  11. namespace Carbon {
  12. namespace {
  13. // We want the benchmark working set to fit in the L1 cache where possible so
  14. // that the benchmark focuses on the CPU-execution costs and not memory latency.
  15. // For most CPUs we're going to care about, 16k will fit easily, and 32k will
  16. // probably fit. But we also need to include sizes for string benchmarks. This
  17. // targets 8k of entropy with each object up to 8k of size for a total of 16k.
  18. constexpr int EntropySize = 8 << 10;
  19. constexpr int EntropyObjSize = 8 << 10;
  20. // An array of random entropy with `EntropySize` bytes plus 8k. The goal is that
  21. // clients can read `EntropySize` objects of up to 8k size out of this pool by
  22. // starting at different byte offsets.
  23. static const llvm::ArrayRef<std::byte> entropy_bytes =
  24. []() -> llvm::ArrayRef<std::byte> {
  25. static llvm::SmallVector<std::byte> bytes;
  26. // Pad out the entropy for up to 1kb objects.
  27. bytes.resize(EntropySize + EntropyObjSize);
  28. absl::BitGen gen;
  29. for (std::byte& b : bytes) {
  30. b = static_cast<std::byte>(absl::Uniform<uint8_t>(gen));
  31. }
  32. return bytes;
  33. }();
  34. // Based on 16k of entropy above and an L1 cache size often up to 32k, keep each
  35. // array of sizes small at 8k or 1k 8-byte sizes.
  36. constexpr int NumSizes = 1 << 10;
// Selects an array of `NumSizes` sizes, with each one in the range [0,
// MaxSize). The sizes will be in a random order, but the sum of sizes will
// always be the same.
  40. template <size_t MaxSize>
  41. static const std::array<size_t, NumSizes> rand_sizes = []() {
  42. std::array<size_t, NumSizes> sizes;
  43. // Build an array with a deterministic set of sizes in the
  44. // range [0, MaxSize), using the golden ratio to select well distributed
  45. // points in that range. See https://www.youtube.com/watch?v=lOIP_Z_-0Hs for
  46. // an example of why this is an effective strategy for selecting sizes in the
  47. // range.
  48. static_assert(NumSizes > 128);
  49. constexpr double Phi = 1.61803398875;
  50. constexpr size_t Scale = std::max<size_t>(1, MaxSize / Phi);
  51. for (auto [i, size] : llvm::enumerate(sizes)) {
  52. size = (i * Scale) % MaxSize;
  53. }
  54. // Shuffle the sizes randomly so that there isn't any pattern of sizes
  55. // encountered and we get relatively realistic branch prediction behavior
  56. // when branching on the size. We use this approach rather than random
  57. // sizes to ensure we always have the same total size of data processed.
  58. std::shuffle(sizes.begin(), sizes.end(), absl::BitGen());
  59. return sizes;
  60. }();
  61. // A small helper class to synthesize random values out of our entropy pool.
  62. // This is done in a way that depends on an arbitrary input (`x`) to allow us to
  63. // create a benchmark that measures a *dependent* chain of hashes of these
  64. // values.
  65. //
  66. // `T` needs to be default constructable and reasonable to synthesize an
  67. // instance by copying random bytes into its underlying storage.
  68. //
  69. // This helper class also accumulates the number of bytes of data generated in
  70. // order to let us compute throughput measurements as well as latency
  71. // measurements.
  72. //
  73. // This helper class has the same API as the `RandStrings` helpers below so that
  74. // they can all be used as type parameters to a common benchmark routine below.
  75. template <typename T>
  76. struct RandValues {
  77. size_t bytes = 0;
  78. // Get a random value. We don't need to iterate through sizes so `i` is
  79. // ignored, but we use `x` to select our entropy ensuring a dependency on `x`
  80. // for the benchmark.
  81. auto Get(ssize_t /*i*/, uint64_t x) -> T {
  82. static_assert(sizeof(T) <= EntropyObjSize);
  83. bytes += sizeof(T);
  84. T result;
  85. memcpy(&result, &entropy_bytes[x % EntropySize], sizeof(T));
  86. return result;
  87. }
  88. };
  89. // A specialization to help with building pairs of values.
  90. template <typename T, typename U>
  91. struct RandValues<std::pair<T, U>> {
  92. size_t bytes = 0;
  93. auto Get(ssize_t /*i*/, uint64_t x) -> std::pair<T, U> {
  94. static_assert(sizeof(std::pair<T, U>) <= EntropyObjSize);
  95. bytes += sizeof(std::pair<T, U>);
  96. T result0;
  97. U result1;
  98. memcpy(&result0, &entropy_bytes[x % EntropySize], sizeof(T));
  99. memcpy(&result1, &entropy_bytes[x % EntropySize] + sizeof(T), sizeof(U));
  100. return {result0, result1};
  101. }
  102. };
// A helper class similar to `RandValues`, but for building strings rather than
// values. The string content is pulled from the entropy pool. The size can be
// random from [0, MaxSize), or it can be fixed at `MaxSize`. But the `MaxSize`
// cannot be larger than a single byte sequence pulled from the entropy pool
// (`EntropyObjSize`).
  108. template <bool RandSize, size_t MaxSize>
  109. struct RandStrings {
  110. size_t bytes = 0;
  111. // Get a random string. If the sizes are random, we use `i` to select each
  112. // size and require it to be in the range [0, NumSizes). Otherwise `i` is
  113. // ignored. We always use `x` to select the entropy and establish a dependency
  114. // on the input.
  115. auto Get(ssize_t i, uint64_t x) -> llvm::StringRef {
  116. static_assert(MaxSize <= EntropyObjSize);
  117. size_t s = MaxSize;
  118. if constexpr (RandSize) {
  119. // When using random sizes, we leverage `i` which is guaranteed to range
  120. // from [0, NumSizes).
  121. s = rand_sizes<MaxSize>[i];
  122. } else {
  123. // Prevent `s` from being constant folded when we directly use `MaxSize`.
  124. benchmark::DoNotOptimize(s);
  125. }
  126. bytes += s;
  127. return llvm::StringRef(
  128. reinterpret_cast<const char*>(&entropy_bytes[x % EntropySize]), s);
  129. }
  130. };
  131. struct HashBenchBase {
  132. uint64_t seed;
  133. HashBenchBase() {
  134. // The real-world use case we care about is in a hash table where we'll mix
  135. // in some seed state, likely some ASLR address. To simulate this for
  136. // benchmarking, compute a seed from the address of a stack local variable.
  137. volatile char key;
  138. key = 42;
  139. // Rinse this through a volatile variable as well so returning it isn't
  140. // flagged. The whole point is to escape the address of something on the
  141. // stack.
  142. volatile auto key_addr = reinterpret_cast<uint64_t>(&key);
  143. seed = key_addr;
  144. }
  145. };
  146. struct CarbonHashBench : HashBenchBase {
  147. template <typename T>
  148. auto operator()(const T& value) -> uint64_t {
  149. return static_cast<uint64_t>(HashValue(value, seed));
  150. }
  151. };
  152. struct AbseilHashBench : HashBenchBase {
  153. template <typename T>
  154. auto operator()(const T& value) -> uint64_t {
  155. // Manually seed this with an after-the-fact XOR as there isn't a seeded
  156. // version. This matches what Abseil's hash tables do as well.
  157. return absl::HashOf(value) ^ seed;
  158. }
  159. };
  160. struct LLVMHashBench : HashBenchBase {
  161. template <typename T>
  162. auto operator()(const T& value) -> uint64_t {
  163. // Manually seed this with an after-the-fact XOR as there isn't a seeded
  164. // version.
  165. return llvm::hash_value(value) ^ seed;
  166. }
  167. };
  168. template <typename Values, typename Hasher>
  169. void BM_LatencyHash(benchmark::State& state) {
  170. uint64_t x = 13;
  171. Values v;
  172. Hasher h;
  173. // We run the benchmark in `NumSizes` batches so that when needed we always
  174. // process each of the sizes and we don't randomly end up with a skewed set of
  175. // sizes.
  176. while (state.KeepRunningBatch(NumSizes)) {
  177. for (ssize_t i = 0; i < NumSizes; ++i) {
  178. benchmark::DoNotOptimize(x = h(v.Get(i, x)));
  179. }
  180. }
  181. state.SetBytesProcessed(v.bytes);
  182. }
  183. // Latency benchmarks are grouped by the three different hash functions to
  184. // facilitate comparing their performance for a given value type or string size
  185. // bucket.
// Registers `BM_LatencyHash` over `RandValues<T>` three times, once for each
// of `CarbonHashBench`, `AbseilHashBench` and `LLVMHashBench`, where `T` is
// the macro argument. The macro is variadic so that a type containing commas,
// such as `std::pair<A, B>`, can be passed as a single logical argument. The
// final `BENCHMARK` has no trailing semicolon; each use site supplies it.
#define LATENCY_VALUE_BENCHMARKS(...) \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, CarbonHashBench>); \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, AbseilHashBench>); \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, LLVMHashBench>)
// The value types covered: integers, a pointer, and pairs mixing them.
// NOTE(review): these appear to be listed in non-decreasing object size,
// assuming 64-bit pointers -- confirm before relying on the ordering.
LATENCY_VALUE_BENCHMARKS(uint8_t);
LATENCY_VALUE_BENCHMARKS(uint16_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint8_t, uint8_t>);
LATENCY_VALUE_BENCHMARKS(uint32_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint16_t, uint16_t>);
LATENCY_VALUE_BENCHMARKS(uint64_t);
LATENCY_VALUE_BENCHMARKS(int*);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint64_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, int*>);
LATENCY_VALUE_BENCHMARKS(__uint128_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint64_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, int*>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, int*>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint64_t>);
// Registers `BM_LatencyHash` over randomly sized strings
// (`RandStrings</*RandSize=*/true, MaxSize>`) three times, once for each of
// `CarbonHashBench`, `AbseilHashBench` and `LLVMHashBench`. The final
// `BENCHMARK` has no trailing semicolon; each use site supplies it.
#define LATENCY_STRING_BENCHMARKS(MaxSize) \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, \
                           CarbonHashBench>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, \
                           AbseilHashBench>); \
  BENCHMARK( \
      BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, LLVMHashBench>)
// Size buckets are powers of two from 4 up to 8192, the largest value
// permitted by the `MaxSize <= EntropyObjSize` check in `RandStrings`.
// NOTE(review): 128 is absent from this progression (64 is followed by 256)
// -- confirm whether that omission is intentional.
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/16);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/32);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/64);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/256);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/512);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/1024);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/2048);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4096);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8192);
// We also want to check for size-specific cliffs, particularly in small sizes
// and sizes around implementation inflection points such as powers of two and
// half-way points between powers of two. Because these benchmarks are looking
// for size-related cliffs, all the runs for a particular hash function are
// kept together.
//
// Note: because these use a fixed size, their specific timing isn't terribly
// informative. The branch predictor behavior on a modern CPU will be
// significantly different in these benchmarks from any other and may distort
// all manner of the timings. The results should really only be compared
// between sizes for cliffs, and not directly compared with other numbers.
// Registers `BM_LatencyHash` over fixed-size strings
// (`RandStrings</*RandSize=*/false, N>`) for the hasher `Hash`. The sizes are
// every value from 0 through 9, followed by groups of three adjacent sizes
// centered on 16, 24, 32, 48, 64, 92 and 128. The final `BENCHMARK` has no
// trailing semicolon; each use site supplies it.
#define LATENCY_STRING_SIZE_BENCHMARKS(Hash) \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 0>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 1>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 2>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 3>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 4>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 5>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 6>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 7>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 8>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 9>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 15>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 16>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 17>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 23>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 24>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 25>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 31>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 32>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 33>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 47>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 48>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 49>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 63>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 64>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 65>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 91>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 92>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 93>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 127>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 128>, Hash>); \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 129>, Hash>)
// Because these just look for size-related cliffs in performance, we only do a
// minimal number of benchmarks. There are a lot of sizes so this avoids wasted
// time in benchmark runs and there isn't much value from greater comparative
// coverage here.
//
// Only the Carbon and Abseil hashers are registered for the fixed sizes;
// `LLVMHashBench` is not.
LATENCY_STRING_SIZE_BENCHMARKS(CarbonHashBench);
LATENCY_STRING_SIZE_BENCHMARKS(AbseilHashBench);
  274. } // namespace
  275. } // namespace Carbon