// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>

#include <algorithm>
#include <array>
#include <numeric>

#include "absl/random/random.h"
#include "common/raw_hashtable_metadata_group.h"

namespace Carbon::RawHashtable {

// If we have any SIMD support, create dedicated benchmark utilities for the
// portable and SIMD implementations so we can directly benchmark both.
#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT

// Override the core API with explicit use of the portable API.
class BenchmarkPortableMetadataGroup : public MetadataGroup {
 public:
  explicit BenchmarkPortableMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}

  static auto Load(uint8_t* metadata, ssize_t index)
      -> BenchmarkPortableMetadataGroup {
    return BenchmarkPortableMetadataGroup(PortableLoad(metadata, index));
  }
  auto Store(uint8_t* metadata, ssize_t index) const -> void {
    PortableStore(metadata, index);
  }

  auto ClearDeleted() -> void { PortableClearDeleted(); }

  auto Match(uint8_t present_byte) const -> PortableMatchRange {
    return PortableMatch(present_byte);
  }
  auto MatchPresent() const -> PortableMatchRange {
    return PortableMatchPresent();
  }

  auto MatchEmpty() const -> MatchIndex { return PortableMatchEmpty(); }
  auto MatchDeleted() const -> MatchIndex { return PortableMatchDeleted(); }
};

// Override the core API with explicit use of the SIMD API.
class BenchmarkSimdMetadataGroup : public MetadataGroup {
 public:
  explicit BenchmarkSimdMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}

  static auto Load(uint8_t* metadata, ssize_t index)
      -> BenchmarkSimdMetadataGroup {
    return BenchmarkSimdMetadataGroup(SimdLoad(metadata, index));
  }
  auto Store(uint8_t* metadata, ssize_t index) const -> void {
    SimdStore(metadata, index);
  }

  auto ClearDeleted() -> void { SimdClearDeleted(); }

  auto Match(uint8_t present_byte) const -> SimdMatchRange {
    return SimdMatch(present_byte);
  }
  auto MatchPresent() const -> SimdMatchPresentRange {
    return SimdMatchPresent();
  }

  auto MatchEmpty() const -> MatchIndex { return SimdMatchEmpty(); }
  auto MatchDeleted() const -> MatchIndex { return SimdMatchDeleted(); }
};

#endif
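
// Note that these wrapper classes rely on name hiding rather than virtual
// dispatch: the benchmarks below take the group type as a template parameter,
// so calls like `GroupT::Load` and `g.Match(...)` statically resolve to the
// portable or SIMD variant with no indirection overhead.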

namespace {

// The number of metadata groups we use when benchmarking a particular scenario
// of matching within a group.
constexpr ssize_t BenchSize = 256;
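// Note that the benchmark loops below wrap their group index with
// `& (BenchSize - 1)` masking, so `BenchSize` must remain a power of two.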

#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
using PortableGroup = BenchmarkPortableMetadataGroup;
using SimdGroup = BenchmarkSimdMetadataGroup;
#endif

struct BenchMetadata {
  // The metadata for benchmarking, arranged in `BenchSize` groups, each one
  // `GroupSize` in length. As a consequence, the size of this array will
  // always be `BenchSize * GroupSize`.
  llvm::MutableArrayRef<uint8_t> metadata;

  // For benchmarking random matches in the metadata, each byte here is the
  // tag that should be matched against the corresponding group of the
  // metadata. Because this array parallels the *groups* of the metadata
  // array, its size will be `BenchSize`. For other kinds, this is empty.
  llvm::ArrayRef<uint8_t> bytes;
};

enum class BenchKind : uint8_t {
  Random,
  Empty,
  Deleted,
};

// This routine should only be called once per `BenchKind` as the initializer
// of a global variable below. It returns an `ArrayRef` pointing into
// function-local static storage that provides our benchmark metadata.
//
// The returned array will have exactly `GroupSize` elements, each of
// `BenchMetadata`. For the `BenchMetadata` at index `i`, there will be `i+1`
// matches of that kind within each group of the metadata. This lets us
// benchmark each of the possible match-counts for a group.
template <BenchKind Kind = BenchKind::Random>
static auto BuildBenchMetadata() -> llvm::ArrayRef<BenchMetadata> {
  // We build `GroupSize` elements of `BenchMetadata` below, and so we need
  // `GroupSize` copies of each of these arrays to serve as inputs to it.
  //
  // The first storage is of `BenchSize` groups of metadata.
  static uint8_t metadata_storage[GroupSize][BenchSize * GroupSize];

  // When `Kind` is `Random`, each group above will have a *different* byte
  // that matches in that group. This array stores those bytes for the
  // benchmark to match against the group.
  static uint8_t bytes_storage[GroupSize][BenchSize];

  // The backing storage for the returned `ArrayRef`.
  static BenchMetadata bm_storage[GroupSize];

  absl::BitGen gen;
  for (auto [bm_index, bm] : llvm::enumerate(bm_storage)) {
    int match_count = bm_index + 1;
    for (ssize_t g_index : llvm::seq<ssize_t>(0, BenchSize)) {
      // Start by filling the group with random bytes.
      llvm::MutableArrayRef group_bytes(
          &metadata_storage[bm_index][g_index * GroupSize], GroupSize);
      for (uint8_t& b : group_bytes) {
        b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
      }

      // Now we need `match_count` random indices into the group where we'll
      // put a matching byte.
      std::array<ssize_t, GroupSize> group_indices;
      std::iota(group_indices.begin(), group_indices.end(), 0);
      std::shuffle(group_indices.begin(), group_indices.end(), gen);

      // Now cause the first match index to have the desired value.
      ssize_t match_index = *group_indices.begin();
      uint8_t& match_b = group_bytes[match_index];
      switch (Kind) {
        case BenchKind::Random: {
          // Already a random value, but we need to ensure it isn't one that
          // repeats elsewhere in the group.
          while (llvm::count(group_bytes, match_b) > 1) {
            match_b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
          }
          // Store this as the byte to search for in this group, but without
          // the present bit, to simulate where we start when using a 7-bit
          // tag from a hash.
          bytes_storage[bm_index][g_index] =
              match_b & ~MetadataGroup::PresentMask;
          break;
        }
        case BenchKind::Empty: {
          match_b = MetadataGroup::Empty;
          break;
        }
        case BenchKind::Deleted: {
          match_b = MetadataGroup::Deleted;
          break;
        }
      }

      // Replicate the match byte in each of the other matching indices.
      for (ssize_t m_index : llvm::ArrayRef(group_indices)
                                 .drop_front()
                                 .take_front(match_count - 1)) {
        group_bytes[m_index] = match_b;
      }
    }

    // Now that the storage is set up, record these in our struct.
    bm.metadata = metadata_storage[bm_index];
    if constexpr (Kind == BenchKind::Random) {
      bm.bytes = bytes_storage[bm_index];
    }
  }
  return bm_storage;
}

template <BenchKind Kind>
// NOLINTNEXTLINE(google-readability-casting): False positive clang-tidy bug.
const auto bench_metadata = BuildBenchMetadata<Kind>();
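// Each distinct `BenchKind` used below instantiates its own copy of this
// variable template, so the metadata for every kind is built exactly once
// during static initialization.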

// Benchmark that simulates the dynamic execution pattern when we match exactly
// one entry in the group, typically then using the index of the matching byte
// to index into an element of a group of entries. Notably, the *first* match
// is sufficient, and we never have to find the *next* match within the group.
template <BenchKind Kind, typename GroupT = MetadataGroup>
static void BM_LoadMatch(benchmark::State& s) {
  // NOLINTNEXTLINE(google-readability-casting): Same as on `bench_metadata`.
  BenchMetadata bm = bench_metadata<Kind>[0];

  // We want to make the index used by the next iteration of the benchmark
  // have a data dependency on the result of matching. A match produces an
  // index into the group of metadata. To consume this match in a way that is
  // representative of how it will be used in a hashtable (indexing into an
  // array of entries), while establishing that dependence, we keep a
  // group-sized array of the value `1` in memory that we can index into to
  // increment to the next step of the loop. We do have to hide the contents
  // of the loop from the optimizer by clobbering the memory.
  ssize_t all_ones[GroupSize];
  for (ssize_t& n : all_ones) {
    n = 1;
  }
  benchmark::ClobberMemory();

  // We don't want the optimizer to peel iterations off of this loop, so hide
  // the starting index.
  ssize_t i = 0;
  benchmark::DoNotOptimize(i);

  // This loop looks *really* attractive for the compiler to unroll. However,
  // that can easily overlap some of the memory operations and generally makes
  // it harder to analyze the exact operation sequence we care about.
#pragma clang loop unroll(disable)
  for (auto _ : s) {
    auto g = GroupT::Load(bm.metadata.data(), i * GroupSize);
    typename GroupT::MatchIndex matches;
    if constexpr (Kind == BenchKind::Empty) {
      matches = g.MatchEmpty();
    } else if constexpr (Kind == BenchKind::Deleted) {
      matches = g.MatchDeleted();
    } else {
      static_assert(Kind == BenchKind::Random);
      matches = static_cast<MetadataGroup::MatchIndex>(g.Match(bm.bytes[i]));
    }

    // Despite not being a DCHECK, this is fine for benchmarking. In an actual
    // hashtable, we expect to test whether the match is empty prior to using
    // it to index an array, and that test is expected to be strongly
    // predicted. That exactly matches how the `CARBON_CHECK` macro works, so
    // this serves as both a correctness test and a replication of how a
    // hashtable uses a match.
    CARBON_CHECK(matches);

    // Now do the data-dependent increment by indexing our "all ones" array.
    // The index into `all_ones` is analogous to the index into a group of
    // hashtable entries.
    i = (i + all_ones[matches.index()]) & (BenchSize - 1);
  }
}

BENCHMARK(BM_LoadMatch<BenchKind::Random>);
BENCHMARK(BM_LoadMatch<BenchKind::Empty>);
BENCHMARK(BM_LoadMatch<BenchKind::Deleted>);
#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
BENCHMARK(BM_LoadMatch<BenchKind::Random, PortableGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Empty, PortableGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Deleted, PortableGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Random, SimdGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Empty, SimdGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Deleted, SimdGroup>);
#endif
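
// A subset of these can be run with the benchmark library's standard
// filtering flag, e.g. `--benchmark_filter='BM_LoadMatch.*Random'`; the exact
// build-and-run command depends on the build setup.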

// Benchmark that measures the speed of a match that is only found after at
// least one miss. Because the first matched index is a miss, this covers
// incrementing to the next match, with the number of increments taken from
// the `Steps` template parameter.
template <BenchKind Kind, ssize_t Steps>
static void BM_LoadMatchMissSteps(benchmark::State& s) {
  static_assert(Steps > 0);
  static_assert(Steps <= GroupSize);
  // We pick the benchmark metadata at index `Steps - 1`, which will have
  // `Steps` matches within each group.
  BenchMetadata bm = bench_metadata<Kind>[Steps - 1];

  // We want to make the index used by the next iteration of the benchmark
  // have a data dependency on the result of matching. A match produces an
  // index into the group of metadata. To consume this match in a way that is
  // representative of how it will be used in a hashtable (indexing into an
  // array of entries), while establishing that dependence, we keep a
  // group-sized array of the value `1` in memory that we can index into to
  // increment to the next step of the loop. We do have to hide the contents
  // of the loop from the optimizer by clobbering the memory.
  ssize_t all_ones[GroupSize];
  for (ssize_t& n : all_ones) {
    n = 1;
  }
  benchmark::ClobberMemory();

  // We don't want the optimizer to peel iterations off of this loop, so hide
  // the starting index.
  ssize_t i = 0;
  benchmark::DoNotOptimize(i);

  // This loop looks *really* attractive for the compiler to unroll. However,
  // that can easily overlap some of the memory operations and generally makes
  // it harder to analyze the exact operation sequence we care about.
#pragma clang loop unroll(disable)
  for (auto _ : s) {
    auto g = MetadataGroup::Load(bm.metadata.data(), i * GroupSize);
    auto matched_range = g.Match(bm.bytes[i]);
    // We don't use a `CARBON_CHECK` here as the loop below will test the
    // range to see if the loop should be skipped, replicating the test that
    // we also expect in hashtable usage.

    // We want to simulate the code sequence a hashtable would produce when
    // matching indices are "misses" in the hashtable, but only the aspects
    // that reflect on the specific *match* implementation's generated code
    // and performance. For each index in the match, we locate it in the
    // `matched_range`, extract it as an index, and use that to index a
    // group-sized array. We read memory from that array to increment
    // `indices`, establishing data dependencies on each match index. This
    // loop will run exactly `Steps` times.
    ssize_t indices = 0;
    for (ssize_t index : matched_range) {
      indices += all_ones[index];
    }

    // We want to propagate the data dependencies accumulated into `indices`
    // into the next value of `i`. We know exactly how many increments were
    // done in the loop, so subtract that constant and add one to arrive back
    // at an increment of 1. (Each `all_ones` read adds exactly 1, so
    // `indices == Steps` here.)
    i = (i + (indices - Steps + 1)) & (BenchSize - 1);
  }
}
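
// The 12- and 16-step variants require `GroupSize` to be at least that large
// (see the `static_assert` above), hence the extra guard on the X86 SIMD
// control-group configuration below.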
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 1>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 2>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 4>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 8>);
#if CARBON_USE_X86_SIMD_CONTROL_GROUP
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 12>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 16>);
#endif

}  // namespace
}  // namespace Carbon::RawHashtable