raw_hashtable_metadata_group_benchmark.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <benchmark/benchmark.h>

#include <algorithm>
#include <array>
#include <numeric>

#include "absl/random/random.h"

#include "common/raw_hashtable_metadata_group.h"
  8. namespace Carbon::RawHashtable {
  9. // If we have any SIMD support, create dedicated benchmark utilities for the
  10. // portable and SIMD implementation so we can directly benchmark both.
  11. #if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
  12. // Override the core API with explicit use of the portable API.
  13. class BenchmarkPortableMetadataGroup : public MetadataGroup {
  14. public:
  15. explicit BenchmarkPortableMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}
  16. static auto Load(uint8_t* metadata, ssize_t index)
  17. -> BenchmarkPortableMetadataGroup {
  18. return BenchmarkPortableMetadataGroup(PortableLoad(metadata, index));
  19. }
  20. auto Store(uint8_t* metadata, ssize_t index) const -> void {
  21. PortableStore(metadata, index);
  22. }
  23. auto ClearDeleted() -> void { PortableClearDeleted(); }
  24. auto Match(uint8_t present_byte) const -> MatchRange {
  25. return PortableMatch(present_byte);
  26. }
  27. auto MatchPresent() const -> MatchRange { return PortableMatchPresent(); }
  28. auto MatchEmpty() const -> MatchIndex { return PortableMatchEmpty(); }
  29. auto MatchDeleted() const -> MatchIndex { return PortableMatchDeleted(); }
  30. };
  31. // Override the core API with explicit use of the SIMD API.
  32. class BenchmarkSIMDMetadataGroup : public MetadataGroup {
  33. public:
  34. explicit BenchmarkSIMDMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}
  35. static auto Load(uint8_t* metadata, ssize_t index)
  36. -> BenchmarkSIMDMetadataGroup {
  37. return BenchmarkSIMDMetadataGroup(SIMDLoad(metadata, index));
  38. }
  39. auto Store(uint8_t* metadata, ssize_t index) const -> void {
  40. SIMDStore(metadata, index);
  41. }
  42. auto ClearDeleted() -> void { SIMDClearDeleted(); }
  43. auto Match(uint8_t present_byte) const -> MatchRange {
  44. return SIMDMatch(present_byte);
  45. }
  46. auto MatchPresent() const -> MatchRange { return SIMDMatchPresent(); }
  47. auto MatchEmpty() const -> MatchIndex { return SIMDMatchEmpty(); }
  48. auto MatchDeleted() const -> MatchIndex { return SIMDMatchDeleted(); }
  49. };
  50. #endif
  51. namespace {
  52. // The number of metadata groups we use when benchmarking a particular scenario
  53. // of matching within a group.
  54. constexpr ssize_t BenchSize = 256;
  55. #if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
  56. using PortableGroup = BenchmarkPortableMetadataGroup;
  57. using SIMDGroup = BenchmarkSIMDMetadataGroup;
  58. #endif
  59. struct BenchMetadata {
  60. // The metadata for benchmarking, arranged in `BenchSize` groups, each one
  61. // `GroupSize` in length. As a consequence, the size of this array will always
  62. // be `BenchSize * GroupSize`.
  63. llvm::MutableArrayRef<uint8_t> metadata;
  64. // For benchmarking random matches in the metadata, each byte here is the tag
  65. // that should be matched against the corresponding group of the metadata.
  66. // Because this array parallels the *groups* of the metadata array, its size
  67. // will be `BenchSize`. For other kinds, this is empty.
  68. llvm::ArrayRef<uint8_t> bytes;
  69. };
  70. enum class BenchKind : uint8_t {
  71. Random,
  72. Empty,
  73. Deleted,
  74. };
  75. // This routine should only be called once per `BenchKind` as the initializer of
  76. // a global variable below. It returns an `ArrayRef` pointing into
  77. // function-local static storage that provides our benchmark metadata.
  78. //
  79. // The returned array will have exactly `GroupSize` elements, each of
  80. // `BenchMetadata`. For the `BenchMetadata` at index `i`, there will be `i+1`
  81. // matches of that kind within each group of the metadata. This lets us
  82. // benchmark each of the possible match-counts for a group.
  83. template <BenchKind Kind = BenchKind::Random>
  84. static auto BuildBenchMetadata() -> llvm::ArrayRef<BenchMetadata> {
  85. // We build `GroupSize` elements of `BenchMetadata` below, and so we need
  86. // `GroupSize` copies of each of these arrays to serve as inputs to it.
  87. //
  88. // The first storage is of `BenchSize` groups of metadata.
  89. static uint8_t metadata_storage[GroupSize][BenchSize * GroupSize];
  90. // When `Kind` is `Random`, each group above will have a *different* byte that
  91. // matches in that group. This array stores those bytes for the benchmark to
  92. // match against the group.
  93. static uint8_t bytes_storage[GroupSize][BenchSize];
  94. // The backing storage for the returned `ArrayRef`.
  95. static BenchMetadata bm_storage[GroupSize];
  96. absl::BitGen gen;
  97. for (auto [bm_index, bm] : llvm::enumerate(bm_storage)) {
  98. int match_count = bm_index + 1;
  99. for (ssize_t g_index : llvm::seq<ssize_t>(0, BenchSize)) {
  100. // Start by filling the group with random bytes.
  101. auto group_bytes = llvm::MutableArrayRef(
  102. &metadata_storage[bm_index][g_index * GroupSize], GroupSize);
  103. for (uint8_t& b : group_bytes) {
  104. b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
  105. }
  106. // Now we need up to `match_count` random indices into the group where
  107. // we'll put a matching byte.
  108. std::array<ssize_t, GroupSize> group_indices;
  109. std::iota(group_indices.begin(), group_indices.end(), 0);
  110. std::shuffle(group_indices.begin(), group_indices.end(), gen);
  111. // Now cause the first match index to have the desired value.
  112. ssize_t match_index = *group_indices.begin();
  113. uint8_t& match_b = group_bytes[match_index];
  114. switch (Kind) {
  115. case BenchKind::Random: {
  116. // Already a random value, but we need to ensure it isn't one that
  117. // repeats elsewhere in the group.
  118. while (llvm::count(group_bytes, match_b) > 1) {
  119. match_b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
  120. }
  121. // Store this as the byte to search for in this group, but without the
  122. // present bit to simulate where we start when using a 7-bit tag
  123. // from a hash.
  124. bytes_storage[bm_index][g_index] =
  125. match_b & ~MetadataGroup::PresentMask;
  126. break;
  127. }
  128. case BenchKind::Empty: {
  129. match_b = MetadataGroup::Empty;
  130. break;
  131. }
  132. case BenchKind::Deleted: {
  133. match_b = MetadataGroup::Deleted;
  134. break;
  135. }
  136. }
  137. // Replicate the match byte in each of the other matching indices.
  138. for (ssize_t m_index : llvm::ArrayRef(group_indices)
  139. .drop_front()
  140. .take_front(match_count - 1)) {
  141. group_bytes[m_index] = match_b;
  142. }
  143. }
  144. // Now that the storage is set up, record these in our struct.
  145. bm.metadata = metadata_storage[bm_index];
  146. if constexpr (Kind == BenchKind::Random) {
  147. bm.bytes = bytes_storage[bm_index];
  148. }
  149. }
  150. return bm_storage;
  151. }
  152. template <BenchKind Kind>
  153. // NOLINTNEXTLINE(google-readability-casting): False positive clang-tidy bug.
  154. const auto bench_metadata = BuildBenchMetadata<Kind>();
  155. // Benchmark that simulates the dynamic execution pattern when we match exactly
  156. // one entry in the group, typically then using the index of the matching byte
  157. // to index into an element of a group of entries. But notably, the *first*
  158. // match is sufficient, and we never have to find the *next* match within the
  159. // group.
  160. template <BenchKind Kind, typename GroupT = MetadataGroup>
  161. static void BM_LoadMatch(benchmark::State& s) {
  162. BenchMetadata bm = bench_metadata<Kind>[0];
  163. // We want to make the index used by the next iteration of the benchmark have
  164. // a data dependency on the result of matching. A match produces an index into
  165. // the group of metadata. To consume this match in a way that is
  166. // representative of how it will be used in a hashtable (indexing into an
  167. // array of entries), while establishing that dependence, we keep a
  168. // group-sized array of the value `1` in memory that we can index into to
  169. // increment to the next step of the loop. We do have to hide the contents of
  170. // the loop from the optimizer by clobbering the memory.
  171. ssize_t all_ones[GroupSize];
  172. for (ssize_t& n : all_ones) {
  173. n = 1;
  174. }
  175. benchmark::ClobberMemory();
  176. // We don't want the optimizer to peel iterations off of this loop, so hide
  177. // the starting index.
  178. ssize_t i = 0;
  179. benchmark::DoNotOptimize(i);
  180. // This loop looks *really* attractive to unroll to the compiler. However,
  181. // that can easily overlap some of the memory operations and generally makes
  182. // it harder to analyze the exact operation sequence we care about.
  183. #pragma clang loop unroll(disable)
  184. for (auto _ : s) {
  185. auto g = GroupT::Load(bm.metadata.data(), i * GroupSize);
  186. typename GroupT::MatchIndex matches;
  187. if constexpr (Kind == BenchKind::Empty) {
  188. matches = g.MatchEmpty();
  189. } else if constexpr (Kind == BenchKind::Deleted) {
  190. matches = g.MatchDeleted();
  191. } else {
  192. static_assert(Kind == BenchKind::Random);
  193. matches = static_cast<MetadataGroup::MatchIndex>(g.Match(bm.bytes[i]));
  194. }
  195. // Despite not being a DCHECK, this is fine for benchmarking. In an actual
  196. // hashtable, we expect to have a test for empty of the match prior to using
  197. // it to index an array, and that test is expected to be strongly predicted.
  198. // That exactly matches how the `CARBON_CHECK` macro works, and so this
  199. // serves as both a good correctness test and replication of hashtable usage
  200. // of a match.
  201. CARBON_CHECK(matches);
  202. // Now do the data-dependent increment by indexing our "all ones" array. The
  203. // index into `all_ones` is analogous to the index into a group of hashtable
  204. // entries.
  205. i = (i + all_ones[matches.index()]) & (BenchSize - 1);
  206. }
  207. }
  208. BENCHMARK(BM_LoadMatch<BenchKind::Random>);
  209. BENCHMARK(BM_LoadMatch<BenchKind::Empty>);
  210. BENCHMARK(BM_LoadMatch<BenchKind::Deleted>);
  211. #if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
  212. BENCHMARK(BM_LoadMatch<BenchKind::Random, PortableGroup>);
  213. BENCHMARK(BM_LoadMatch<BenchKind::Empty, PortableGroup>);
  214. BENCHMARK(BM_LoadMatch<BenchKind::Deleted, PortableGroup>);
  215. BENCHMARK(BM_LoadMatch<BenchKind::Random, SIMDGroup>);
  216. BENCHMARK(BM_LoadMatch<BenchKind::Empty, SIMDGroup>);
  217. BENCHMARK(BM_LoadMatch<BenchKind::Deleted, SIMDGroup>);
  218. #endif
  219. // Benchmark that measures the speed of a match that is only found after at
  220. // least one miss. Because the first match doesn't work, this covers
  221. // incrementing to the next match, with a number of increments taken from the
  222. // `Step` template parameter.
  223. template <BenchKind Kind, ssize_t Steps>
  224. static void BM_LoadMatchMissSteps(benchmark::State& s) {
  225. static_assert(Steps > 0);
  226. static_assert(Steps <= GroupSize);
  227. // We pick the benchmark metadata at index `Steps - 1`, which will have
  228. // `Steps` matches within each group.
  229. BenchMetadata bm = bench_metadata<Kind>[Steps - 1];
  230. // We want to make the index used by the next iteration of the benchmark have
  231. // a data dependency on the result of matching. A match produces an index into
  232. // the group of metadata. To consume this match in a way that is
  233. // representative of how it will be used in a hashtable (indexing into an
  234. // array of entries), while establishing that dependence, we keep a
  235. // group-sized array of the value `1` in memory that we can index into to
  236. // increment to the next step of the loop. We do have to hide the contents of
  237. // the loop from the optimizer by clobbering the memory.
  238. ssize_t all_ones[GroupSize];
  239. for (ssize_t& n : all_ones) {
  240. n = 1;
  241. }
  242. benchmark::ClobberMemory();
  243. // We don't want the optimizer to peel iterations off of this loop, so hide
  244. // the starting index.
  245. ssize_t i = 0;
  246. benchmark::DoNotOptimize(i);
  247. // This loop looks *really* attractive to unroll to the compiler. However,
  248. // that can easily overlap some of the memory operations and generally makes
  249. // it harder to analyze the exact operation sequence we care about.
  250. #pragma clang loop unroll(disable)
  251. for (auto _ : s) {
  252. auto g = MetadataGroup::Load(bm.metadata.data(), i * GroupSize);
  253. auto matched_range = g.Match(bm.bytes[i]);
  254. // We don't use a `CARBON_CHECK` here as the loop below will test the range
  255. // to see if the loop should be skipped, replicating the test that we also
  256. // expect in hashtable usage.
  257. // We want to simulate the code sequence a hashtable would produce when
  258. // matching indices are "misses" in the hashtable, but only the aspects of
  259. // those that reflect on the specific *match* implementation's generated
  260. // code and performance. For each index in the match, we locate it in the
  261. // `matched_range`, extract it as an index, and use that to index a
  262. // group-sized array. We read memory from that array to increment `indices`,
  263. // establishing data dependencies on each match index. This loop will run
  264. // exactly `Steps` times.
  265. ssize_t indices = 0;
  266. for (ssize_t index : matched_range) {
  267. indices += all_ones[index];
  268. }
  269. // We want to propagate the data dependencies accumulated into `indices`
  270. // into the next value of `i`, and we know exactly how many increments were
  271. // done in the loop, so subtract that constant and add one to arrive back at
  272. // an increment of 1.
  273. i = (i + (indices - Steps + 1)) & (BenchSize - 1);
  274. }
  275. }
  276. BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 1>);
  277. BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 2>);
  278. BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 4>);
  279. BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 8>);
  280. #if CARBON_USE_X86_SIMD_CONTROL_GROUP
  281. BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 12>);
  282. BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 16>);
  283. #endif
  284. } // namespace
  285. } // namespace Carbon::RawHashtable