raw_hashtable_metadata_group_benchmark.cpp

// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>

#include <algorithm>
#include <array>
#include <numeric>

#include "absl/random/random.h"
#include "common/check.h"
#include "common/raw_hashtable_metadata_group.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"

namespace Carbon::RawHashtable {

// If we have any SIMD support, create dedicated benchmark utilities for the
// portable and SIMD implementation so we can directly benchmark both.
#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT

// Override the core API with explicit use of the portable API.
class BenchmarkPortableMetadataGroup : public MetadataGroup {
 public:
  explicit BenchmarkPortableMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}

  static auto Load(uint8_t* metadata, ssize_t index)
      -> BenchmarkPortableMetadataGroup {
    return BenchmarkPortableMetadataGroup(PortableLoad(metadata, index));
  }
  auto Store(uint8_t* metadata, ssize_t index) const -> void {
    PortableStore(metadata, index);
  }

  auto ClearDeleted() -> void { PortableClearDeleted(); }

  auto Match(uint8_t present_byte) const -> PortableMatchRange {
    return PortableMatch(present_byte);
  }
  auto MatchPresent() const -> PortableMatchRange {
    return PortableMatchPresent();
  }
  auto MatchEmpty() const -> MatchIndex { return PortableMatchEmpty(); }
  auto MatchDeleted() const -> MatchIndex { return PortableMatchDeleted(); }
};

// Override the core API with explicit use of the SIMD API.
class BenchmarkSIMDMetadataGroup : public MetadataGroup {
 public:
  explicit BenchmarkSIMDMetadataGroup(MetadataGroup g) : MetadataGroup(g) {}

  static auto Load(uint8_t* metadata, ssize_t index)
      -> BenchmarkSIMDMetadataGroup {
    return BenchmarkSIMDMetadataGroup(SIMDLoad(metadata, index));
  }
  auto Store(uint8_t* metadata, ssize_t index) const -> void {
    SIMDStore(metadata, index);
  }

  auto ClearDeleted() -> void { SIMDClearDeleted(); }

  auto Match(uint8_t present_byte) const -> SIMDMatchRange {
    return SIMDMatch(present_byte);
  }
  auto MatchPresent() const -> SIMDMatchPresentRange {
    return SIMDMatchPresent();
  }
  auto MatchEmpty() const -> MatchIndex { return SIMDMatchEmpty(); }
  auto MatchDeleted() const -> MatchIndex { return SIMDMatchDeleted(); }
};

#endif

namespace {

// The number of metadata groups we use when benchmarking a particular scenario
// of matching within a group.
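// Note that the benchmark loops below wrap their running index with
// `& (BenchSize - 1)`, which relies on this being a power of two.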
constexpr ssize_t BenchSize = 256;

#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
using PortableGroup = BenchmarkPortableMetadataGroup;
using SIMDGroup = BenchmarkSIMDMetadataGroup;
#endif

struct BenchMetadata {
  // The metadata for benchmarking, arranged in `BenchSize` groups, each one
  // `GroupSize` in length. As a consequence, the size of this array will
  // always be `BenchSize * GroupSize`.
  llvm::MutableArrayRef<uint8_t> metadata;

  // For benchmarking random matches in the metadata, each byte here is the tag
  // that should be matched against the corresponding group of the metadata.
  // Because this array parallels the *groups* of the metadata array, its size
  // will be `BenchSize`. For other kinds, this is empty.
  llvm::ArrayRef<uint8_t> bytes;
};

enum class BenchKind : uint8_t {
  Random,
  Empty,
  Deleted,
};

// This routine should only be called once per `BenchKind` as the initializer
// of a global variable below. It returns an `ArrayRef` pointing into
// function-local static storage that provides our benchmark metadata.
//
// The returned array will have exactly `GroupSize` elements, each of
// `BenchMetadata`. For the `BenchMetadata` at index `i`, there will be `i+1`
// matches of that kind within each group of the metadata. This lets us
// benchmark each of the possible match-counts for a group.
template <BenchKind Kind = BenchKind::Random>
static auto BuildBenchMetadata() -> llvm::ArrayRef<BenchMetadata> {
  // We build `GroupSize` elements of `BenchMetadata` below, and so we need
  // `GroupSize` copies of each of these arrays to serve as inputs to it.
  //
  // The first storage is of `BenchSize` groups of metadata.
  static uint8_t metadata_storage[GroupSize][BenchSize * GroupSize];
  // When `Kind` is `Random`, each group above will have a *different* byte
  // that matches in that group. This array stores those bytes for the
  // benchmark to match against the group.
  static uint8_t bytes_storage[GroupSize][BenchSize];
  // The backing storage for the returned `ArrayRef`.
  static BenchMetadata bm_storage[GroupSize];

  absl::BitGen gen;
  for (auto [bm_index, bm] : llvm::enumerate(bm_storage)) {
    int match_count = bm_index + 1;
    for (ssize_t g_index : llvm::seq<ssize_t>(0, BenchSize)) {
      // Start by filling the group with random bytes.
      llvm::MutableArrayRef group_bytes(
          &metadata_storage[bm_index][g_index * GroupSize], GroupSize);
      for (uint8_t& b : group_bytes) {
        b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
      }

      // Now we need up to `match_count` random indices into the group where
      // we'll put a matching byte.
      std::array<ssize_t, GroupSize> group_indices;
      std::iota(group_indices.begin(), group_indices.end(), 0);
      std::shuffle(group_indices.begin(), group_indices.end(), gen);

      // Now cause the first match index to have the desired value.
      ssize_t match_index = *group_indices.begin();
      uint8_t& match_b = group_bytes[match_index];
      switch (Kind) {
        case BenchKind::Random: {
          // Already a random value, but we need to ensure it isn't one that
          // repeats elsewhere in the group.
          while (llvm::count(group_bytes, match_b) > 1) {
            match_b = absl::Uniform<uint8_t>(gen) | MetadataGroup::PresentMask;
          }

          // Store this as the byte to search for in this group, but without
          // the present bit to simulate where we start when using a 7-bit tag
          // from a hash.
          bytes_storage[bm_index][g_index] =
              match_b & ~MetadataGroup::PresentMask;
          break;
        }
        case BenchKind::Empty: {
          match_b = MetadataGroup::Empty;
          break;
        }
        case BenchKind::Deleted: {
          match_b = MetadataGroup::Deleted;
          break;
        }
      }

      // Replicate the match byte in each of the other matching indices.
      for (ssize_t m_index : llvm::ArrayRef(group_indices)
                                 .drop_front()
                                 .take_front(match_count - 1)) {
        group_bytes[m_index] = match_b;
      }
    }

    // Now that the storage is set up, record these in our struct.
    bm.metadata = metadata_storage[bm_index];
    if constexpr (Kind == BenchKind::Random) {
      bm.bytes = bytes_storage[bm_index];
    }
  }
  return bm_storage;
}

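// One table of benchmark metadata is instantiated per `BenchKind` via this
// variable template, so the construction above runs a single time for each
// kind during static initialization.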
template <BenchKind Kind>
// NOLINTNEXTLINE(google-readability-casting): False positive clang-tidy bug.
const auto bench_metadata = BuildBenchMetadata<Kind>();

// Benchmark that simulates the dynamic execution pattern when we match exactly
// one entry in the group, typically then using the index of the matching byte
// to index into an element of a group of entries. But notably, the *first*
// match is sufficient, and we never have to find the *next* match within the
// group.
template <BenchKind Kind, typename GroupT = MetadataGroup>
static void BM_LoadMatch(benchmark::State& s) {
  // NOLINTNEXTLINE(google-readability-casting): Same as on `bench_metadata`.
  BenchMetadata bm = bench_metadata<Kind>[0];

  // We want to make the index used by the next iteration of the benchmark
  // have a data dependency on the result of matching. A match produces an
  // index into the group of metadata. To consume this match in a way that is
  // representative of how it will be used in a hashtable (indexing into an
  // array of entries), while establishing that dependence, we keep a
  // group-sized array of the value `1` in memory that we can index into to
  // increment to the next step of the loop. We do have to hide the contents
  // of the loop from the optimizer by clobbering the memory.
  ssize_t all_ones[GroupSize];
  for (ssize_t& n : all_ones) {
    n = 1;
  }
  benchmark::ClobberMemory();

  // We don't want the optimizer to peel iterations off of this loop, so hide
  // the starting index.
  ssize_t i = 0;
  benchmark::DoNotOptimize(i);

  // This loop looks *really* attractive to unroll to the compiler. However,
  // that can easily overlap some of the memory operations and generally makes
  // it harder to analyze the exact operation sequence we care about.
#pragma clang loop unroll(disable)
  for (auto _ : s) {
    auto g = GroupT::Load(bm.metadata.data(), i * GroupSize);
    typename GroupT::MatchIndex matches;
    if constexpr (Kind == BenchKind::Empty) {
      matches = g.MatchEmpty();
    } else if constexpr (Kind == BenchKind::Deleted) {
      matches = g.MatchDeleted();
    } else {
      static_assert(Kind == BenchKind::Random);
      matches = static_cast<MetadataGroup::MatchIndex>(g.Match(bm.bytes[i]));
    }

    // Despite not being a DCHECK, this is fine for benchmarking. In an actual
    // hashtable, we expect to have a test for empty of the match prior to
    // using it to index an array, and that test is expected to be strongly
    // predicted. That exactly matches how the `CARBON_CHECK` macro works, and
    // so this serves as both a good correctness test and replication of
    // hashtable usage of a match.
    CARBON_CHECK(matches);

    // Now do the data-dependent increment by indexing our "all ones" array.
    // The index into `all_ones` is analogous to the index into a group of
    // hashtable entries.
    i = (i + all_ones[matches.index()]) & (BenchSize - 1);
  }
}
BENCHMARK(BM_LoadMatch<BenchKind::Random>);
BENCHMARK(BM_LoadMatch<BenchKind::Empty>);
BENCHMARK(BM_LoadMatch<BenchKind::Deleted>);
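// When SIMD support is available, also register the explicitly portable and
// explicitly SIMD wrappers so both implementations can be compared directly
// against the default `MetadataGroup` dispatch.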
#if CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT
BENCHMARK(BM_LoadMatch<BenchKind::Random, PortableGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Empty, PortableGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Deleted, PortableGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Random, SIMDGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Empty, SIMDGroup>);
BENCHMARK(BM_LoadMatch<BenchKind::Deleted, SIMDGroup>);
#endif

// Benchmark that measures the speed of a match that is only found after at
// least one miss. Because the first match doesn't work, this covers
// incrementing to the next match, with the number of increments taken from the
// `Steps` template parameter.
template <BenchKind Kind, ssize_t Steps>
static void BM_LoadMatchMissSteps(benchmark::State& s) {
  static_assert(Steps > 0);
  static_assert(Steps <= GroupSize);

  // We pick the benchmark metadata at index `Steps - 1`, which will have
  // `Steps` matches within each group.
  BenchMetadata bm = bench_metadata<Kind>[Steps - 1];

  // We want to make the index used by the next iteration of the benchmark
  // have a data dependency on the result of matching. A match produces an
  // index into the group of metadata. To consume this match in a way that is
  // representative of how it will be used in a hashtable (indexing into an
  // array of entries), while establishing that dependence, we keep a
  // group-sized array of the value `1` in memory that we can index into to
  // increment to the next step of the loop. We do have to hide the contents
  // of the loop from the optimizer by clobbering the memory.
  ssize_t all_ones[GroupSize];
  for (ssize_t& n : all_ones) {
    n = 1;
  }
  benchmark::ClobberMemory();

  // We don't want the optimizer to peel iterations off of this loop, so hide
  // the starting index.
  ssize_t i = 0;
  benchmark::DoNotOptimize(i);

  // This loop looks *really* attractive to unroll to the compiler. However,
  // that can easily overlap some of the memory operations and generally makes
  // it harder to analyze the exact operation sequence we care about.
#pragma clang loop unroll(disable)
  for (auto _ : s) {
    auto g = MetadataGroup::Load(bm.metadata.data(), i * GroupSize);
    auto matched_range = g.Match(bm.bytes[i]);
    // We don't use a `CARBON_CHECK` here as the loop below will test the range
    // to see if the loop should be skipped, replicating the test that we also
    // expect in hashtable usage.

    // We want to simulate the code sequence a hashtable would produce when
    // matching indices are "misses" in the hashtable, but only the aspects of
    // those that reflect on the specific *match* implementation's generated
    // code and performance. For each index in the match, we locate it in the
    // `matched_range`, extract it as an index, and use that to index a
    // group-sized array. We read memory from that array to increment
    // `indices`, establishing data dependencies on each match index. This
    // loop will run exactly `Steps` times.
    ssize_t indices = 0;
    for (ssize_t index : matched_range) {
      indices += all_ones[index];
    }

    // We want to propagate the data dependencies accumulated into `indices`
    // into the next value of `i`, and we know exactly how many increments were
    // done in the loop, so subtract that constant and add one to arrive back
    // at an increment of 1.
    i = (i + (indices - Steps + 1)) & (BenchSize - 1);
  }
}
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 1>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 2>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 4>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 8>);
#if CARBON_USE_X86_SIMD_CONTROL_GROUP
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 12>);
BENCHMARK(BM_LoadMatchMissSteps<BenchKind::Random, 16>);
#endif

}  // namespace
}  // namespace Carbon::RawHashtable