// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
#define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_

#include <cstddef>
#include <cstring>
#include <iterator>

#include "common/check.h"
#include "common/ostream.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"

// Detect whether we can use SIMD-accelerated implementations of the control
// groups, and include the relevant platform-specific APIs for the SIMD
// implementations.
//
// Reference documentation for the SIMD APIs used here:
// - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
// - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
#if defined(__SSSE3__)
#include <x86intrin.h>
#define CARBON_X86_SIMD_SUPPORT 1
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#define CARBON_NEON_SIMD_SUPPORT 1
#endif

// This namespace collects low-level utilities for implementing hashtable
// data structures. This file only provides one of them:
//
// - Primitives to manage "groups" of hashtable entries that have densely
//   packed control bytes we can scan rapidly as a group, often using SIMD
//   facilities to process the entire group at once.
namespace Carbon::RawHashtable {

// We define a constant max group size. The particular group size used in
// practice may vary, but we want some upper bound used to ensure memory
// allocation is done consistently across different architectures.
constexpr ssize_t MaxGroupSize = 16;

// This takes a collection of bits representing the results of looking for a
// particular tag in this metadata group and determines the first position with
// a match. The position is represented by either the least significant set bit
// or the least significant non-zero byte, depending on `ByteEncoding`. When
// represented with a non-zero byte, that byte must have at least its most
// significant bit set, but may have other bits set to any value. Bits more
// significant than the match may have any value provided there is at least one
// match. Zero matches must be represented by a zero input.
//
// Some bits of the underlying value may be known-zero, which can optimize
// various operations. These can be represented as a `ZeroMask`.
template <typename BitsInputT, bool ByteEncoding, BitsInputT ZeroMask = 0>
class BitIndex
    : public Printable<BitIndex<BitsInputT, ByteEncoding, ZeroMask>> {
 public:
  using BitsT = BitsInputT;

  BitIndex() = default;
  explicit BitIndex(BitsT bits) : bits_(bits) {}

  friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool {
    if (lhs.empty() || rhs.empty()) {
      return lhs.empty() == rhs.empty();
    }
    // For non-empty bit indices, compare the indices directly to ignore other
    // (extraneous) parts of the incoming bits.
    return lhs.index() == rhs.index();
  }

  auto Print(llvm::raw_ostream& out) const -> void {
    out << llvm::formatv("{0:x}", bits_);
  }

  explicit operator bool() const { return !empty(); }

  // Returns true when there are no matches for the tag.
  auto empty() const -> bool {
    CARBON_DCHECK((bits_ & ZeroMask) == 0, "Unexpected non-zero bits!");
    __builtin_assume((bits_ & ZeroMask) == 0);
    return bits_ == 0;
  }

  // Returns the index of the first matched tag.
  auto index() -> ssize_t {
    CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
    __builtin_assume(bits_ != 0);
    ssize_t index = unscaled_index();
    if constexpr (ByteEncoding) {
      // Shift to scale out of the byte encoding.
      index >>= ByteEncodingShift;
    }
    return index;
  }

  // Optimized tool to index a pointer `pointer` by `index()`.
  template <typename T>
  auto index_ptr(T* pointer) -> T* {
    CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
    __builtin_assume(bits_ != 0);
    if constexpr (!ByteEncoding) {
      return &pointer[unscaled_index()];
    }
    ssize_t index = unscaled_index();
    // Scale the index as we counted zero *bits* and not zero *bytes*.
    // However, we can fold that scale with the size of `T` when it is a power
    // of two or divisible by 8.
    CARBON_DCHECK(
        (index & ((static_cast<size_t>(1) << ByteEncodingShift) - 1)) == 0);
    if constexpr (sizeof(T) % 8 == 0) {
      constexpr size_t FoldedScale = sizeof(T) / 8;
      index *= FoldedScale;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) {
      constexpr size_t ScaleShift = llvm::CTLog2<sizeof(T)>();
      static_assert(ScaleShift <= ByteEncodingShift,
                    "Scaling by >=8 should be handled above!");
      constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift;
      index >>= FoldedShift;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    }
    // Nothing we can fold here.
    return &pointer[index >> ByteEncodingShift];
  }
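
  // A worked example (not from the original source): with byte encoding and
  // `sizeof(T) == 4`, `ScaleShift` is 2, so the byte-encoded index is shifted
  // right by `FoldedShift == 1` -- equivalent to `(index >> 3) * 4` but with
  // a single shift. With `sizeof(T) == 24`, the `% 8` branch instead scales
  // the index by `FoldedScale == 3`, since `(index >> 3) * 24 == index * 3`.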

 private:
  // When using a byte encoding, we'll need to shift any index by this amount.
  static constexpr size_t ByteEncodingShift = 3;

  auto unscaled_index() -> ssize_t {
    if constexpr (!ByteEncoding) {
      // Note the cast to `size_t` to force zero extending the result.
      return static_cast<size_t>(llvm::countr_zero(bits_));
    } else {
      // The index is encoded in the high bit of each byte. We compute the
      // index by counting the number of low zero bytes there are before the
      // first byte with its high bit set. Rather than shifting the high bit
      // to be the low bit and counting the trailing (least significant) zero
      // bits directly, we instead byte-reverse the bits and count the
      // *leading* (most significant) zero bits. While this may be a wash on
      // CPUs with direct support for counting the trailing zero bits, AArch64
      // only supports counting the leading zero bits and requires a
      // bit-reverse to count the trailing zero bits. The byte-reverse
      // approach essentially combines moving the high bit into the low bit
      // with the reverse necessary for counting the zero bits. While this
      // only removes one instruction, it is an instruction in the critical
      // path of the hottest part of table lookup, and that critical path
      // dependency height is few enough instructions that removing even one
      // significantly impacts latency.
      //
      // We also cast to `size_t` to clearly zero-extend the result.
      return static_cast<size_t>(llvm::countl_zero(llvm::byteswap(bits_)));
    }
  }
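
  // A worked example (values chosen for illustration): if byte 2 is the
  // first match, `bits_` might be 0x0000'0000'0080'0000. `llvm::byteswap`
  // produces 0x0000'8000'0000'0000, `llvm::countl_zero` returns 16, and
  // `index()` then shifts right by `ByteEncodingShift` to yield byte index 2.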

  BitsT bits_ = 0;
};

// This is like `BitIndex`, but allows iterating through all of the matches.
//
// A key requirement for efficient iteration is that all of the matches are
// represented with a single bit and there are no other bits set. For example,
// with byte-encoded bit indices, exactly the high bit and no other bit of each
// matching byte must be set. This is a stricter constraint than what
// `BitIndex` alone would impose on any one of the matches.
template <typename BitIndexT>
class BitIndexRange : public Printable<BitIndexRange<BitIndexT>> {
 public:
  using BitsT = BitIndexT::BitsT;

  class Iterator
      : public llvm::iterator_facade_base<Iterator, std::forward_iterator_tag,
                                          ssize_t, ssize_t> {
   public:
    Iterator() = default;
    explicit Iterator(BitsT bits) : bits_(bits) {}

    auto operator==(const Iterator& rhs) const -> bool {
      return bits_ == rhs.bits_;
    }

    auto operator*() -> ssize_t& {
      CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
      __builtin_assume(bits_ != 0);
      index_ = BitIndexT(bits_).index();
      // Note that we store the index in a member so we can return a reference
      // to it here as required to be a forward iterator.
      return index_;
    }

    template <typename T>
    auto index_ptr(T* pointer) -> T* {
      return BitIndexT(bits_).index_ptr(pointer);
    }

    auto operator++() -> Iterator& {
      CARBON_DCHECK(bits_ != 0, "Must not increment past the end!");
      __builtin_assume(bits_ != 0);
      // Clears the least significant set bit, effectively stepping to the
      // next match.
      bits_ &= (bits_ - 1);
      return *this;
    }
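
    // For example (illustrative values): with bit-encoded matches `0b0101`,
    // `bits_ - 1` is `0b0100`, and the `&` leaves `0b0100`: the lowest match
    // is consumed and the next dereference yields index 2. The same trick
    // works in the byte encoding because exactly one bit (the high bit) is
    // set in each matching byte.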

   private:
    ssize_t index_;
    BitsT bits_ = 0;
  };

  BitIndexRange() = default;
  explicit BitIndexRange(BitsT bits) : bits_(bits) {}

  explicit operator bool() const { return !empty(); }
  auto empty() const -> bool { return BitIndexT(bits_).empty(); }

  auto begin() const -> Iterator { return Iterator(bits_); }
  auto end() const -> Iterator { return Iterator(); }

  friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool {
    return lhs.bits_ == rhs.bits_;
  }

  auto Print(llvm::raw_ostream& out) const -> void {
    out << llvm::formatv("{0:x}", bits_);
  }

  explicit operator BitsT() const { return bits_; }
  explicit operator BitIndexT() const { return BitIndexT(bits_); }

 private:
  BitsT bits_ = 0;
};
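
// A small usage sketch (hypothetical values, not from the original source):
// a `BitIndexRange` iterates like any forward range, visiting each matching
// index in turn:
//
//   for (ssize_t byte_index : match_range) {
//     // Inspect the entry at `byte_index` within the group...
//   }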

// A group of metadata bytes that can be manipulated together.
//
// The metadata bytes used in Carbon's hashtable implementation are designed to
// support being manipulated as groups, either using architecture-specific SIMD
// code sequences or using portable SIMD-in-an-integer-register code sequences.
// These operations are unusually performance sensitive and sometimes in
// surprising ways. The implementations here are crafted specifically to
// optimize the particular usages in Carbon's hashtable and should not be
// expected to be reusable in any other context.
//
// Throughout the functions operating on this type we try to use patterns with
// a fallback portable implementation which can be directly used in the absence
// of a SIMD implementation, but is also used (with the same code) to check
// that any SIMD implementation produces the same result as the portable one.
// These patterns help minimize un-compiled or un-tested paths through either
// portable or SIMD code, regardless of which path is actually *used* on a
// particular platform. To illustrate a common version of this pattern, we
// might have code like:
//
// ```cpp
// auto MetadataGroup::Operation(...) -> ... {
//   ... portable_result;
//   ... simd_result;
//   if constexpr (!UseSIMD || DebugSIMD) {
//     portable_result = PortableOperation(...);
//   }
//   if (UseSIMD || DebugSIMD) {
//     simd_result = SIMDOperation(...);
//     CARBON_DCHECK(simd_result == portable_result, "{0}", ...);
//   }
//   return UseSIMD ? simd_result : portable_result;
// }
// ```
class MetadataGroup : public Printable<MetadataGroup> {
 public:
  static constexpr ssize_t Size =
#if CARBON_X86_SIMD_SUPPORT
      16;
#else
      8;
#endif
  static_assert(Size >= 8);
  static_assert(Size % 8 == 0);
  static_assert(Size <= MaxGroupSize);
  static_assert(MaxGroupSize % Size == 0);
  static_assert(llvm::isPowerOf2_64(Size),
                "The group size must be a constant power of two so dividing "
                "by it is a simple shift.");

  static constexpr ssize_t Mask = Size - 1;

  // Each control byte can have special values. All special values have the
  // most significant bit cleared to distinguish them from the seven hash bits
  // stored when the control byte represents a full bucket.
  //
  // Otherwise, their values are chosen primarily to provide efficient SIMD
  // implementations of the common operations on an entire control group.
  static constexpr uint8_t Empty = 0;
  static constexpr uint8_t Deleted = 1;
  static constexpr uint8_t PresentMask = 0b1000'0000;

  // Some architectures make it much more efficient to build the match indices
  // in a byte-encoded form rather than a bit-encoded form. This encoding
  // changes verification and other aspects of our algorithms.
  static constexpr bool ByteEncoding =
#if CARBON_X86_SIMD_SUPPORT
      false;
#else
      true;
#endif
  static_assert(!ByteEncoding || Size == 8,
                "We can only support byte encoding with a group size of 8.");

  // We need to indicate to users of the metadata group when they can
  // efficiently hold a group value in a "register" (local variable) across
  // clearing of individual bytes in the group. If the entire group can fit in
  // an integer register, this works well and clients of the group should work
  // to use the already-loaded value when clearing bytes. But when we have a
  // larger group size, clearing a byte will typically require storing that
  // byte to memory and re-loading the group. The usage patterns that need to
  // clear bytes can in those cases avoid clearing a loaded group, and clear
  // the byte directly in the larger metadata array.
  static constexpr bool FastByteClear = Size == 8;

  using MatchIndex =
      BitIndex<std::conditional_t<ByteEncoding, uint64_t, uint32_t>,
               ByteEncoding,
               /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>;
  using MatchRange = BitIndexRange<MatchIndex>;

  union {
    uint8_t metadata_bytes[Size];
    uint64_t metadata_ints[Size / 8];
#if CARBON_NEON_SIMD_SUPPORT
    uint8x8_t metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#elif CARBON_X86_SIMD_SUPPORT
    __m128i metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#endif
  };

  auto Print(llvm::raw_ostream& out) const -> void;

  friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool {
    return CompareEqual(lhs, rhs);
  }

  // The main API for this class follows. It will switch between a portable
  // and a SIMD implementation based on what is most efficient, but in debug
  // builds it will cross-check that the implementations do not diverge.

  // Load and return a group of metadata bytes out of the main metadata array
  // at a particular `index`. The index must be a multiple of `GroupSize`.
  // This will arrange for the load to place the group into the correct
  // structure for efficient register-based processing.
  static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup;

  // Store this metadata group into the main metadata array at the provided
  // `index`. The index must be a multiple of `GroupSize`.
  auto Store(uint8_t* metadata, ssize_t index) const -> void;

  // Clear a byte of this group's metadata at the provided `byte_index` to the
  // empty value. Note that this must only be called when `FastByteClear` is
  // true -- in all other cases users of this class should arrange to clear
  // individual bytes in the underlying array rather than using the group API.
  auto ClearByte(ssize_t byte_index) -> void;

  // Clear all of this group's metadata bytes that indicate a deleted slot to
  // the empty value.
  auto ClearDeleted() -> void;

  // Find all of the bytes of metadata in this group that are present and
  // whose low 7 bits match the provided `tag`. The `tag` byte must have a
  // clear high bit; only 7 bits of tag are used. Note that this means the
  // provided tag is *not* the actual present metadata byte -- this function
  // is responsible for mapping the tag into that form as it can do so more
  // efficiently in some cases. A range over all of the byte indices which
  // matched is returned.
  auto Match(uint8_t tag) const -> MatchRange;

  // Find all of the present bytes of metadata in this group. A range over all
  // of the byte indices which are present is returned.
  auto MatchPresent() const -> MatchRange;

  // Find the first byte of the metadata group that is empty and return its
  // index. There is no required order or position for which of the bytes of
  // metadata is considered "first"; any model that makes it efficient to
  // produce the matching index will do. Must return an empty match index if
  // no bytes match the empty metadata.
  auto MatchEmpty() const -> MatchIndex;

  // Find the first byte of the metadata group that is deleted and return its
  // index. There is no required order or position for which of the bytes of
  // metadata is considered "first"; any model that makes it efficient to
  // produce the matching index will do. Must return an empty match index if
  // no bytes match the deleted metadata.
  auto MatchDeleted() const -> MatchIndex;
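
  // A probe-loop sketch (hypothetical caller code, not part of this header)
  // tying the API together. `KeysEqual` and `NotFound` are placeholders:
  //
  //   MetadataGroup g = MetadataGroup::Load(metadata, group_index);
  //   for (ssize_t byte_index : g.Match(tag)) {
  //     if (KeysEqual(group_index + byte_index, key)) {
  //       return group_index + byte_index;  // Found a matching entry.
  //     }
  //   }
  //   if (g.MatchEmpty()) {
  //     return NotFound;  // An empty slot terminates the probe sequence.
  //   }
  //   // Otherwise, continue to the next group in the probe sequence.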

 private:
  // Two classes only defined in the benchmark code are allowed to directly
  // call the portable and SIMD implementations for benchmarking purposes.
  friend class BenchmarkPortableMetadataGroup;
  friend class BenchmarkSIMDMetadataGroup;

  // Whether to use a SIMD implementation. Even when we *support* a SIMD
  // implementation, we do not always have to use it in the event that it is
  // less efficient than the portable version.
  static constexpr bool UseSIMD =
#if CARBON_X86_SIMD_SUPPORT
      true;
#else
      false;
#endif

  // All SIMD variants that we have an implementation for should be enabled
  // for debugging. This lets us maintain a SIMD implementation even if it is
  // not used due to performance reasons, and easily re-enable it if the
  // performance changes.
  static constexpr bool DebugSIMD =
#if !defined(NDEBUG) && (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT)
      true;
#else
      false;
#endif

  // Most and least significant bits set.
  static constexpr uint64_t MSBs = 0x8080'8080'8080'8080ULL;
  static constexpr uint64_t LSBs = 0x0101'0101'0101'0101ULL;

  using MatchBitsT = MatchIndex::BitsT;

  static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

  // Functions for validating that the returned matches agree with what is
  // predicted by the `byte_match` function. These either `CHECK`-fail or
  // return true. To pass validation, the `*_bits` argument must have `0x80`
  // for those bytes where `byte_match` returns true, and `0` for the rest.
  //
  // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only
  // promise to return accurate information up to the first match.
  auto VerifyIndexBits(
      MatchBitsT index_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;
  // `VerifyRangeBits` is for functions that return `MatchRange`, and so it
  // validates all the bytes of `range_bits`.
  auto VerifyRangeBits(
      MatchBitsT range_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // Portable implementations of each operation. These are used on platforms
  // without SIMD support or where the portable implementation is faster than
  // SIMD. They are heavily optimized even though they are not SIMD because we
  // expect there to be platforms where the portable implementation can
  // outperform SIMD. Their behavior and semantics exactly match the
  // documentation for the un-prefixed functions.
  //
  // In debug builds, these also directly verify their results to help
  // establish baseline functionality.
  static auto PortableLoad(const uint8_t* metadata, ssize_t index)
      -> MetadataGroup;
  auto PortableStore(uint8_t* metadata, ssize_t index) const -> void;

  auto PortableClearDeleted() -> void;

  auto PortableMatch(uint8_t tag) const -> MatchRange;
  auto PortableMatchPresent() const -> MatchRange;

  auto PortableMatchEmpty() const -> MatchIndex;
  auto PortableMatchDeleted() const -> MatchIndex;

  static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs)
      -> bool;

  // SIMD implementations of each operation. We minimize platform-specific
  // APIs to reduce the scope of errors that can only be discovered building
  // on one platform, so the bodies of these contain the platform-specific
  // code. Their behavior and semantics exactly match the documentation for
  // the un-prefixed functions.
  //
  // These routines don't directly verify their results as we can build
  // simpler debug checks by comparing them against the verified portable
  // results.
  static auto SIMDLoad(const uint8_t* metadata, ssize_t index)
      -> MetadataGroup;
  auto SIMDStore(uint8_t* metadata, ssize_t index) const -> void;

  auto SIMDClearDeleted() -> void;

  auto SIMDMatch(uint8_t tag) const -> MatchRange;
  auto SIMDMatchPresent() const -> MatchRange;

  auto SIMDMatchEmpty() const -> MatchIndex;
  auto SIMDMatchDeleted() const -> MatchIndex;

  static auto SIMDCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

#if CARBON_X86_SIMD_SUPPORT
  // A common routine for x86 SIMD matching that can be used for matching
  // present, empty, and deleted bytes with equal efficiency.
  auto X86SIMDMatch(uint8_t match_byte) const -> MatchRange;
#endif
};

// Promote the size and mask to top-level constants as we'll need to operate
// on the grouped structure outside of the metadata bytes.
inline constexpr ssize_t GroupSize = MetadataGroup::Size;
inline constexpr ssize_t GroupMask = MetadataGroup::Mask;

inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup portable_g;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_g = PortableLoad(metadata, index);
    if constexpr (!UseSIMD) {
      return portable_g;
    }
  }
  MetadataGroup g = SIMDLoad(metadata, index);
  CARBON_DCHECK(g == portable_g);
  return g;
}

inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const
    -> void {
  if constexpr (!UseSIMD) {
    std::memcpy(metadata + index, &metadata_bytes, Size);
  } else {
    SIMDStore(metadata, index);
  }
  CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size));
}

inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void {
  CARBON_DCHECK(FastByteClear, "Only use byte clearing when fast!");
  CARBON_DCHECK(Size == 8,
                "The clear implementation assumes an 8-byte group.");

  metadata_ints[0] &= ~(static_cast<uint64_t>(0xff) << (byte_index * 8));
}
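
// For example (illustrative only): `ClearByte(2)` computes the mask
// `~(0xff << 16) == 0xffff'ffff'ff00'ffff`, which zeroes byte 2 of the group
// (setting it to `Empty`) and leaves every other byte intact.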

inline auto MetadataGroup::ClearDeleted() -> void {
  MetadataGroup portable_g = *this;
  MetadataGroup simd_g = *this;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_g.PortableClearDeleted();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_g.SIMDClearDeleted();
    CARBON_DCHECK(
        simd_g == portable_g,
        "SIMD cleared group '{0}' doesn't match portable cleared group '{1}'",
        simd_g, portable_g);
  }
  *this = UseSIMD ? simd_g : portable_g;
}

inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange {
  // The caller should provide us with the present byte hash, and not set any
  // present bit tag on it so that this layer can manage tagging the high bit
  // of a present byte.
  CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);

  MatchRange portable_result;
  MatchRange simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatch(tag);
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatch(tag);
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::MatchPresent() const -> MatchRange {
  MatchRange portable_result;
  MatchRange simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatchPresent();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatchPresent();
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::MatchEmpty() const -> MatchIndex {
  MatchIndex portable_result;
  MatchIndex simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatchEmpty();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatchEmpty();
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::MatchDeleted() const -> MatchIndex {
  MatchIndex portable_result;
  MatchIndex simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatchDeleted();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatchDeleted();
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs)
    -> bool {
  bool portable_result;
  bool simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableCompareEqual(lhs, rhs);
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDCompareEqual(lhs, rhs);
    CARBON_DCHECK(simd_result == portable_result);
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::VerifyIndexBits(
    MatchBitsT index_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((index_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
        // Only the first match is needed, so stop scanning once found.
        break;
      }
      CARBON_CHECK(((index_bits >> byte_index) & 1) == 0,
                   "Bit set at non-matching byte index: {0}", byte_index);
    } else {
      // `index_bits` is byte-encoded rather than bit-encoded, so extract a
      // byte.
      uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(
            (index_byte & 0x80) == 0x80,
            "Should have the high bit set for a matching byte, found: {0:x}",
            index_byte);
        // Only the first match is needed, so stop scanning once found.
        break;
      }
      CARBON_CHECK(
          index_byte == 0,
          "Should have no bits set for an unmatched byte, found: {0:x}",
          index_byte);
    }
  }
  return true;
}

inline auto MetadataGroup::VerifyRangeBits(
    MatchBitsT range_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
      } else {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 0,
                     "Bit set at non-matching byte index: {0}", byte_index);
      }
    } else {
      // `range_bits` is byte-encoded rather than bit-encoded, so extract a
      // byte.
      uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(range_byte == 0x80,
                     "Should just have the high bit set for a matching byte, "
                     "found: {0:x}",
                     range_byte);
      } else {
        CARBON_CHECK(
            range_byte == 0,
            "Should have no bits set for an unmatched byte, found: {0:x}",
            range_byte);
      }
    }
  }
  return true;
}

inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup g;
  static_assert(sizeof(g) == Size);
  std::memcpy(&g.metadata_bytes, metadata + index, Size);
  return g;
}

inline auto MetadataGroup::PortableStore(uint8_t* metadata,
                                         ssize_t index) const -> void {
  std::memcpy(metadata + index, &metadata_bytes, Size);
}

inline auto MetadataGroup::PortableClearDeleted() -> void {
  for (uint64_t& metadata_int : metadata_ints) {
    // Deleted bytes have only the least significant bit set, so to clear them
    // we only need to clear the least significant bit. And empty bytes
    // already have a clear least significant bit, so the only least
    // significant bits we need to preserve are those of present bytes. The
    // most significant bit of every present byte is set, so we take the most
    // significant bit of each byte, shift it into the least significant bit
    // position, and bit-or it with the complement of `LSBs`. The resulting
    // mask has ones for every bit but the least significant bits, and ones
    // for the least significant bits of every present byte.
    metadata_int &= (~LSBs | metadata_int >> 7);
  }
}
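
// A worked example (bytes chosen for illustration): for a group with bytes
// {0x01 (deleted), 0x00 (empty), 0xa5 (present)}, the mask
// `~LSBs | metadata_int >> 7` has a zero in the low bit of byte 0 (its high
// bit was clear) and a one in the low bit of byte 2 (its high bit was set),
// so the deleted byte becomes 0x00 while 0xa5 survives unchanged.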

inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange {
  // The caller should provide us with the present byte hash, and not set any
  // present bit tag on it so that this layer can manage tagging the high bit
  // of a present byte.
  CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);

  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    uint8_t present_byte = tag | PresentMask;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == present_byte) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }

  // This algorithm only works for matching *present* bytes. We leverage the
  // set high bit in the present case as part of the algorithm. The whole
  // algorithm has a critical path height of 4 operations, and does 6
  // operations total on AArch64. The operation dependency graph is:
  //
  //   group | MSBs    LSBs * match_byte + MSBs
  //            \           /
  //         match_bits ^ broadcast
  //                   |
  //   group & MSBs    MSBs - match_bits
  //            \           /
  //        group_MSBs & match_bits
  //
  // This diagram and the operation count are specific to AArch64 where we
  // have a fused *integer* multiply-add operation.
  //
  // While it is superficially similar to the "find zero bytes in a word" bit
  // math trick, it is different because this is designed to have no false
  // positives and perfectly produce 0x80 for matching bytes and 0x00 for
  // non-matching bytes. This is doable because we constrain to only handle
  // present matches, which only require testing 7 bits and have a particular
  // layout.

  // Set the high bit of every byte to `1`. Any matching byte is a present
  // byte and so always has this bit set as well, which means the xor below,
  // in addition to zeroing the low 7 bits of any byte that matches the tag,
  // also clears the high bit of every byte.
  uint64_t match_bits = metadata_ints[0] | MSBs;
  // Broadcast the match byte to all bytes, and mask in the present bits in
  // the MSBs of each byte. We structure this as a multiply and an add because
  // we know that the add cannot carry, and this way it can be lowered using
  // combined multiply-add instructions if available.
  uint64_t broadcast = LSBs * tag + MSBs;
  CARBON_DCHECK(broadcast == (LSBs * tag | MSBs),
                "Unexpected carry from addition!");

  // Xor the broadcast byte pattern. This makes bytes with matches become 0,
  // and clears the high bits of non-matches. Note that if we are looking for
  // a tag with the same value as `Empty` or `Deleted`, those bytes will be
  // zero as well.
  match_bits = match_bits ^ broadcast;
  // Subtract each byte of `match_bits` from `0x80` bytes. After this, the
  // high bit will be set only for those bytes that were zero.
  match_bits = MSBs - match_bits;
  // Zero everything but the high bits, and also zero the high bits of any
  // bytes for "not present" slots in the original group. This avoids false
  // positives for `Empty` and `Deleted` bytes in the metadata.
  match_bits &= (metadata_ints[0] & MSBs);

  // At this point, `match_bits` has the high bit set for bytes where the
  // original group byte equals `tag` plus the high bit.
  CARBON_DCHECK(VerifyRangeBits(
      match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); }));
  return MatchRange(match_bits);
}
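
// A worked example (values chosen for illustration): matching tag 0x2a
// against group bytes {0xaa, 0x00, 0x01, 0x93, 0x00, 0x00, 0x00, 0x00},
// i.e. `metadata_ints[0] == 0x0000'0000'9301'00aa`:
//
//   match_bits = metadata | MSBs       // 0x8080'8080'9381'80aa
//   broadcast  = LSBs * 0x2a + MSBs    // 0xaaaa'aaaa'aaaa'aaaa
//   match_bits ^= broadcast            // 0x2a2a'2a2a'392b'2a00
//   match_bits = MSBs - match_bits     // 0x5656'5656'4755'5680
//   match_bits &= metadata & MSBs      // 0x0000'0000'0000'0080
//
// Only byte 0 (0xaa == 0x2a | PresentMask) ends up flagged with 0x80; byte 3
// is present but carries a different tag, and the empty/deleted bytes are
// masked off by the final step.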

inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] & PresentMask) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }

  // We want to keep the high bit of each byte, which indicates whether that
  // byte represents a present slot.
  uint64_t match_bits = metadata_ints[0] & MSBs;

  CARBON_DCHECK(VerifyRangeBits(
      match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; }));
  return MatchRange(match_bits);
}
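
// For example (illustrative bytes): with group bytes
// {0xaa, 0x00, 0x01, 0x93, 0x00, 0x00, 0x00, 0x00}, masking with `MSBs`
// yields 0x0000'0000'8000'0080, and iterating the resulting range visits
// byte indices 0 and 3.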

inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Empty) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }

  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 0. We take advantage of the fact that the
  // metadata bytes are non-zero only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - deleted: in which case the byte will be 1, and shifting it left by 7
  //   will cause the high bit to be set.
  uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & MSBs;

  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Empty`.
  CARBON_DCHECK(
      VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; }));
  return MatchIndex(match_bits);
}
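
// For example (illustrative bytes): with group bytes
// {0xaa, 0x00, 0x01, 0x00, ...}, the `metadata | metadata << 7` step sets
// the high bit of byte 0 (already present) and of byte 2 (deleted: its low
// bit shifts up to the high bit), but not of the empty bytes. After
// `~match_bits & MSBs`, only the empty bytes carry 0x80, so the returned
// `MatchIndex` reports byte index 1.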

inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Deleted) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }

  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 1. We take advantage of the fact that the
  // metadata bytes are not 1 only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - empty: in which case the byte will be 0, so inverting it sets its low
  //   bit, and shifting that left by 7 sets the high bit.
  uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & MSBs;

  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Deleted`.
  CARBON_DCHECK(VerifyIndexBits(
      match_bits, [](uint8_t byte) { return byte == Deleted; }));
  return MatchIndex(match_bits);
}
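
// For example (illustrative bytes): with group bytes {0xaa, 0x00, 0x01, ...},
// the `~metadata << 7` step sets the high bit of every empty byte (inverting
// 0x00 sets the low bit), and present bytes keep their own high bit, so after
// `~match_bits & MSBs` only byte 2 (the deleted byte) is flagged and the
// returned `MatchIndex` reports index 2.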

inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs,
                                                MetadataGroup rhs) -> bool {
  return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes);
}

inline auto MetadataGroup::SIMDLoad(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup g;
#if CARBON_NEON_SIMD_SUPPORT
  g.metadata_vec = vld1_u8(metadata + index);
#elif CARBON_X86_SIMD_SUPPORT
  g.metadata_vec =
      _mm_load_si128(reinterpret_cast<const __m128i*>(metadata + index));
#else
  static_assert(!UseSIMD, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
  return g;
}

inline auto MetadataGroup::SIMDStore(uint8_t* metadata, ssize_t index) const
    -> void {
#if CARBON_NEON_SIMD_SUPPORT
  vst1_u8(metadata + index, metadata_vec);
#elif CARBON_X86_SIMD_SUPPORT
  _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec);
#else
  static_assert(!UseSIMD, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
}

inline auto MetadataGroup::SIMDClearDeleted() -> void {
#if CARBON_NEON_SIMD_SUPPORT
  // There is no good Neon operation to implement this, so do it using integer
  // code. This is reasonably fast, but unfortunate because it forces the
  // group out of a SIMD register and into a general purpose register, which
  // can have high latency.
  metadata_ints[0] &= (~LSBs | metadata_ints[0] >> 7);
#elif CARBON_X86_SIMD_SUPPORT
  // For each byte, use `metadata_vec` if the byte's high bit is set
  // (indicating it is present), otherwise (it is empty or deleted) replace it
  // with zero (representing empty).
  metadata_vec =
      _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec);
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
}
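
// Note on the x86 path above (added explanation, not from the original
// source): `_mm_blendv_epi8(a, b, mask)` selects each byte from `b` when that
// byte of `mask` has its high bit set, and from `a` otherwise. Present
// metadata bytes are exactly those with the high bit set, so the metadata
// vector can serve as its own blend mask.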

inline auto MetadataGroup::SIMDMatch(uint8_t tag) const -> MatchRange {
  MatchRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast the byte we want to match to every byte in the vector.
  auto match_byte_vec = vdup_n_u8(tag | PresentMask);
  // Result bytes have all bits set for the bytes that match, so we have to
  // clear everything but the MSBs next.
  auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec);
  uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchRange(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  result = X86SIMDMatch(tag | PresentMask);
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
  static_cast<void>(tag);
#endif
  return result;
}

inline auto MetadataGroup::SIMDMatchPresent() const -> MatchRange {
  MatchRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Just extract the metadata directly.
  uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchRange(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // We arranged the byte vector so that present bytes have the high bit set,
  // which this instruction extracts.
  result = MatchRange(_mm_movemask_epi8(metadata_vec));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SIMDMatchEmpty() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Compare all bytes with zero, as that is the empty byte value. The result
  // will have all bits set for any input zero byte, so we zero all but the
  // high bits below.
  auto cmp_vec = vceqz_u8(metadata_vec);
  uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchIndex(metadata_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we
  // don't have a more efficient way to compute this on x86 and so we reuse
  // the general match infrastructure that computes all matches in a
  // bit-encoding. We then convert it into a `MatchIndex` that just finds the
  // first one.
  result = static_cast<MatchIndex>(X86SIMDMatch(Empty));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SIMDMatchDeleted() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast the `Deleted` byte across a vector and compare the bytes of
  // that with the metadata vector. The result will have all bits set for any
  // matching byte, so we zero all but the high bits below.
  auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted));
  uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchIndex(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we
  // don't have a more efficient way to compute this on x86 and so we reuse
  // the general match infrastructure that computes all matches in a
  // bit-encoding. We then convert it into a `MatchIndex` that just finds the
  // first one.
  result = static_cast<MatchIndex>(X86SIMDMatch(Deleted));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SIMDCompareEqual(MetadataGroup lhs,
                                            MetadataGroup rhs) -> bool {
#if CARBON_NEON_SIMD_SUPPORT
  return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] ==
         static_cast<uint64_t>(-1LL);
#elif CARBON_X86_SIMD_SUPPORT
  // Different x86 SIMD extensions provide different comparison functionality.
#if __SSE4_2__
  // With SSE 4.2, we can directly test and branch in the SIMD domain on
  // whether the two metadata vectors are equal.
  return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec),
                         _mm_set1_epi8(0xff)) == 1;
#else
  // With older versions of SSE we have to extract the result of the
  // comparison, much like we do when matching. That will have the usual
  // bitmask representing equal bytes, and we test for that exact bitmask in
  // scalar code.
  return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec,
                                          rhs.metadata_vec)) == 0x0000'ffffU;
#endif
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
  static_cast<void>(lhs);
  static_cast<void>(rhs);
  return false;
#endif
}

#if CARBON_X86_SIMD_SUPPORT
inline auto MetadataGroup::X86SIMDMatch(uint8_t match_byte) const
    -> MatchRange {
  // Broadcast the byte we're matching against to all bytes in a vector, and
  // compare those bytes with the metadata vector bytes.
  auto match_byte_vec = _mm_set1_epi8(match_byte);
  auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec);
  // Extract the result of each byte-wise comparison into the low bits of an
  // integer.
  uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec);
  return MatchRange(match_bits);
}
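
// Note (added explanation, not from the original source): `_mm_movemask_epi8`
// gathers the high bit of each of the 16 comparison-result bytes into the low
// 16 bits of an integer, which is the bit-encoded form `MatchRange` uses on
// x86. The upper 16 bits are always zero, which is what the `ZeroMask`
// parameter of `MatchIndex` records for the optimizer.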
#endif

}  // namespace Carbon::RawHashtable

#endif  // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_