raw_hashtable_metadata_group.h 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
  5. #define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
  6. #include <cstddef>
  7. #include <cstring>
  8. #include <iterator>
  9. #include <type_traits>
  10. #include "common/check.h"
  11. #include "common/ostream.h"
  12. #include "llvm/ADT/Sequence.h"
  13. #include "llvm/ADT/bit.h"
  14. #include "llvm/Support/FormatVariadic.h"
  15. #include "llvm/Support/MathExtras.h"
  16. // Detect whether we can use SIMD accelerated implementations of the control
  17. // groups, and include the relevant platform specific APIs for the SIMD
  18. // implementations.
  19. //
  20. // Reference documentation for the SIMD APIs used here:
  21. // - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
  22. // - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
  23. #if defined(__SSSE3__)
  24. #include <x86intrin.h>
  25. #define CARBON_X86_SIMD_SUPPORT 1
  26. #elif defined(__ARM_NEON)
  27. #include <arm_neon.h>
  28. #define CARBON_NEON_SIMD_SUPPORT 1
  29. #endif
  30. // This namespace collects low-level utilities for implementing hashtable
  31. // data structures. This file only provides one of them:
  32. //
  33. // - Primitives to manage "groups" of hashtable entries that have densely packed
  34. // control bytes we can scan rapidly as a group, often using SIMD facilities
  35. // to process the entire group at once.
  36. namespace Carbon::RawHashtable {
  37. // We define a constant max group size. The particular group size used in
  38. // practice may vary, but we want to have some upper bound used to ensure
  39. // memory allocation is done consistently across different architectures.
  40. constexpr ssize_t MaxGroupSize = 16;
  41. // This takes a collection of bits representing the results of looking for a
  42. // particular tag in this metadata group and determines the first position with
  43. // a match. The position is represented by either the least significant set bit
  44. // or the least significant non-zero byte, depending on `ByteEncoding`. When
  45. // represented with a non-zero byte, that byte must have at least its most
  46. // significant bit set, but may have other bits set to any value. Bits more
  47. // significant than the match may have any value provided there is at least one
  48. // match. Zero matches must be represented by a zero input.
  49. //
  50. // Some bits of the underlying value may be known-zero, which can optimize
  51. // various operations. These can be represented as a `ZeroMask`.
  52. template <typename BitsInputT, bool ByteEncodingInput, BitsInputT ZeroMask = 0>
  53. class BitIndex
  54. : public Printable<BitIndex<BitsInputT, ByteEncodingInput, ZeroMask>> {
  55. public:
  56. using BitsT = BitsInputT;
  57. static constexpr bool ByteEncoding = ByteEncodingInput;
  58. BitIndex() = default;
  59. explicit BitIndex(BitsT bits) : bits_(bits) {}
  60. friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool {
  61. if (lhs.empty() || rhs.empty()) {
  62. return lhs.empty() == rhs.empty();
  63. }
  64. // For non-empty bit indices, compare the indices directly to ignore other
  65. // (extraneous) parts of the incoming bits.
  66. return lhs.index() == rhs.index();
  67. }
  68. auto Print(llvm::raw_ostream& out) const -> void {
  69. out << llvm::formatv("{0:x}", bits_);
  70. }
  71. explicit operator bool() const { return !empty(); }
  72. // Returns true when there are no matches for the tag.
  73. auto empty() const -> bool {
  74. CARBON_DCHECK((bits_ & ZeroMask) == 0, "Unexpected non-zero bits!");
  75. __builtin_assume((bits_ & ZeroMask) == 0);
  76. return bits_ == 0;
  77. }
  78. // Returns the index of the first matched tag.
  79. auto index() -> ssize_t {
  80. CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
  81. __builtin_assume(bits_ != 0);
  82. ssize_t index = unscaled_index();
  83. if constexpr (ByteEncoding) {
  84. // Shift to scale out of the byte encoding.
  85. index >>= ByteEncodingShift;
  86. }
  87. return index;
  88. }
  89. // Optimized tool to index a pointer `p` by `index()`.
  90. template <typename T>
  91. auto index_ptr(T* pointer) -> T* {
  92. CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
  93. __builtin_assume(bits_ != 0);
  94. if constexpr (!ByteEncoding) {
  95. return &pointer[unscaled_index()];
  96. }
  97. ssize_t index = unscaled_index();
  98. // Scale the index as we counted zero *bits* and not zero *bytes*.
  99. // However, we can fold that scale with the size of `T` when it is a power
  100. // of two or divisible by 8.
  101. CARBON_DCHECK(
  102. (index & ((static_cast<size_t>(1) << ByteEncodingShift) - 1)) == 0);
  103. if constexpr (sizeof(T) % 8 == 0) {
  104. constexpr size_t FoldedScale = sizeof(T) / 8;
  105. index *= FoldedScale;
  106. return reinterpret_cast<T*>(
  107. &reinterpret_cast<std::byte*>(pointer)[index]);
  108. } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) {
  109. constexpr size_t ScaleShift = llvm::CTLog2<sizeof(T)>();
  110. static_assert(ScaleShift <= ByteEncodingShift,
  111. "Scaling by >=8 should be handled above!");
  112. constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift;
  113. index >>= FoldedShift;
  114. return reinterpret_cast<T*>(
  115. &reinterpret_cast<std::byte*>(pointer)[index]);
  116. }
  117. // Nothing we can fold here.
  118. return &pointer[index >> ByteEncodingShift];
  119. }
  120. private:
  121. // When using a byte encoding, we'll need to shift any index by this amount.
  122. static constexpr size_t ByteEncodingShift = 3;
  123. auto unscaled_index() -> ssize_t {
  124. if constexpr (!ByteEncoding) {
  125. // Note the cast to `size_t` to force zero extending the result.
  126. return static_cast<size_t>(llvm::countr_zero(bits_));
  127. } else {
  128. // The index is encoded in the high bit of each byte. We compute the index
  129. // by counting the number of low zero bytes there are before the first
  130. // byte with its high bit set. Rather that shifting the high bit to be the
  131. // low bit and counting the trailing (least significant) zero bits
  132. // directly, we instead byte-reverse the bits and count the *leading*
  133. // (most significant) zero bits. While this may be a wash on CPUs with
  134. // direct support for counting the trailing zero bits, AArch64 only
  135. // supports counting the leading zero bits and requires a bit-reverse to
  136. // count the trailing zero bits. Doing the byte-reverse approach
  137. // essentially combines moving the high bit into the low bit and the
  138. // reverse necessary for counting the zero bits. While this only removes
  139. // one instruction, it is an instruction in the critical path of the
  140. // hottest part of table lookup, and that critical path dependency height
  141. // is few enough instructions that removing even one significantly impacts
  142. // latency.
  143. //
  144. // We also cast to `size_t` to clearly zero-extend the result.
  145. return static_cast<size_t>(llvm::countl_zero(llvm::byteswap(bits_)));
  146. }
  147. }
  148. BitsT bits_ = 0;
  149. };
  150. // This is like `BitIndex`, but allows iterating through all of the matches.
  151. //
  152. // A key requirement for efficient iteration is that all of the matches are
  153. // represented with a single bit and there are no other bits set. For example,
  154. // with byte-encoded bit indices, exactly the high bit and no other bit of each
  155. // matching byte must be set. This is a stricter constraint than what `BitIndex`
  156. // alone would impose on any one of the matches.
  157. template <typename BitIndexT, BitIndexT::BitsT ByteEncodingMask = 0>
  158. class BitIndexRange
  159. : public Printable<BitIndexRange<BitIndexT, ByteEncodingMask>> {
  160. public:
  161. using BitsT = BitIndexT::BitsT;
  162. static_assert(BitIndexT::ByteEncoding || ByteEncodingMask == 0,
  163. "Non-byte encoding must not have a byte encoding mask.");
  164. class Iterator
  165. : public llvm::iterator_facade_base<Iterator, std::forward_iterator_tag,
  166. ssize_t, ssize_t> {
  167. public:
  168. Iterator() = default;
  169. explicit Iterator(BitsT bits) : bits_(bits) {}
  170. friend auto operator==(const Iterator& lhs, const Iterator& rhs) -> bool {
  171. return lhs.bits_ == rhs.bits_;
  172. }
  173. auto operator*() -> ssize_t& {
  174. CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
  175. __builtin_assume(bits_ != 0);
  176. index_ = BitIndexT(bits_).index();
  177. // Note that we store the index in a member so we can return a reference
  178. // to it here as required to be a forward iterator.
  179. return index_;
  180. }
  181. template <typename T>
  182. auto index_ptr(T* pointer) -> T* {
  183. return BitIndexT(bits_).index_ptr(pointer);
  184. }
  185. auto operator++() -> Iterator& {
  186. CARBON_DCHECK(bits_ != 0, "Must not increment past the end!");
  187. __builtin_assume(bits_ != 0);
  188. if constexpr (ByteEncodingMask != 0) {
  189. // Apply an increment mask to the bits first. This is used with the byte
  190. // encoding when the mask isn't needed until we begin incrementing.
  191. bits_ &= ByteEncodingMask;
  192. }
  193. // Clears the least significant set bit, effectively stepping to the next
  194. // match.
  195. bits_ &= (bits_ - 1);
  196. return *this;
  197. }
  198. private:
  199. ssize_t index_;
  200. BitsT bits_ = 0;
  201. };
  202. BitIndexRange() = default;
  203. explicit BitIndexRange(BitsT bits) : bits_(bits) {}
  204. explicit operator bool() const { return !empty(); }
  205. auto empty() const -> bool { return BitIndexT(bits_).empty(); }
  206. auto begin() const -> Iterator { return Iterator(bits_); }
  207. auto end() const -> Iterator { return Iterator(); }
  208. friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool {
  209. if constexpr (ByteEncodingMask == 0) {
  210. // If there is no encoding mask, we can just compare the bits directly.
  211. return lhs.bits_ == rhs.bits_;
  212. } else {
  213. // Otherwise, compare the initial bit indices and the masked bits.
  214. return BitIndexT(lhs.bits_) == BitIndexT(rhs.bits_) &&
  215. (lhs.bits_ & ByteEncodingMask) == (rhs.bits_ & ByteEncodingMask);
  216. }
  217. }
  218. // Define heterogeneous equality between a masked (the current type) and
  219. // unmasked range. Requires a non-zero mask to avoid a redundant definition
  220. // with the homogeneous equality.
  221. friend auto operator==(BitIndexRange lhs, BitIndexRange<BitIndexT, 0> rhs)
  222. -> bool
  223. requires(ByteEncodingMask != 0)
  224. {
  225. // For mixed masked / unmasked comparison, we make sure the initial indices
  226. // are the same and that the masked side (LHS) is the same after masking as
  227. // the unmasked side (RHS).
  228. return BitIndexT(lhs.bits_) == BitIndexT(rhs.bits_) &&
  229. (lhs.bits_ & ByteEncodingMask) == rhs.bits_;
  230. }
  231. auto Print(llvm::raw_ostream& out) const -> void {
  232. out << llvm::formatv("{0:x}", bits_);
  233. }
  234. explicit operator BitsT() const { return bits_; }
  235. explicit operator BitIndexT() const { return BitIndexT(bits_); }
  236. private:
  237. template <typename FriendBitIndexT,
  238. FriendBitIndexT::BitsT FriendByteEncodingMask>
  239. friend class BitIndexRange;
  240. BitsT bits_ = 0;
  241. };
  242. // A group of metadata bytes that can be manipulated together.
  243. //
// The metadata bytes used by Carbon's hashtable implementation are designed to
// support being manipulated as groups, either using architecture specific SIMD
// code sequences or using portable SIMD-in-an-integer-register code sequences.
// These operations are unusually performance sensitive, sometimes in
// surprising ways. The implementations here are crafted specifically to
  249. // optimize the particular usages in Carbon's hashtable and should not be
  250. // expected to be reusable in any other context.
  251. //
  252. // Throughout the functions operating on this type we try to use patterns with a
  253. // fallback portable implementation which can be directly used in the absence of
  254. // a SIMD implementation, but is also used (with the same code) to check that
  255. // any SIMD implementation produces the same result as the portable one. These
  256. // patterns help minimize un-compiled or un-tested paths through either portable
  257. // or SIMD code, regardless of which path is actually *used* on a particular
  258. // platform. To illustrate a common version of this pattern, we might have code
  259. // like:
  260. //
  261. // ```cpp
  262. // auto MetadataGroup::Operation(...) -> ... {
  263. // ... portable_result;
  264. // ... simd_result;
  265. // if constexpr (!UseSIMD || DebugSIMD) {
  266. // portable_result = PortableOperation(...);
  267. // }
  268. // if (UseSIMD || DebugSIMD) {
  269. // simd_result = SIMDOperation(...)
  270. // CARBON_DCHECK(result == portable_result, "{0}", ...);
  271. // }
  272. // return UseSIMD ? simd_result : portable_result;
  273. // }
  274. // ```
class MetadataGroup : public Printable<MetadataGroup> {
 public:
  // The number of metadata bytes in a group: the x86 SIMD vector width when
  // available, otherwise the bytes of a single 64-bit integer.
  static constexpr ssize_t Size =
#if CARBON_X86_SIMD_SUPPORT
      16;
#else
      8;
#endif
  static_assert(Size >= 8);
  static_assert(Size % 8 == 0);
  static_assert(Size <= MaxGroupSize);
  static_assert(MaxGroupSize % Size == 0);
  static_assert(llvm::isPowerOf2_64(Size),
                "The group size must be a constant power of two so dividing by "
                "it is a simple shift.");

  // Mask extracting the within-group offset from an arbitrary byte index.
  static constexpr ssize_t Mask = Size - 1;

  // Each control byte can have special values. All special values have the
  // most significant bit cleared to distinguish them from the seven hash bits
  // stored when the control byte represents a full bucket.
  //
  // Otherwise, their values are chosen primarily to provide efficient SIMD
  // implementations of the common operations on an entire control group.
  static constexpr uint8_t Empty = 0;
  static constexpr uint8_t Deleted = 1;
  static constexpr uint8_t PresentMask = 0b1000'0000;

  // Whether to use a SIMD implementation. Even when we *support* a SIMD
  // implementation, we do not always have to use it in the event that it is
  // less efficient than the portable version.
  static constexpr bool UseSIMD =
#if CARBON_X86_SIMD_SUPPORT
      true;
#else
      false;
#endif

  // Some architectures make it much more efficient to build the match indices
  // in a byte-encoded form rather than a bit-encoded form. This encoding
  // changes verification and other aspects of our algorithms.
  static constexpr bool ByteEncoding =
#if CARBON_X86_SIMD_SUPPORT
      false;
#else
      true;
#endif
  static_assert(!ByteEncoding || Size == 8,
                "We can only support byte encoding with a group size of 8.");

  // We need to indicate to users of the metadata group when they can hold a
  // group value in a "register" (local variable) across clearing of individual
  // bytes in the group efficiently. If the entire group can fit in an integer
  // register, this works well and clients of the group should work to use the
  // already-loaded value when clearing bytes. But when we have a larger group
  // size, clearing the byte will typically require storing a byte to memory and
  // re-loading the group. The usage patterns that need to clear bytes can in
  // those cases avoid clearing a loaded group, and clear the byte directly in
  // the larger metadata array.
  static constexpr bool FastByteClear = Size == 8;

  // Most and least significant bits set, replicated across all 8 bytes of a
  // 64-bit integer.
  static constexpr uint64_t MSBs = 0x8080'8080'8080'8080ULL;
  static constexpr uint64_t LSBs = 0x0101'0101'0101'0101ULL;

  // The match-index type: 64-bit with byte encoding, 32-bit with bit encoding.
  // With bit encoding, the bits above `Size` are statically known to be zero.
  using MatchIndex =
      BitIndex<std::conditional_t<ByteEncoding, uint64_t, uint32_t>,
               ByteEncoding,
               /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>;

  // Only one kind of portable matched range is needed.
  using PortableMatchRange = BitIndexRange<MatchIndex>;

  // We use specialized match range types for SIMD implementations to allow
  // deferring the masking operation where useful. When that optimization
  // doesn't apply, these will be the same type.
  using SIMDMatchRange =
      BitIndexRange<MatchIndex, /*ByteEncodingMask=*/ByteEncoding ? MSBs : 0>;
  using SIMDMatchPresentRange = BitIndexRange<MatchIndex>;

  // The public API range types can be either the portable or SIMD variations,
  // selected here.
  using MatchRange =
      std::conditional_t<UseSIMD, SIMDMatchRange, PortableMatchRange>;
  using MatchPresentRange =
      std::conditional_t<UseSIMD, SIMDMatchPresentRange, PortableMatchRange>;

  // Anonymous union giving byte-wise, integer-wise, and (when available)
  // vector-wise views of the same group storage.
  union {
    uint8_t metadata_bytes[Size];
    uint64_t metadata_ints[Size / 8];
#if CARBON_NEON_SIMD_SUPPORT
    uint8x8_t metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#elif CARBON_X86_SIMD_SUPPORT
    __m128i metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#endif
  };

  auto Print(llvm::raw_ostream& out) const -> void;

  // Equality is delegated to `CompareEqual`, which cross-checks the portable
  // and SIMD implementations in debug builds.
  friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool {
    return CompareEqual(lhs, rhs);
  }

  // The main API for this class. This API will switch between a portable and
  // SIMD implementation based on what is most efficient, but in debug builds
  // will cross check that the implementations do not diverge.

  // Load and return a group of metadata bytes out of the main metadata array at
  // a particular `index`. The index must be a multiple of `GroupSize`. This
  // will arrange for the load to place the group into the correct structure for
  // efficient register-based processing.
  static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup;

  // Store this metadata group into the main metadata array at the provided
  // `index`. The index must be a multiple of `GroupSize`.
  auto Store(uint8_t* metadata, ssize_t index) const -> void;

  // Clear a byte of this group's metadata at the provided `byte_index` to the
  // empty value.
  //
  // Note that this must only be called when `FastByteClear` is true -- in all
  // other cases users of this class should arrange to clear individual bytes in
  // the underlying array rather than using the group API. This is checked by a
  // static_assert, and the function is templated so that it is not instantiated
  // in the cases where it would not be valid.
  template <bool IsCalled = true>
  auto ClearByte(ssize_t byte_index) -> void;

  // Clear all of this group's metadata bytes that indicate a deleted slot to
  // the empty value.
  auto ClearDeleted() -> void;

  // Find all of the bytes of metadata in this group that are present and whose
  // low 7 bits match the provided `tag`. The `tag` byte must have a clear high
  // bit, only 7 bits of tag are used. Note that this means the provided tag is
  // *not* the actual present metadata byte -- this function is responsible for
  // mapping the tag into that form as it can do so more efficiently in some
  // cases. A range over all of the byte indices which matched is returned.
  auto Match(uint8_t tag) const -> MatchRange;

  // Find all of the present bytes of metadata in this group. A range over all
  // of the byte indices which are present is returned.
  auto MatchPresent() const -> MatchPresentRange;

  // Find the first byte of the metadata group that is empty and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first", any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no bytes
  // match the empty metadata.
  auto MatchEmpty() const -> MatchIndex;

  // Find the first byte of the metadata group that is deleted and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first", any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no bytes
  // match the deleted metadata.
  auto MatchDeleted() const -> MatchIndex;

 private:
  // Two classes only defined in the benchmark code are allowed to directly call
  // the portable and SIMD implementations for benchmarking purposes.
  friend class BenchmarkPortableMetadataGroup;
  friend class BenchmarkSIMDMetadataGroup;

  // All SIMD variants that we have an implementation for should be enabled for
  // debugging. This lets us maintain a SIMD implementation even if it is not
  // used due to performance reasons, and easily re-enable it if the performance
  // changes.
  static constexpr bool DebugSIMD =
#if !defined(NDEBUG) && (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT)
      true;
#else
      false;
#endif

  // The underlying integer type carried by a `MatchIndex`.
  using MatchBitsT = MatchIndex::BitsT;

  // A helper function to allow deducing the return type from the selected arm
  // of a `constexpr` ternary.
  template <bool Condition, typename LeftT, typename RightT>
  static auto ConstexprTernary(LeftT lhs, RightT rhs) {
    if constexpr (Condition) {
      return lhs;
    } else {
      return rhs;
    }
  }

  static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

  // Functions for validating the returned matches agree with what is predicted
  // by the `byte_match` function. These either `CHECK`-fail or return true. To
  // pass validation, the `*_bits` argument must have `0x80` for those bytes
  // where `byte_match` returns true, and `0` for the rest.

  // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only
  // promise to return accurate information up to the first match.
  auto VerifyIndexBits(
      MatchBitsT index_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // `VerifyPortableRangeBits` is for functions that return `MatchRange`, and so
  // it validates all the bytes of `range_bits`.
  auto VerifyPortableRangeBits(
      MatchBitsT range_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // Portable implementations of each operation. These are used on platforms
  // without SIMD support or where the portable implementation is faster than
  // SIMD. They are heavily optimized even though they are not SIMD because we
  // expect there to be platforms where the portable implementation can
  // outperform SIMD. Their behavior and semantics exactly match the
  // documentation for the un-prefixed functions.
  //
  // In debug builds, these also directly verify their results to help establish
  // baseline functionality.
  static auto PortableLoad(const uint8_t* metadata, ssize_t index)
      -> MetadataGroup;
  auto PortableStore(uint8_t* metadata, ssize_t index) const -> void;

  auto PortableClearDeleted() -> void;

  auto PortableMatch(uint8_t tag) const -> PortableMatchRange;
  auto PortableMatchPresent() const -> PortableMatchRange;

  auto PortableMatchEmpty() const -> MatchIndex;
  auto PortableMatchDeleted() const -> MatchIndex;

  static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs)
      -> bool;

  // SIMD implementations of each operation. We minimize platform-specific APIs
  // to reduce the scope of errors that can only be discovered building on one
  // platform, so the bodies of these contain the platform specific code. Their
  // behavior and semantics exactly match the documentation for the un-prefixed
  // functions.
  //
  // These routines don't directly verify their results as we can build simpler
  // debug checks by comparing them against the verified portable results.
  static auto SIMDLoad(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
  auto SIMDStore(uint8_t* metadata, ssize_t index) const -> void;

  auto SIMDClearDeleted() -> void;

  auto SIMDMatch(uint8_t tag) const -> SIMDMatchRange;
  auto SIMDMatchPresent() const -> SIMDMatchPresentRange;

  auto SIMDMatchEmpty() const -> MatchIndex;
  auto SIMDMatchDeleted() const -> MatchIndex;

  static auto SIMDCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

#if CARBON_X86_SIMD_SUPPORT
  // A common routine for x86 SIMD matching that can be used for matching
  // present, empty, and deleted bytes with equal efficiency.
  auto X86SIMDMatch(uint8_t match_byte) const -> SIMDMatchRange;
#endif
};
// Promote the size and mask to top-level constants as we'll need to operate on
// the grouped structure outside of the metadata bytes.
inline constexpr ssize_t GroupSize = MetadataGroup::Size;  // Bytes per group.
inline constexpr ssize_t GroupMask = MetadataGroup::Mask;  // `GroupSize - 1`.
  498. inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index)
  499. -> MetadataGroup {
  500. MetadataGroup portable_g;
  501. if constexpr (!UseSIMD || DebugSIMD) {
  502. portable_g = PortableLoad(metadata, index);
  503. if constexpr (!UseSIMD) {
  504. return portable_g;
  505. }
  506. }
  507. MetadataGroup g = SIMDLoad(metadata, index);
  508. CARBON_DCHECK(g == portable_g);
  509. return g;
  510. }
  511. inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const
  512. -> void {
  513. if constexpr (!UseSIMD) {
  514. std::memcpy(metadata + index, &metadata_bytes, Size);
  515. } else {
  516. SIMDStore(metadata, index);
  517. }
  518. CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size));
  519. }
  520. template <bool IsCalled>
  521. inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void {
  522. static_assert(!IsCalled || FastByteClear,
  523. "Only use byte clearing when fast!");
  524. static_assert(!IsCalled || Size == 8,
  525. "The clear implementation assumes an 8-byte group.");
  526. metadata_ints[0] &= ~(static_cast<uint64_t>(0xff) << (byte_index * 8));
  527. }
  528. inline auto MetadataGroup::ClearDeleted() -> void {
  529. MetadataGroup portable_g = *this;
  530. MetadataGroup simd_g = *this;
  531. if constexpr (!UseSIMD || DebugSIMD) {
  532. portable_g.PortableClearDeleted();
  533. }
  534. if constexpr (UseSIMD || DebugSIMD) {
  535. simd_g.SIMDClearDeleted();
  536. CARBON_DCHECK(
  537. simd_g == portable_g,
  538. "SIMD cleared group '{0}' doesn't match portable cleared group '{1}'",
  539. simd_g, portable_g);
  540. }
  541. *this = UseSIMD ? simd_g : portable_g;
  542. }
  543. inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange {
  544. // The caller should provide us with the present byte hash, and not set any
  545. // present bit tag on it so that this layer can manage tagging the high bit of
  546. // a present byte.
  547. CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);
  548. PortableMatchRange portable_result;
  549. SIMDMatchRange simd_result;
  550. if constexpr (!UseSIMD || DebugSIMD) {
  551. portable_result = PortableMatch(tag);
  552. }
  553. if constexpr (UseSIMD || DebugSIMD) {
  554. simd_result = SIMDMatch(tag);
  555. CARBON_DCHECK(simd_result == portable_result,
  556. "SIMD result '{0}' doesn't match portable result '{1}'",
  557. simd_result, portable_result);
  558. }
  559. // Return whichever result we're using.
  560. return ConstexprTernary<UseSIMD>(simd_result, portable_result);
  561. }
  562. inline auto MetadataGroup::MatchPresent() const -> MatchPresentRange {
  563. PortableMatchRange portable_result;
  564. SIMDMatchPresentRange simd_result;
  565. if constexpr (!UseSIMD || DebugSIMD) {
  566. portable_result = PortableMatchPresent();
  567. }
  568. if constexpr (UseSIMD || DebugSIMD) {
  569. simd_result = SIMDMatchPresent();
  570. CARBON_DCHECK(simd_result == portable_result,
  571. "SIMD result '{0}' doesn't match portable result '{1}'",
  572. simd_result, portable_result);
  573. }
  574. // Return whichever result we're using.
  575. return ConstexprTernary<UseSIMD>(simd_result, portable_result);
  576. }
  577. inline auto MetadataGroup::MatchEmpty() const -> MatchIndex {
  578. MatchIndex portable_result;
  579. MatchIndex simd_result;
  580. if constexpr (!UseSIMD || DebugSIMD) {
  581. portable_result = PortableMatchEmpty();
  582. }
  583. if constexpr (UseSIMD || DebugSIMD) {
  584. simd_result = SIMDMatchEmpty();
  585. CARBON_DCHECK(simd_result == portable_result,
  586. "SIMD result '{0}' doesn't match portable result '{1}'",
  587. simd_result, portable_result);
  588. }
  589. return UseSIMD ? simd_result : portable_result;
  590. }
  591. inline auto MetadataGroup::MatchDeleted() const -> MatchIndex {
  592. MatchIndex portable_result;
  593. MatchIndex simd_result;
  594. if constexpr (!UseSIMD || DebugSIMD) {
  595. portable_result = PortableMatchDeleted();
  596. }
  597. if constexpr (UseSIMD || DebugSIMD) {
  598. simd_result = SIMDMatchDeleted();
  599. CARBON_DCHECK(simd_result == portable_result,
  600. "SIMD result '{0}' doesn't match portable result '{1}'",
  601. simd_result, portable_result);
  602. }
  603. return UseSIMD ? simd_result : portable_result;
  604. }
  605. inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs)
  606. -> bool {
  607. bool portable_result;
  608. bool simd_result;
  609. if constexpr (!UseSIMD || DebugSIMD) {
  610. portable_result = PortableCompareEqual(lhs, rhs);
  611. }
  612. if constexpr (UseSIMD || DebugSIMD) {
  613. simd_result = SIMDCompareEqual(lhs, rhs);
  614. CARBON_DCHECK(simd_result == portable_result);
  615. }
  616. return UseSIMD ? simd_result : portable_result;
  617. }
// Debug helper: verifies that `index_bits` correctly encodes the *first*
// metadata byte satisfying `byte_match`. Scanning stops at the first matching
// byte, so bits past it are unconstrained. Always returns true so it can be
// used inside a `CARBON_DCHECK` and compiled away in release builds.
inline auto MetadataGroup::VerifyIndexBits(
    MatchBitsT index_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      // Bit-encoded: one bit per metadata byte.
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((index_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
        // Only the first match is needed, so stop scanning once found.
        break;
      }
      CARBON_CHECK(((index_bits >> byte_index) & 1) == 0,
                   "Bit set at non-matching byte index: {0}", byte_index);
    } else {
      // `index_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        // A matching byte must have its high bit set; the remaining bits are
        // not constrained for an index (only the first match matters).
        CARBON_CHECK(
            (index_byte & 0x80) == 0x80,
            "Should have the high bit set for a matching byte, found: {0:x}",
            index_byte);
        // Only the first match is needed so stop scanning once found.
        break;
      }
      CARBON_CHECK(
          index_byte == 0,
          "Should have no bits set for an unmatched byte, found: {0:x}",
          index_byte);
    }
  }
  return true;
}
// Debug helper: verifies that `range_bits` correctly encodes *every* metadata
// byte satisfying `byte_match` — unlike `VerifyIndexBits`, all bytes are
// checked, since a range enumerates all matches. Always returns true so it can
// be used inside a `CARBON_DCHECK` and compiled away in release builds.
inline auto MetadataGroup::VerifyPortableRangeBits(
    MatchBitsT range_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      // Bit-encoded: one bit per metadata byte.
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
      } else {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 0,
                     "Bit set at non-matching byte index: {0}", byte_index);
      }
    } else {
      // `range_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        // A matching byte must be exactly 0x80: high bit set, all others
        // clear, so range iteration can step byte by byte.
        CARBON_CHECK(range_byte == 0x80,
                     "Should just have the high bit set for a matching byte, "
                     "found: {0:x}",
                     range_byte);
      } else {
        CARBON_CHECK(
            range_byte == 0,
            "Should have no bits set for an unmatched byte, found: {0:x}",
            range_byte);
      }
    }
  }
  return true;
}
  682. inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index)
  683. -> MetadataGroup {
  684. MetadataGroup g;
  685. static_assert(sizeof(g) == Size);
  686. std::memcpy(&g.metadata_bytes, metadata + index, Size);
  687. return g;
  688. }
// Portable scalar implementation of `Store`: writes the group's bytes back to
// the metadata array at `index` with a plain byte copy.
inline auto MetadataGroup::PortableStore(uint8_t* metadata, ssize_t index) const
    -> void {
  std::memcpy(metadata + index, &metadata_bytes, Size);
}
// Portable scalar implementation of `ClearDeleted`: rewrites every `Deleted`
// byte to `Empty` while leaving present and empty bytes untouched, one 64-bit
// word at a time.
inline auto MetadataGroup::PortableClearDeleted() -> void {
  for (uint64_t& metadata_int : metadata_ints) {
    // Deleted bytes have only the least significant bits set, so to clear them
    // we only need to clear the least significant bit. And empty bytes already
    // have a clear least significant bit, so the only least significant bits we
    // need to preserve are those of present bytes. The most significant bit of
    // every present byte is set, so we take the most significant bit of each
    // byte, shift it into the least significant bit position, and bit-or it
    // with the complement of `LSBs`. This will have ones for every bit but the
    // least significant bits, and ones for the least significant bits of every
    // present byte.
    metadata_int &= (~LSBs | metadata_int >> 7);
  }
}
// Portable scalar implementation of `Match`: finds all present bytes whose low
// 7 bits equal `tag`, using branchless 64-bit bit manipulation for groups of
// size 8 and a simple byte loop otherwise.
inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange {
  // The caller should provide us with the present byte hash, and not set any
  // present bit tag on it so that this layer can manage tagging the high bit of
  // a present byte.
  CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    uint8_t present_byte = tag | PresentMask;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == present_byte) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }
  // This algorithm only works for matching *present* bytes. We leverage the
  // set high bit in the present case as part of the algorithm. The whole
  // algorithm has a critical path height of 4 operations, and does 6
  // operations total on AArch64. The operation dependency graph is:
  //
  //   group | MSBs        LSBs * match_byte + MSBs
  //       \                 /
  //        match_bits ^ broadcast
  //               |
  //   group & MSBs     MSBs - match_bits
  //       \                 /
  //        group_MSBs & match_bits
  //
  // This diagram and the operation count are specific to AArch64 where we have
  // a fused *integer* multiply-add operation.
  //
  // While it is superficially similar to the "find zero bytes in a word" bit
  // math trick, it is different because this is designed to have no false
  // positives and perfectly produce 0x80 for matching bytes and 0x00 for
  // non-matching bytes. This is do-able because we constrain to only handle
  // present matches which only require testing 7 bits and have a particular
  // layout.
  // Set the high bit of every byte to `1`. Any matching byte is a present byte
  // and so always has this bit set as well, which means the xor below, in
  // addition to zeroing the low 7 bits of any byte that matches the tag, also
  // clears the high bit of every byte.
  uint64_t match_bits = metadata_ints[0] | MSBs;
  // Broadcast the match byte to all bytes, and mask in the present bits in the
  // MSBs of each byte. We structure this as a multiply and an add because we
  // know that the add cannot carry, and this way it can be lowered using
  // combined multiply-add instructions if available.
  uint64_t broadcast = LSBs * tag + MSBs;
  CARBON_DCHECK(broadcast == (LSBs * tag | MSBs),
                "Unexpected carry from addition!");
  // Xor the broadcast byte pattern. This makes bytes with matches become 0, and
  // clears the high-bits of non-matches. Note that if we are looking for a tag
  // with the same value as `Empty` or `Deleted`, those bytes will be zero as
  // well.
  match_bits = match_bits ^ broadcast;
  // Subtract each byte of `match_bits` from `0x80` bytes. After this, the high
  // bit will be set only for those bytes that were zero.
  match_bits = MSBs - match_bits;
  // Zero everything but the high bits, and also zero the high bits of any bytes
  // for "not present" slots in the original group. This avoids false positives
  // for `Empty` and `Deleted` bytes in the metadata.
  match_bits &= (metadata_ints[0] & MSBs);
  // At this point, `match_bits` has the high bit set for bytes where the
  // original group byte equals `tag` plus the high bit.
  CARBON_DCHECK(VerifyPortableRangeBits(
      match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); }));
  return MatchRange(match_bits);
}
// Portable scalar implementation of `MatchPresent`: finds all present bytes by
// inspecting the high (present) bit of each metadata byte.
inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] & PresentMask) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }
  // Want to keep the high bit of each byte, which indicates whether that byte
  // represents a present slot.
  uint64_t match_bits = metadata_ints[0] & MSBs;
  CARBON_DCHECK(VerifyPortableRangeBits(
      match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; }));
  return MatchRange(match_bits);
}
// Portable scalar implementation of `MatchEmpty`: finds the first `Empty`
// (zero) metadata byte, if any.
inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Empty) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }
  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 0. We take advantage of the fact that
  // the metadata bytes are non-zero only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - deleted: in which case the byte will be 1, and shifting it left by 7 will
  //   cause the high bit to be set.
  uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & MSBs;
  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Empty`.
  CARBON_DCHECK(
      VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; }));
  return MatchIndex(match_bits);
}
// Portable scalar implementation of `MatchDeleted`: finds the first `Deleted`
// (one) metadata byte, if any.
inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Deleted) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }
  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 1. We take advantage of the fact that the
  // metadata bytes are not 1 only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - empty: in which case the byte will be 0, and in that case inverting and
  //   shifting left by 7 will have the high bit set.
  uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & MSBs;
  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Deleted`.
  CARBON_DCHECK(VerifyIndexBits(match_bits,
                                [](uint8_t byte) { return byte == Deleted; }));
  return MatchIndex(match_bits);
}
  867. inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs,
  868. MetadataGroup rhs) -> bool {
  869. return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes);
  870. }
// SIMD implementation of `Load`: reads the group's metadata bytes at `index`
// directly into the vector-register representation.
inline auto MetadataGroup::SIMDLoad(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup g;
#if CARBON_NEON_SIMD_SUPPORT
  // 8-byte Neon load of the group.
  g.metadata_vec = vld1_u8(metadata + index);
#elif CARBON_X86_SIMD_SUPPORT
  // `_mm_load_si128` is the *aligned* 16-byte load, so `metadata + index` must
  // be 16-byte aligned — presumably guaranteed by the table's allocation and
  // group-aligned indexing; confirm against the callers.
  g.metadata_vec =
      _mm_load_si128(reinterpret_cast<const __m128i*>(metadata + index));
#else
  // No SIMD implementation is available on this target; only reachable when
  // SIMD is disabled.
  static_assert(!UseSIMD, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
  return g;
}
// SIMD implementation of `Store`: writes the group's vector-register
// representation back to the metadata array at `index`.
inline auto MetadataGroup::SIMDStore(uint8_t* metadata, ssize_t index) const
    -> void {
#if CARBON_NEON_SIMD_SUPPORT
  // 8-byte Neon store of the group.
  vst1_u8(metadata + index, metadata_vec);
#elif CARBON_X86_SIMD_SUPPORT
  // Aligned 16-byte store, mirroring the aligned load in `SIMDLoad`.
  _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec);
#else
  // No SIMD implementation is available on this target; only reachable when
  // SIMD is disabled.
  static_assert(!UseSIMD, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
}
// SIMD implementation of `ClearDeleted`: rewrites every `Deleted` byte to
// `Empty` while leaving present bytes untouched.
inline auto MetadataGroup::SIMDClearDeleted() -> void {
#if CARBON_NEON_SIMD_SUPPORT
  // There is no good Neon operation to implement this, so do it using integer
  // code. This is reasonably fast, but unfortunate because it forces the group
  // out of a SIMD register and into a general purpose register, which can have
  // high latency.
  metadata_ints[0] &= (~LSBs | metadata_ints[0] >> 7);
#elif CARBON_X86_SIMD_SUPPORT
  // For each byte, use `metadata_vec` if the byte's high bit is set (indicating
  // it is present), otherwise (it is empty or deleted) replace it with zero
  // (representing empty). The vector doubles as the blend mask here since
  // `_mm_blendv_epi8` selects on each byte's high bit.
  metadata_vec =
      _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec);
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
}
// SIMD implementation of `Match`: finds all present bytes equal to
// `tag | PresentMask`.
inline auto MetadataGroup::SIMDMatch(uint8_t tag) const -> SIMDMatchRange {
  SIMDMatchRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast byte we want to match to every byte in the vector.
  auto match_byte_vec = vdup_n_u8(tag | PresentMask);
  // Result bytes have all bits set for the bytes that match, so we have to
  // clear everything but MSBs next.
  auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec);
  uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0];
  // Note that the range will lazily mask to the MSBs as part of incrementing.
  result = SIMDMatchRange(match_bits);
#elif CARBON_X86_SIMD_SUPPORT
  // Reuse the generic x86 byte-match helper with the present bit folded into
  // the tag.
  result = X86SIMDMatch(tag | PresentMask);
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
  static_cast<void>(tag);
#endif
  return result;
}
// SIMD implementation of `MatchPresent`: extracts the high (present) bit of
// each metadata byte.
inline auto MetadataGroup::SIMDMatchPresent() const -> SIMDMatchPresentRange {
  SIMDMatchPresentRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Just extract the metadata directly.
  uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0];
  // Even though the Neon SIMD range will do its own masking, we have to mask
  // here so that `empty` is correct.
  result = SIMDMatchPresentRange(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // We arranged the byte vector so that present bytes have the high bit set,
  // which this instruction extracts.
  result = SIMDMatchPresentRange(_mm_movemask_epi8(metadata_vec));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}
// SIMD implementation of `MatchEmpty`: finds the first `Empty` (zero) metadata
// byte, if any.
inline auto MetadataGroup::SIMDMatchEmpty() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Compare all bytes with zero, as that is the empty byte value. Result will
  // have all bits set for any input zero byte, so we zero all but the high bits
  // below.
  auto cmp_vec = vceqz_u8(metadata_vec);
  uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do that
  // mask in the scalar domain rather than the SIMD domain. So we do the mask
  // here rather than above prior to extracting the match bits.
  result = MatchIndex(metadata_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we don't
  // have a more efficient way to compute this on x86 and so we reuse the
  // general match infrastructure that computes all matches in a bit-encoding.
  // We then convert it into a `MatchIndex` that just finds the first one.
  result = static_cast<MatchIndex>(X86SIMDMatch(Empty));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}
// SIMD implementation of `MatchDeleted`: finds the first `Deleted` metadata
// byte, if any.
inline auto MetadataGroup::SIMDMatchDeleted() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast the `Deleted` byte across the vector and compare the bytes of
  // that with the metadata vector. The result will have all bits set for any
  // byte equal to `Deleted`, so we zero all but the high bits below.
  auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted));
  uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do that
  // mask in the scalar domain rather than the SIMD domain. So we do the mask
  // here rather than above prior to extracting the match bits.
  result = MatchIndex(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we don't
  // have a more efficient way to compute this on x86 and so we reuse the
  // general match infrastructure that computes all matches in a bit-encoding.
  // We then convert it into a `MatchIndex` that just finds the first one.
  result = static_cast<MatchIndex>(X86SIMDMatch(Deleted));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}
// SIMD implementation of `CompareEqual`: two groups compare equal exactly when
// every metadata byte matches.
inline auto MetadataGroup::SIMDCompareEqual(MetadataGroup lhs,
                                            MetadataGroup rhs) -> bool {
#if CARBON_NEON_SIMD_SUPPORT
  // Byte-wise compare yields all-ones lanes for equal bytes, so the groups are
  // equal exactly when the extracted 64-bit result is all ones.
  return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] ==
         static_cast<uint64_t>(-1LL);
#elif CARBON_X86_SIMD_SUPPORT
  // Different x86 SIMD extensions make different comparison operations
  // available.
#if __SSE4_2__
  // With SSE 4.2, we can directly test and branch in the SIMD domain on whether
  // the two metadata vectors are equal.
  // NOTE(review): `_mm_testc_si128` is an SSE4.1 intrinsic; gating on
  // `__SSE4_2__` is conservative — confirm whether SSE4.1-only builds should
  // also take this path.
  return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec),
                         _mm_set1_epi8(0xff)) == 1;
#else
  // With older versions of SSE we have to extract the result of the comparison,
  // much like we do when matching. That will have the usual bitmask
  // representing equal bytes, and test for that exact bitmask in scalar code.
  return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec,
                                          rhs.metadata_vec)) == 0x0000'ffffU;
#endif
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
  static_cast<void>(lhs);
  static_cast<void>(rhs);
  return false;
#endif
}
#if CARBON_X86_SIMD_SUPPORT
// Shared x86 helper: returns a bit-encoded `MatchRange` with one low bit per
// metadata byte equal to `match_byte`.
inline auto MetadataGroup::X86SIMDMatch(uint8_t match_byte) const
    -> MatchRange {
  // Broadcast the byte we're matching against to all bytes in a vector, and
  // compare those bytes with the metadata vector bytes.
  auto match_byte_vec = _mm_set1_epi8(match_byte);
  auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec);
  // Extract the result of each byte-wise comparison into the low bits of an
  // integer.
  uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec);
  return MatchRange(match_bits);
}
#endif
  1039. } // namespace Carbon::RawHashtable
  1040. #endif // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_