// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
#define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_

#include <cstddef>
#include <cstring>
#include <iterator>
#include <type_traits>

#include "common/check.h"
#include "common/ostream.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"

// Detect whether we can use SIMD accelerated implementations of the control
// groups, and include the relevant platform specific APIs for the SIMD
// implementations.
//
// Reference documentation for the SIMD APIs used here:
// - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
// - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
#if defined(__SSSE3__)
#include <x86intrin.h>
#define CARBON_X86_SIMD_SUPPORT 1
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#define CARBON_NEON_SIMD_SUPPORT 1
#endif

// This namespace collects low-level utilities for implementing hashtable
// data structures. This file only provides one of them:
//
// - Primitives to manage "groups" of hashtable entries that have densely
//   packed control bytes we can scan rapidly as a group, often using SIMD
//   facilities to process the entire group at once.
namespace Carbon::RawHashtable {

// We define a constant max group size. The particular group size used in
// practice may vary, but we want to have some upper bound used to ensure
// memory allocation is done consistently across different architectures.
inline constexpr ssize_t MaxGroupSize = 16;

// This takes a collection of bits representing the results of looking for a
// particular tag in this metadata group and determines the first position with
// a match. The position is represented by either the least significant set bit
// or the least significant non-zero byte, depending on `ByteEncoding`. When
// represented with a non-zero byte, that byte must have at least its most
// significant bit set, but may have other bits set to any value. Bits more
// significant than the match may have any value provided there is at least one
// match. Zero matches must be represented by a zero input.
//
// Some bits of the underlying value may be known-zero, which can optimize
// various operations. These can be represented as a `ZeroMask`.
template <typename BitsInputT, bool ByteEncodingInput, BitsInputT ZeroMask = 0>
class BitIndex
    : public Printable<BitIndex<BitsInputT, ByteEncodingInput, ZeroMask>> {
 public:
  using BitsT = BitsInputT;
  static constexpr bool ByteEncoding = ByteEncodingInput;

  BitIndex() = default;
  explicit BitIndex(BitsT bits) : bits_(bits) {}

  friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool {
    if (lhs.empty() || rhs.empty()) {
      return lhs.empty() == rhs.empty();
    }
    // For non-empty bit indices, compare the indices directly to ignore other
    // (extraneous) parts of the incoming bits.
    return lhs.index() == rhs.index();
  }

  auto Print(llvm::raw_ostream& out) const -> void {
    out << llvm::formatv("{0:x}", bits_);
  }

  explicit operator bool() const { return !empty(); }

  // Returns true when there are no matches for the tag.
  auto empty() const -> bool {
    CARBON_DCHECK((bits_ & ZeroMask) == 0, "Unexpected non-zero bits!");
    __builtin_assume((bits_ & ZeroMask) == 0);
    return bits_ == 0;
  }

  // Returns the index of the first matched tag.
  auto index() -> ssize_t {
    CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
    __builtin_assume(bits_ != 0);
    ssize_t index = unscaled_index();
    if constexpr (ByteEncoding) {
      // Shift to scale out of the byte encoding.
      index >>= ByteEncodingShift;
    }
    return index;
  }

  // Optimized tool to index a pointer `p` by `index()`.
  template <typename T>
  auto index_ptr(T* pointer) -> T* {
    CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
    __builtin_assume(bits_ != 0);
    if constexpr (!ByteEncoding) {
      return &pointer[unscaled_index()];
    }
    ssize_t index = unscaled_index();
    // Scale the index as we counted zero *bits* and not zero *bytes*.
    // However, we can fold that scale with the size of `T` when it is a power
    // of two or divisible by 8.
    CARBON_DCHECK(
        (index & ((static_cast<size_t>(1) << ByteEncodingShift) - 1)) == 0);
    if constexpr (sizeof(T) % 8 == 0) {
      constexpr size_t FoldedScale = sizeof(T) / 8;
      index *= FoldedScale;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) {
      constexpr size_t ScaleShift = llvm::ConstantLog2<sizeof(T)>();
      static_assert(ScaleShift <= ByteEncodingShift,
                    "Scaling by >=8 should be handled above!");
      constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift;
      index >>= FoldedShift;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    }
    // Nothing we can fold here.
    return &pointer[index >> ByteEncodingShift];
  }
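
  // Purely illustrative examples of the folding above (not exercised by this
  // file): with a byte-encoded match at byte index 2, `unscaled_index()` is
  // 16 (2 * 8). For `sizeof(T) == 24`, `FoldedScale` is 3 and the byte offset
  // becomes 16 * 3 == 48 == 2 * sizeof(T). For `sizeof(T) == 4`, `FoldedShift`
  // is 1 and the byte offset becomes 16 >> 1 == 8 == 2 * sizeof(T). Both fold
  // the `>> ByteEncodingShift` rescaling into the element-size scaling.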

 private:
  // When using a byte encoding, we'll need to shift any index by this amount.
  static constexpr size_t ByteEncodingShift = 3;

  auto unscaled_index() -> ssize_t {
    if constexpr (!ByteEncoding) {
      // Note the cast to `size_t` to force zero extending the result.
      return static_cast<size_t>(llvm::countr_zero(bits_));
    } else {
      // The index is encoded in the high bit of each byte. We compute the
      // index by counting the number of low zero bytes there are before the
      // first byte with its high bit set. Rather than shifting the high bit to
      // be the low bit and counting the trailing (least significant) zero bits
      // directly, we instead byte-reverse the bits and count the *leading*
      // (most significant) zero bits. While this may be a wash on CPUs with
      // direct support for counting the trailing zero bits, AArch64 only
      // supports counting the leading zero bits and requires a bit-reverse to
      // count the trailing zero bits. Doing the byte-reverse approach
      // essentially combines moving the high bit into the low bit and the
      // reverse necessary for counting the zero bits. While this only removes
      // one instruction, it is an instruction in the critical path of the
      // hottest part of table lookup, and that critical path dependency height
      // is few enough instructions that removing even one significantly
      // impacts latency.
      //
      // We also cast to `size_t` to clearly zero-extend the result.
      return static_cast<size_t>(llvm::countl_zero(llvm::byteswap(bits_)));
    }
  }

  BitsT bits_ = 0;
};
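
// A purely illustrative example of the two encodings (not part of the API):
// with byte encoding, a match at byte index 2 is represented as
// 0x0000'0000'0080'0000; `llvm::byteswap` turns this into
// 0x0000'8000'0000'0000, `llvm::countl_zero` returns 16, and shifting right by
// `ByteEncodingShift` yields 2. With bit encoding, the same match is just
// 0b100 and `llvm::countr_zero` returns 2 directly.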

// This is like `BitIndex`, but allows iterating through all of the matches.
//
// A key requirement for efficient iteration is that all of the matches are
// represented with a single bit and there are no other bits set. For example,
// with byte-encoded bit indices, exactly the high bit and no other bit of each
// matching byte must be set. This is a stricter constraint than what
// `BitIndex` alone would impose on any one of the matches.
template <typename BitIndexT, BitIndexT::BitsT ByteEncodingMask = 0>
class BitIndexRange
    : public Printable<BitIndexRange<BitIndexT, ByteEncodingMask>> {
 public:
  using BitsT = BitIndexT::BitsT;

  static_assert(BitIndexT::ByteEncoding || ByteEncodingMask == 0,
                "Non-byte encoding must not have a byte encoding mask.");

  class Iterator
      : public llvm::iterator_facade_base<Iterator, std::forward_iterator_tag,
                                          ssize_t, ssize_t> {
   public:
    Iterator() = default;
    explicit Iterator(BitsT bits) : bits_(bits) {}

    friend auto operator==(const Iterator& lhs, const Iterator& rhs) -> bool {
      return lhs.bits_ == rhs.bits_;
    }

    auto operator*() -> ssize_t& {
      CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
      __builtin_assume(bits_ != 0);
      index_ = BitIndexT(bits_).index();
      // Note that we store the index in a member so we can return a reference
      // to it here as required to be a forward iterator.
      return index_;
    }

    template <typename T>
    auto index_ptr(T* pointer) -> T* {
      return BitIndexT(bits_).index_ptr(pointer);
    }

    auto operator++() -> Iterator& {
      CARBON_DCHECK(bits_ != 0, "Must not increment past the end!");
      __builtin_assume(bits_ != 0);
      if constexpr (ByteEncodingMask != 0) {
        // Apply an increment mask to the bits first. This is used with the
        // byte encoding when the mask isn't needed until we begin
        // incrementing.
        bits_ &= ByteEncodingMask;
      }
      // Clears the least significant set bit, effectively stepping to the next
      // match.
      bits_ &= (bits_ - 1);
      return *this;
    }

   private:
    ssize_t index_;
    BitsT bits_ = 0;
  };

  BitIndexRange() = default;
  explicit BitIndexRange(BitsT bits) : bits_(bits) {}

  explicit operator bool() const { return !empty(); }
  auto empty() const -> bool { return BitIndexT(bits_).empty(); }

  auto begin() const -> Iterator { return Iterator(bits_); }
  auto end() const -> Iterator { return Iterator(); }

  friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool {
    if constexpr (ByteEncodingMask == 0) {
      // If there is no encoding mask, we can just compare the bits directly.
      return lhs.bits_ == rhs.bits_;
    } else {
      // Otherwise, compare the initial bit indices and the masked bits.
      return BitIndexT(lhs.bits_) == BitIndexT(rhs.bits_) &&
             (lhs.bits_ & ByteEncodingMask) == (rhs.bits_ & ByteEncodingMask);
    }
  }

  // Define heterogeneous equality between a masked (the current type) and
  // unmasked range. Requires a non-zero mask to avoid a redundant definition
  // with the homogeneous equality.
  friend auto operator==(BitIndexRange lhs, BitIndexRange<BitIndexT, 0> rhs)
      -> bool
    requires(ByteEncodingMask != 0)
  {
    // For mixed masked / unmasked comparison, we make sure the initial indices
    // are the same and that the masked side (LHS) is the same after masking as
    // the unmasked side (RHS).
    return BitIndexT(lhs.bits_) == BitIndexT(rhs.bits_) &&
           (lhs.bits_ & ByteEncodingMask) == rhs.bits_;
  }

  auto Print(llvm::raw_ostream& out) const -> void {
    out << llvm::formatv("{0:x}", bits_);
  }

  explicit operator BitsT() const { return bits_; }
  explicit operator BitIndexT() const { return BitIndexT(bits_); }

 private:
  template <typename FriendBitIndexT,
            FriendBitIndexT::BitsT FriendByteEncodingMask>
  friend class BitIndexRange;

  BitsT bits_ = 0;
};
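
// A purely illustrative example of the iteration (not part of the API): for a
// bit-encoded range constructed from 0b0101'0000, dereferencing yields index
// 4; incrementing clears the least significant set bit (leaving 0b0100'0000),
// so the next dereference yields 6, and one more increment reaches the empty
// (zero) end iterator.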

// A group of metadata bytes that can be manipulated together.
//
// The metadata bytes used in Carbon's hashtable implementation are designed to
// support being manipulated as groups, either using architecture specific SIMD
// code sequences or using portable SIMD-in-an-integer-register code sequences.
// These operations are unusually performance sensitive, sometimes in
// surprising ways. The implementations here are crafted specifically to
// optimize the particular usages in Carbon's hashtable and should not be
// expected to be reusable in any other context.
//
// Throughout the functions operating on this type we try to use patterns with
// a fallback portable implementation which can be directly used in the absence
// of a SIMD implementation, but is also used (with the same code) to check
// that any SIMD implementation produces the same result as the portable one.
// These patterns help minimize un-compiled or un-tested paths through either
// portable or SIMD code, regardless of which path is actually *used* on a
// particular platform. To illustrate a common version of this pattern, we
// might have code like:
//
// ```cpp
// auto MetadataGroup::Operation(...) -> ... {
//   ... portable_result;
//   ... simd_result;
//   if constexpr (!UseSimd || DebugSimd) {
//     portable_result = PortableOperation(...);
//   }
//   if constexpr (UseSimd || DebugSimd) {
//     simd_result = SimdOperation(...);
//     CARBON_DCHECK(simd_result == portable_result, "{0}", ...);
//   }
//   return UseSimd ? simd_result : portable_result;
// }
// ```
class MetadataGroup : public Printable<MetadataGroup> {
 public:
  static constexpr ssize_t Size =
#if CARBON_X86_SIMD_SUPPORT
      16;
#else
      8;
#endif
  static_assert(Size >= 8);
  static_assert(Size % 8 == 0);
  static_assert(Size <= MaxGroupSize);
  static_assert(MaxGroupSize % Size == 0);
  static_assert(llvm::isPowerOf2_64(Size),
                "The group size must be a constant power of two so dividing "
                "by it is a simple shift.");

  static constexpr ssize_t Mask = Size - 1;

  // Each control byte can have special values. All special values have the
  // most significant bit cleared to distinguish them from the seven hash bits
  // stored when the control byte represents a full bucket.
  //
  // Otherwise, their values are chosen primarily to provide efficient SIMD
  // implementations of the common operations on an entire control group.
  static constexpr uint8_t Empty = 0;
  static constexpr uint8_t Deleted = 1;

  static constexpr uint8_t PresentMask = 0b1000'0000;

  // Whether to use a SIMD implementation. Even when we *support* a SIMD
  // implementation, we do not always have to use it in the event that it is
  // less efficient than the portable version.
  static constexpr bool UseSimd =
#if CARBON_X86_SIMD_SUPPORT
      true;
#else
      false;
#endif

  // Some architectures make it much more efficient to build the match indices
  // in a byte-encoded form rather than a bit-encoded form. This encoding
  // changes verification and other aspects of our algorithms.
  static constexpr bool ByteEncoding =
#if CARBON_X86_SIMD_SUPPORT
      false;
#else
      true;
#endif
  static_assert(!ByteEncoding || Size == 8,
                "We can only support byte encoding with a group size of 8.");

  // We need to indicate to users of the metadata group when they can hold a
  // group value in a "register" (local variable) across clearing of individual
  // bytes in the group efficiently. If the entire group can fit in an integer
  // register, this works well and clients of the group should work to use the
  // already-loaded value when clearing bytes. But when we have a larger group
  // size, clearing the byte will typically require storing a byte to memory
  // and re-loading the group. The usage patterns that need to clear bytes can
  // in those cases avoid clearing a loaded group, and clear the byte directly
  // in the larger metadata array.
  static constexpr bool FastByteClear = Size == 8;

  // Most and least significant bits set.
  static constexpr uint64_t Msbs = 0x8080'8080'8080'8080ULL;
  static constexpr uint64_t Lsbs = 0x0101'0101'0101'0101ULL;

  using MatchIndex =
      BitIndex<std::conditional_t<ByteEncoding, uint64_t, uint32_t>,
               ByteEncoding,
               /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>;

  // Only one kind of portable matched range is needed.
  using PortableMatchRange = BitIndexRange<MatchIndex>;

  // We use specialized match range types for SIMD implementations to allow
  // deferring the masking operation where useful. When that optimization
  // doesn't apply, these will be the same type.
  using SimdMatchRange =
      BitIndexRange<MatchIndex, /*ByteEncodingMask=*/ByteEncoding ? Msbs : 0>;
  using SimdMatchPresentRange = BitIndexRange<MatchIndex>;

  // The public API range types can be either the portable or SIMD variations,
  // selected here.
  using MatchRange =
      std::conditional_t<UseSimd, SimdMatchRange, PortableMatchRange>;
  using MatchPresentRange =
      std::conditional_t<UseSimd, SimdMatchPresentRange, PortableMatchRange>;

  union {
    uint8_t metadata_bytes[Size];
    uint64_t metadata_ints[Size / 8];
#if CARBON_NEON_SIMD_SUPPORT
    uint8x8_t metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#elif CARBON_X86_SIMD_SUPPORT
    __m128i metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#endif
  };

  auto Print(llvm::raw_ostream& out) const -> void;

  friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool {
    return CompareEqual(lhs, rhs);
  }

  // The main API for this class. This API will switch between a portable and
  // SIMD implementation based on what is most efficient, but in debug builds
  // will cross check that the implementations do not diverge.

  // Load and return a group of metadata bytes out of the main metadata array
  // at a particular `index`. The index must be a multiple of `GroupSize`. This
  // will arrange for the load to place the group into the correct structure
  // for efficient register-based processing.
  static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup;

  // Store this metadata group into the main metadata array at the provided
  // `index`. The index must be a multiple of `GroupSize`.
  auto Store(uint8_t* metadata, ssize_t index) const -> void;

  // Clear a byte of this group's metadata at the provided `byte_index` to the
  // empty value.
  //
  // Note that this must only be called when `FastByteClear` is true -- in all
  // other cases users of this class should arrange to clear individual bytes
  // in the underlying array rather than using the group API. This is checked
  // by a static_assert, and the function is templated so that it is not
  // instantiated in the cases where it would not be valid.
  template <bool IsCalled = true>
  auto ClearByte(ssize_t byte_index) -> void;

  // Clear all of this group's metadata bytes that indicate a deleted slot to
  // the empty value.
  auto ClearDeleted() -> void;

  // Find all of the bytes of metadata in this group that are present and whose
  // low 7 bits match the provided `tag`. The `tag` byte must have a clear high
  // bit, only 7 bits of tag are used. Note that this means the provided tag is
  // *not* the actual present metadata byte -- this function is responsible for
  // mapping the tag into that form as it can do so more efficiently in some
  // cases. A range over all of the byte indices which matched is returned.
  auto Match(uint8_t tag) const -> MatchRange;

  // Find all of the present bytes of metadata in this group. A range over all
  // of the byte indices which are present is returned.
  auto MatchPresent() const -> MatchPresentRange;

  // Find the first byte of the metadata group that is empty and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first", any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no
  // bytes match the empty metadata.
  auto MatchEmpty() const -> MatchIndex;

  // Find the first byte of the metadata group that is deleted and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first", any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no
  // bytes match the deleted metadata.
  auto MatchDeleted() const -> MatchIndex;
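
  // A purely illustrative sketch of how a caller might use this API to probe
  // one group (hypothetical names: `entries`, `group_index`, `key`, and
  // `KeysEqual`; none of these are defined in this file):
  //
  // ```cpp
  // auto g = MetadataGroup::Load(metadata, group_index);
  // for (ssize_t byte_index : g.Match(tag)) {
  //   if (KeysEqual(entries[group_index + byte_index], key)) {
  //     return &entries[group_index + byte_index];
  //   }
  // }
  // if (g.MatchEmpty()) {
  //   // An empty slot ends the probe sequence -- the key is not in the table.
  //   return nullptr;
  // }
  // // Otherwise, continue probing the next group.
  // ```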

 private:
  // Two classes only defined in the benchmark code are allowed to directly
  // call the portable and SIMD implementations for benchmarking purposes.
  friend class BenchmarkPortableMetadataGroup;
  friend class BenchmarkSimdMetadataGroup;

  // All SIMD variants that we have an implementation for should be enabled for
  // debugging. This lets us maintain a SIMD implementation even if it is not
  // used due to performance reasons, and easily re-enable it if the
  // performance changes.
  static constexpr bool DebugSimd =
#if !defined(NDEBUG) && (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT)
      true;
#else
      false;
#endif

  using MatchBitsT = MatchIndex::BitsT;

  // A helper function to allow deducing the return type from the selected arm
  // of a `constexpr` ternary.
  template <bool Condition, typename LeftT, typename RightT>
  static auto ConstexprTernary(LeftT lhs, RightT rhs) {
    if constexpr (Condition) {
      return lhs;
    } else {
      return rhs;
    }
  }

  static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

  // Functions for validating the returned matches agree with what is predicted
  // by the `byte_match` function. These either `CHECK`-fail or return true. To
  // pass validation, the `*_bits` argument must have `0x80` for those bytes
  // where `byte_match` returns true, and `0` for the rest.

  // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only
  // promise to return accurate information up to the first match.
  auto VerifyIndexBits(
      MatchBitsT index_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // `VerifyPortableRangeBits` is for functions that return `MatchRange`, and
  // so it validates all the bytes of `range_bits`.
  auto VerifyPortableRangeBits(
      MatchBitsT range_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // Portable implementations of each operation. These are used on platforms
  // without SIMD support or where the portable implementation is faster than
  // SIMD. They are heavily optimized even though they are not SIMD because we
  // expect there to be platforms where the portable implementation can
  // outperform SIMD. Their behavior and semantics exactly match the
  // documentation for the un-prefixed functions.
  //
  // In debug builds, these also directly verify their results to help
  // establish baseline functionality.
  static auto PortableLoad(const uint8_t* metadata, ssize_t index)
      -> MetadataGroup;
  auto PortableStore(uint8_t* metadata, ssize_t index) const -> void;

  auto PortableClearDeleted() -> void;

  auto PortableMatch(uint8_t tag) const -> PortableMatchRange;
  auto PortableMatchPresent() const -> PortableMatchRange;

  auto PortableMatchEmpty() const -> MatchIndex;
  auto PortableMatchDeleted() const -> MatchIndex;

  static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs)
      -> bool;

  // SIMD implementations of each operation. We minimize platform-specific APIs
  // to reduce the scope of errors that can only be discovered building on one
  // platform, so the bodies of these contain the platform specific code. Their
  // behavior and semantics exactly match the documentation for the un-prefixed
  // functions.
  //
  // These routines don't directly verify their results as we can build simpler
  // debug checks by comparing them against the verified portable results.
  static auto SimdLoad(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
  auto SimdStore(uint8_t* metadata, ssize_t index) const -> void;

  auto SimdClearDeleted() -> void;

  auto SimdMatch(uint8_t tag) const -> SimdMatchRange;
  auto SimdMatchPresent() const -> SimdMatchPresentRange;

  auto SimdMatchEmpty() const -> MatchIndex;
  auto SimdMatchDeleted() const -> MatchIndex;

  static auto SimdCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

#if CARBON_X86_SIMD_SUPPORT
  // A common routine for x86 SIMD matching that can be used for matching
  // present, empty, and deleted bytes with equal efficiency.
  auto X86SimdMatch(uint8_t match_byte) const -> SimdMatchRange;
#endif
};

// Promote the size and mask to top-level constants as we'll need to operate on
// the grouped structure outside of the metadata bytes.
inline constexpr ssize_t GroupSize = MetadataGroup::Size;
inline constexpr ssize_t GroupMask = MetadataGroup::Mask;

inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup portable_g;
  if constexpr (!UseSimd || DebugSimd) {
    portable_g = PortableLoad(metadata, index);
    if constexpr (!UseSimd) {
      return portable_g;
    }
  }
  MetadataGroup g = SimdLoad(metadata, index);
  CARBON_DCHECK(g == portable_g);
  return g;
}

inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const
    -> void {
  if constexpr (!UseSimd) {
    std::memcpy(metadata + index, &metadata_bytes, Size);
  } else {
    SimdStore(metadata, index);
  }
  CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size));
}

template <bool IsCalled>
inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void {
  static_assert(!IsCalled || FastByteClear,
                "Only use byte clearing when fast!");
  static_assert(!IsCalled || Size == 8,
                "The clear implementation assumes an 8-byte group.");

  metadata_ints[0] &= ~(static_cast<uint64_t>(0xff) << (byte_index * 8));
}
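
// Purely illustrative: with `byte_index == 2` the shifted mask is
// 0x0000'0000'00ff'0000, so the `&= ~...` above clears exactly that group
// byte, setting it to `Empty` while leaving the other seven bytes untouched.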

inline auto MetadataGroup::ClearDeleted() -> void {
  MetadataGroup portable_g = *this;
  MetadataGroup simd_g = *this;
  if constexpr (!UseSimd || DebugSimd) {
    portable_g.PortableClearDeleted();
  }
  if constexpr (UseSimd || DebugSimd) {
    simd_g.SimdClearDeleted();
    CARBON_DCHECK(
        simd_g == portable_g,
        "SIMD cleared group '{0}' doesn't match portable cleared group '{1}'",
        simd_g, portable_g);
  }
  *this = UseSimd ? simd_g : portable_g;
}

inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange {
  // The caller should provide us with the present byte hash, and not set any
  // present bit tag on it so that this layer can manage tagging the high bit
  // of a present byte.
  CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);

  PortableMatchRange portable_result;
  SimdMatchRange simd_result;
  if constexpr (!UseSimd || DebugSimd) {
    portable_result = PortableMatch(tag);
  }
  if constexpr (UseSimd || DebugSimd) {
    simd_result = SimdMatch(tag);
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  // Return whichever result we're using.
  return ConstexprTernary<UseSimd>(simd_result, portable_result);
}

inline auto MetadataGroup::MatchPresent() const -> MatchPresentRange {
  PortableMatchRange portable_result;
  SimdMatchPresentRange simd_result;
  if constexpr (!UseSimd || DebugSimd) {
    portable_result = PortableMatchPresent();
  }
  if constexpr (UseSimd || DebugSimd) {
    simd_result = SimdMatchPresent();
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  // Return whichever result we're using.
  return ConstexprTernary<UseSimd>(simd_result, portable_result);
}

inline auto MetadataGroup::MatchEmpty() const -> MatchIndex {
  MatchIndex portable_result;
  MatchIndex simd_result;
  if constexpr (!UseSimd || DebugSimd) {
    portable_result = PortableMatchEmpty();
  }
  if constexpr (UseSimd || DebugSimd) {
    simd_result = SimdMatchEmpty();
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  return UseSimd ? simd_result : portable_result;
}

inline auto MetadataGroup::MatchDeleted() const -> MatchIndex {
  MatchIndex portable_result;
  MatchIndex simd_result;
  if constexpr (!UseSimd || DebugSimd) {
    portable_result = PortableMatchDeleted();
  }
  if constexpr (UseSimd || DebugSimd) {
    simd_result = SimdMatchDeleted();
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  return UseSimd ? simd_result : portable_result;
}

inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs)
    -> bool {
  bool portable_result;
  bool simd_result;
  if constexpr (!UseSimd || DebugSimd) {
    portable_result = PortableCompareEqual(lhs, rhs);
  }
  if constexpr (UseSimd || DebugSimd) {
    simd_result = SimdCompareEqual(lhs, rhs);
    CARBON_DCHECK(simd_result == portable_result);
  }
  return UseSimd ? simd_result : portable_result;
}

inline auto MetadataGroup::VerifyIndexBits(
    MatchBitsT index_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((index_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
        // Only the first match is needed, so stop scanning once found.
        break;
      }
      CARBON_CHECK(((index_bits >> byte_index) & 1) == 0,
                   "Bit set at non-matching byte index: {0}", byte_index);
    } else {
      // `index_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(
            (index_byte & 0x80) == 0x80,
            "Should have the high bit set for a matching byte, found: {0:x}",
            index_byte);
        // Only the first match is needed so stop scanning once found.
        break;
      }
      CARBON_CHECK(
          index_byte == 0,
          "Should have no bits set for an unmatched byte, found: {0:x}",
          index_byte);
    }
  }
  return true;
}

inline auto MetadataGroup::VerifyPortableRangeBits(
    MatchBitsT range_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
      } else {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 0,
                     "Bit set at non-matching byte index: {0}", byte_index);
      }
    } else {
      // `range_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(range_byte == 0x80,
                     "Should just have the high bit set for a matching byte, "
                     "found: {0:x}",
                     range_byte);
      } else {
        CARBON_CHECK(
            range_byte == 0,
            "Should have no bits set for an unmatched byte, found: {0:x}",
            range_byte);
      }
    }
  }
  return true;
}

inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup g;
  static_assert(sizeof(g) == Size);
  std::memcpy(&g.metadata_bytes, metadata + index, Size);
  return g;
}

inline auto MetadataGroup::PortableStore(uint8_t* metadata, ssize_t index) const
    -> void {
  std::memcpy(metadata + index, &metadata_bytes, Size);
}

inline auto MetadataGroup::PortableClearDeleted() -> void {
  for (uint64_t& metadata_int : metadata_ints) {
    // Deleted bytes have only the least significant bits set, so to clear them
    // we only need to clear the least significant bit. And empty bytes already
    // have a clear least significant bit, so the only least significant bits
    // we need to preserve are those of present bytes. The most significant bit
    // of every present byte is set, so we take the most significant bit of
    // each byte, shift it into the least significant bit position, and bit-or
    // it with the complement of `Lsbs`. This will have ones for every bit but
    // the least significant bits, and ones for the least significant bits of
    // every present byte.
    metadata_int &= (~Lsbs | metadata_int >> 7);
  }
}
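
// A purely illustrative byte-level view of the mask above: a present byte has
// its high bit set, so its mask byte is 0xff and it is left unchanged; an
// `Empty` or `Deleted` byte has a clear high bit, so its mask byte is 0xfe and
// the `&=` clears its low bit -- `Deleted` (0x01) becomes `Empty` (0x00) and
// `Empty` stays `Empty`.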

inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange {
  // The caller should provide us with the present byte hash, and not set any
  // present bit tag on it so that this layer can manage tagging the high bit
  // of a present byte.
  CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);

  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    uint8_t present_byte = tag | PresentMask;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == present_byte) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }

  // This algorithm only works for matching *present* bytes. We leverage the
  // set high bit in the present case as part of the algorithm. The whole
  // algorithm has a critical path height of 4 operations, and does 6
  // operations total on AArch64. The operation dependency graph is:
  //
  //   group | Msbs      Lsbs * match_byte + Msbs
  //        \                /
  //       match_bits ^ broadcast
  //                 |
  //   group & Msbs    Msbs - match_bits
  //            \         /
  //      group_Msbs & match_bits
  //
  // This diagram and the operation count are specific to AArch64 where we have
  // a fused *integer* multiply-add operation.
  //
  // While it is superficially similar to the "find zero bytes in a word" bit
  // math trick, it is different because this is designed to have no false
  // positives and perfectly produce 0x80 for matching bytes and 0x00 for
  // non-matching bytes. This is do-able because we constrain to only handle
  // present matches which only require testing 7 bits and have a particular
  // layout.

  // Set the high bit of every byte to `1`. Any matching byte is a present byte
  // and so always has this bit set as well, which means the xor below, in
  // addition to zeroing the low 7 bits of any byte that matches the tag, also
  // clears the high bit of every byte.
  uint64_t match_bits = metadata_ints[0] | Msbs;
  // Broadcast the match byte to all bytes, and mask in the present bits in the
  // Msbs of each byte. We structure this as a multiply and an add because we
  // know that the add cannot carry, and this way it can be lowered using
  // combined multiply-add instructions if available.
  uint64_t broadcast = Lsbs * tag + Msbs;
  CARBON_DCHECK(broadcast == (Lsbs * tag | Msbs),
                "Unexpected carry from addition!");

  // Xor the broadcast byte pattern. This makes bytes with matches become 0,
  // and clears the high-bits of non-matches. Note that if we are looking for a
  // tag with the same value as `Empty` or `Deleted`, those bytes will be zero
  // as well.
  match_bits = match_bits ^ broadcast;
  // Subtract each byte of `match_bits` from `0x80` bytes. After this, the high
  // bit will be set only for those bytes that were zero.
  match_bits = Msbs - match_bits;
  // Zero everything but the high bits, and also zero the high bits of any
  // bytes for "not present" slots in the original group. This avoids false
  // positives for `Empty` and `Deleted` bytes in the metadata.
  match_bits &= (metadata_ints[0] & Msbs);

  // At this point, `match_bits` has the high bit set for bytes where the
  // original group byte equals `tag` plus the high bit.
  CARBON_DCHECK(VerifyPortableRangeBits(
      match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); }));
  return MatchRange(match_bits);
}
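
// A purely illustrative per-byte walk-through of the scalar match above,
// assuming `tag == 0x2a` (so every broadcast byte is 0xaa):
// - matching present byte 0xaa: (0xaa|0x80) ^ 0xaa == 0x00; 0x80 - 0x00 == 0x80
// - other present byte 0x93:    (0x93|0x80) ^ 0xaa == 0x39; 0x80 - 0x39 == 0x47
// - `Empty` byte 0x00:          (0x00|0x80) ^ 0xaa == 0x2a; 0x80 - 0x2a == 0x56
// The final `& (metadata_ints[0] & Msbs)` keeps only the high bit, and only
// for present bytes, leaving 0x80 exactly at the matching byte positions.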

inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] & PresentMask) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }

  // Want to keep the high bit of each byte, which indicates whether that byte
  // represents a present slot.
  uint64_t match_bits = metadata_ints[0] & Msbs;

  CARBON_DCHECK(VerifyPortableRangeBits(
      match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; }));
  return MatchRange(match_bits);
}

inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Empty) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }

  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 0. We take advantage of the fact that the
  // metadata bytes are non-zero only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - deleted: in which case the byte will be 1, and shifting it left by 7
  //   will cause the high bit to be set.
  uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & Msbs;

  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Empty`.
  CARBON_DCHECK(
      VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; }));
  return MatchIndex(match_bits);
}
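
// Purely illustrative per-byte view of the trick above: an `Empty` byte (0x00)
// contributes no set bits; a `Deleted` byte (0x01) shifted left by 7 sets its
// own high bit; a present byte already has its high bit set. Low bits spilling
// across byte boundaries from the shift are discarded by the `& Msbs`, so
// after inverting, exactly the `Empty` bytes end up as 0x80.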

inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Deleted) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }

  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 1. We take advantage of the fact that the
  // metadata bytes are not 1 only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - empty: in which case the byte will be 0, and inverting it and shifting
  //   left by 7 will set the high bit.
  uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & Msbs;

  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Deleted`.
  CARBON_DCHECK(VerifyIndexBits(match_bits,
                                [](uint8_t byte) { return byte == Deleted; }));
  return MatchIndex(match_bits);
}

inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs,
                                                MetadataGroup rhs) -> bool {
  return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes);
}

inline auto MetadataGroup::SimdLoad(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup g;
#if CARBON_NEON_SIMD_SUPPORT
  g.metadata_vec = vld1_u8(metadata + index);
#elif CARBON_X86_SIMD_SUPPORT
  g.metadata_vec =
      _mm_load_si128(reinterpret_cast<const __m128i*>(metadata + index));
#else
  static_assert(!UseSimd, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
  return g;
}

// NOLINTNEXTLINE(readability-non-const-parameter): Mutation is in #if.
inline auto MetadataGroup::SimdStore(uint8_t* metadata, ssize_t index) const
    -> void {
#if CARBON_NEON_SIMD_SUPPORT
  vst1_u8(metadata + index, metadata_vec);
#elif CARBON_X86_SIMD_SUPPORT
  _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec);
#else
  static_assert(!UseSimd, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
}

inline auto MetadataGroup::SimdClearDeleted() -> void {
#if CARBON_NEON_SIMD_SUPPORT
  // There is no good Neon operation to implement this, so do it using integer
  // code. This is reasonably fast, but unfortunate because it forces the group
  // out of a SIMD register and into a general purpose register, which can have
  // high latency.
  metadata_ints[0] &= (~Lsbs | metadata_ints[0] >> 7);
#elif CARBON_X86_SIMD_SUPPORT
  // For each byte, use `metadata_vec` if the byte's high bit is set
  // (indicating it is present), otherwise (it is empty or deleted) replace it
  // with zero (representing empty).
  metadata_vec =
      _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec);
#else
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
#endif
}

inline auto MetadataGroup::SimdMatch(uint8_t tag) const -> SimdMatchRange {
  SimdMatchRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast byte we want to match to every byte in the vector.
  auto match_byte_vec = vdup_n_u8(tag | PresentMask);
  // Result bytes have all bits set for the bytes that match, so we have to
  // clear everything but Msbs next.
  auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec);
  uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0];
  // Note that the range will lazily mask to the Msbs as part of incrementing.
  result = SimdMatchRange(match_bits);
#elif CARBON_X86_SIMD_SUPPORT
  result = X86SimdMatch(tag | PresentMask);
#else
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
  static_cast<void>(tag);
#endif
  return result;
}

inline auto MetadataGroup::SimdMatchPresent() const -> SimdMatchPresentRange {
  SimdMatchPresentRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Just extract the metadata directly.
  uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0];
  // Even though the Neon SIMD range will do its own masking, we have to mask
  // here so that `empty` is correct.
  result = SimdMatchPresentRange(match_bits & Msbs);
#elif CARBON_X86_SIMD_SUPPORT
  // We arranged the byte vector so that present bytes have the high bit set,
  // which this instruction extracts.
  result = SimdMatchPresentRange(_mm_movemask_epi8(metadata_vec));
#else
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SimdMatchEmpty() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Compare all bytes with zero, as that is the empty byte value. Result will
  // have all bits set for any input zero byte, so we zero all but the high
  // bits below.
  auto cmp_vec = vceqz_u8(metadata_vec);
  uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `Msbs` when we do that
  // mask in the scalar domain rather than the SIMD domain. So we do the mask
  // here rather than above prior to extracting the match bits.
  result = MatchIndex(metadata_bits & Msbs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we don't
  // have a more efficient way to compute this on x86 and so we reuse the
  // general match infrastructure that computes all matches in a bit-encoding.
  // We then convert it into a `MatchIndex` that just finds the first one.
  result = static_cast<MatchIndex>(X86SimdMatch(Empty));
#else
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SimdMatchDeleted() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast the `Deleted` byte across the vector and compare the bytes of
  // that with the metadata vector. The result will have all bits set for any
  // byte equal to `Deleted`, so we zero all but the high bits below.
  auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted));
  uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `Msbs` when we do that
  // mask in the scalar domain rather than the SIMD domain. So we do the mask
  // here rather than above prior to extracting the match bits.
  result = MatchIndex(match_bits & Msbs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we don't
  // have a more efficient way to compute this on x86 and so we reuse the
  // general match infrastructure that computes all matches in a bit-encoding.
  // We then convert it into a `MatchIndex` that just finds the first one.
  result = static_cast<MatchIndex>(X86SimdMatch(Deleted));
#else
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SimdCompareEqual(MetadataGroup lhs,
                                            MetadataGroup rhs) -> bool {
#if CARBON_NEON_SIMD_SUPPORT
  return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] ==
         static_cast<uint64_t>(-1LL);
#elif CARBON_X86_SIMD_SUPPORT
  // Different x86 SIMD extensions provide different comparison functionality.
#if __SSE4_2__
  // With SSE 4.2, we can directly test and branch in the SIMD domain on
  // whether the two metadata vectors are equal.
  return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec),
                         _mm_set1_epi8(0xff)) == 1;
#else
  // With older versions of SSE we have to extract the result of the
  // comparison, much like we do when matching. That will have the usual
  // bitmask representing equal bytes, and we test for that exact bitmask in
  // scalar code.
  return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec,
                                          rhs.metadata_vec)) == 0x0000'ffffU;
#endif
#else
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
  static_cast<void>(lhs);
  static_cast<void>(rhs);
  return false;
#endif
}

#if CARBON_X86_SIMD_SUPPORT
inline auto MetadataGroup::X86SimdMatch(uint8_t match_byte) const
    -> MatchRange {
  // Broadcast the byte we're matching against to all bytes in a vector, and
  // compare those bytes with the metadata vector bytes.
  auto match_byte_vec = _mm_set1_epi8(match_byte);
  auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec);
  // Extract the result of each byte-wise comparison into the low bits of an
  // integer.
  uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec);
  return MatchRange(match_bits);
}
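
// Purely illustrative: `_mm_movemask_epi8` packs the high bit of each of the
// 16 comparison bytes into the low 16 bits of the result, so if group bytes 3
// and 9 matched, `match_bits` would be 0b10'0000'1000 (0x208).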
#endif

}  // namespace Carbon::RawHashtable

#endif  // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_