raw_hashtable_metadata_group.h 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
  5. #define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
  6. #include <cstddef>
  7. #include <cstring>
  8. #include <iterator>
  9. #include <type_traits>
  10. #include "common/check.h"
  11. #include "common/ostream.h"
  12. #include "llvm/ADT/Sequence.h"
  13. #include "llvm/ADT/bit.h"
  14. #include "llvm/Support/FormatVariadic.h"
  15. #include "llvm/Support/MathExtras.h"
  16. // Detect whether we can use SIMD accelerated implementations of the control
  17. // groups, and include the relevant platform specific APIs for the SIMD
  18. // implementations.
  19. //
  20. // Reference documentation for the SIMD APIs used here:
  21. // - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
  22. // - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
  23. #if defined(__SSSE3__)
  24. #include <x86intrin.h>
  25. #define CARBON_X86_SIMD_SUPPORT 1
  26. #elif defined(__ARM_NEON)
  27. #include <arm_neon.h>
  28. #define CARBON_NEON_SIMD_SUPPORT 1
  29. #endif
  30. // This namespace collects low-level utilities for implementing hashtable
  31. // data structures. This file only provides one of them:
  32. //
  33. // - Primitives to manage "groups" of hashtable entries that have densely packed
  34. // control bytes we can scan rapidly as a group, often using SIMD facilities
  35. // to process the entire group at once.
  36. namespace Carbon::RawHashtable {
// We define a constant max group size. The particular group size used in
// practice may vary, but we want to have some upper bound used to ensure
// memory allocation is done consistently across different architectures.
// `MetadataGroup` statically asserts that its actual size evenly divides this.
constexpr ssize_t MaxGroupSize = 16;
// This takes a collection of bits representing the results of looking for a
// particular tag in this metadata group and determines the first position with
// a match. The position is represented by either the least significant set bit
// or the least significant non-zero byte, depending on `ByteEncoding`. When
// represented with a non-zero byte, that byte must have at least its most
// significant bit set, but may have other bits set to any value. Bits more
// significant than the match may have any value provided there is at least one
// match. Zero matches must be represented by a zero input.
//
// Some bits of the underlying value may be known-zero, which can optimize
// various operations. These can be represented as a `ZeroMask`.
template <typename BitsInputT, bool ByteEncodingInput, BitsInputT ZeroMask = 0>
class BitIndex
    : public Printable<BitIndex<BitsInputT, ByteEncodingInput, ZeroMask>> {
 public:
  using BitsT = BitsInputT;
  static constexpr bool ByteEncoding = ByteEncodingInput;

  BitIndex() = default;
  explicit BitIndex(BitsT bits) : bits_(bits) {}

  // Indices are equal when both are empty, or when both locate the same first
  // match; extraneous bits beyond the first match are deliberately ignored.
  friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool {
    if (lhs.empty() || rhs.empty()) {
      return lhs.empty() == rhs.empty();
    }
    // For non-empty bit indices, compare the indices directly to ignore other
    // (extraneous) parts of the incoming bits.
    return lhs.index() == rhs.index();
  }

  // Prints the raw underlying bits in hex for debugging.
  auto Print(llvm::raw_ostream& out) const -> void {
    out << llvm::formatv("{0:x}", bits_);
  }

  // True when there is at least one match.
  explicit operator bool() const { return !empty(); }

  // Returns true when there are no matches for the tag.
  auto empty() const -> bool {
    CARBON_DCHECK((bits_ & ZeroMask) == 0, "Unexpected non-zero bits!");
    __builtin_assume((bits_ & ZeroMask) == 0);
    return bits_ == 0;
  }

  // Returns the index of the first matched tag. Must not be called when
  // `empty()` is true.
  auto index() -> ssize_t {
    CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
    __builtin_assume(bits_ != 0);
    ssize_t index = unscaled_index();
    if constexpr (ByteEncoding) {
      // Shift to scale out of the byte encoding.
      index >>= ByteEncodingShift;
    }
    return index;
  }

  // Optimized tool to index a pointer `p` by `index()`. For byte-encoded bits
  // this can often fold the byte-to-index scaling together with the scaling by
  // `sizeof(T)` rather than paying for a separate shift.
  template <typename T>
  auto index_ptr(T* pointer) -> T* {
    CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
    __builtin_assume(bits_ != 0);
    if constexpr (!ByteEncoding) {
      return &pointer[unscaled_index()];
    }
    ssize_t index = unscaled_index();
    // Scale the index as we counted zero *bits* and not zero *bytes*.
    // However, we can fold that scale with the size of `T` when it is a power
    // of two or divisible by 8.
    CARBON_DCHECK(
        (index & ((static_cast<size_t>(1) << ByteEncodingShift) - 1)) == 0);
    if constexpr (sizeof(T) % 8 == 0) {
      // The bit count is 8x the element index, so scaling by `sizeof(T)` only
      // needs the remaining `sizeof(T) / 8` factor applied to a byte pointer.
      constexpr size_t FoldedScale = sizeof(T) / 8;
      index *= FoldedScale;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) {
      // Fold the element-size shift into the byte-encoding shift.
      constexpr size_t ScaleShift = llvm::CTLog2<sizeof(T)>();
      static_assert(ScaleShift <= ByteEncodingShift,
                    "Scaling by >=8 should be handled above!");
      constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift;
      index >>= FoldedShift;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    }
    // Nothing we can fold here.
    return &pointer[index >> ByteEncodingShift];
  }

 private:
  // When using a byte encoding, we'll need to shift any index by this amount.
  static constexpr size_t ByteEncodingShift = 3;

  // Returns the matched position without scaling out of the byte encoding
  // (when that encoding is in use).
  auto unscaled_index() -> ssize_t {
    if constexpr (!ByteEncoding) {
      // Note the cast to `size_t` to force zero extending the result.
      return static_cast<size_t>(llvm::countr_zero(bits_));
    } else {
      // The index is encoded in the high bit of each byte. We compute the index
      // by counting the number of low zero bytes there are before the first
      // byte with its high bit set. Rather that shifting the high bit to be the
      // low bit and counting the trailing (least significant) zero bits
      // directly, we instead byte-reverse the bits and count the *leading*
      // (most significant) zero bits. While this may be a wash on CPUs with
      // direct support for counting the trailing zero bits, AArch64 only
      // supports counting the leading zero bits and requires a bit-reverse to
      // count the trailing zero bits. Doing the byte-reverse approach
      // essentially combines moving the high bit into the low bit and the
      // reverse necessary for counting the zero bits. While this only removes
      // one instruction, it is an instruction in the critical path of the
      // hottest part of table lookup, and that critical path dependency height
      // is few enough instructions that removing even one significantly impacts
      // latency.
      //
      // We also cast to `size_t` to clearly zero-extend the result.
      return static_cast<size_t>(llvm::countl_zero(llvm::byteswap(bits_)));
    }
  }

  BitsT bits_ = 0;
};
  150. // This is like `BitIndex`, but allows iterating through all of the matches.
  151. //
  152. // A key requirement for efficient iteration is that all of the matches are
  153. // represented with a single bit and there are no other bits set. For example,
  154. // with byte-encoded bit indices, exactly the high bit and no other bit of each
  155. // matching byte must be set. This is a stricter constraint than what `BitIndex`
  156. // alone would impose on any one of the matches.
  157. template <typename BitIndexT, BitIndexT::BitsT ByteEncodingMask = 0>
  158. class BitIndexRange
  159. : public Printable<BitIndexRange<BitIndexT, ByteEncodingMask>> {
  160. public:
  161. using BitsT = BitIndexT::BitsT;
  162. static_assert(BitIndexT::ByteEncoding || ByteEncodingMask == 0,
  163. "Non-byte encoding must not have a byte encoding mask.");
  164. class Iterator
  165. : public llvm::iterator_facade_base<Iterator, std::forward_iterator_tag,
  166. ssize_t, ssize_t> {
  167. public:
  168. Iterator() = default;
  169. explicit Iterator(BitsT bits) : bits_(bits) {}
  170. friend auto operator==(const Iterator& lhs, const Iterator& rhs) -> bool {
  171. return lhs.bits_ == rhs.bits_;
  172. }
  173. auto operator*() -> ssize_t& {
  174. CARBON_DCHECK(bits_ != 0, "Cannot get an index from zero bits!");
  175. __builtin_assume(bits_ != 0);
  176. index_ = BitIndexT(bits_).index();
  177. // Note that we store the index in a member so we can return a reference
  178. // to it here as required to be a forward iterator.
  179. return index_;
  180. }
  181. template <typename T>
  182. auto index_ptr(T* pointer) -> T* {
  183. return BitIndexT(bits_).index_ptr(pointer);
  184. }
  185. auto operator++() -> Iterator& {
  186. CARBON_DCHECK(bits_ != 0, "Must not increment past the end!");
  187. __builtin_assume(bits_ != 0);
  188. if constexpr (ByteEncodingMask != 0) {
  189. // Apply an increment mask to the bits first. This is used with the byte
  190. // encoding when the mask isn't needed until we begin incrementing.
  191. bits_ &= ByteEncodingMask;
  192. }
  193. // Clears the least significant set bit, effectively stepping to the next
  194. // match.
  195. bits_ &= (bits_ - 1);
  196. return *this;
  197. }
  198. private:
  199. ssize_t index_;
  200. BitsT bits_ = 0;
  201. };
  202. BitIndexRange() = default;
  203. explicit BitIndexRange(BitsT bits) : bits_(bits) {}
  204. explicit operator bool() const { return !empty(); }
  205. auto empty() const -> bool { return BitIndexT(bits_).empty(); }
  206. auto begin() const -> Iterator { return Iterator(bits_); }
  207. auto end() const -> Iterator { return Iterator(); }
  208. friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool {
  209. if constexpr (ByteEncodingMask == 0) {
  210. // If there is no encoding mask, we can just compare the bits directly.
  211. return lhs.bits_ == rhs.bits_;
  212. } else {
  213. // Otherwise, compare the initial bit indices and the masked bits.
  214. return BitIndexT(lhs.bits_) == BitIndexT(rhs.bits_) &&
  215. (lhs.bits_ & ByteEncodingMask) == (rhs.bits_ & ByteEncodingMask);
  216. }
  217. }
  218. // Define heterogeneous equality between a masked (the current type) and
  219. // unmasked range. Requires a non-zero mask to avoid a redundant definition
  220. // with the homogeneous equality.
  221. friend auto operator==(BitIndexRange lhs, BitIndexRange<BitIndexT, 0> rhs)
  222. -> bool
  223. requires(ByteEncodingMask != 0)
  224. {
  225. // For mixed masked / unmasked comparison, we make sure the initial indices
  226. // are the same and that the masked side (LHS) is the same after masking as
  227. // the unmasked side (RHS).
  228. return BitIndexT(lhs.bits_) == BitIndexT(rhs.bits_) &&
  229. (lhs.bits_ & ByteEncodingMask) == rhs.bits_;
  230. }
  231. auto Print(llvm::raw_ostream& out) const -> void {
  232. out << llvm::formatv("{0:x}", bits_);
  233. }
  234. explicit operator BitsT() const { return bits_; }
  235. explicit operator BitIndexT() const { return BitIndexT(bits_); }
  236. private:
  237. template <typename FriendBitIndexT,
  238. FriendBitIndexT::BitsT FriendByteEncodingMask>
  239. friend class BitIndexRange;
  240. BitsT bits_ = 0;
  241. };
// A group of metadata bytes that can be manipulated together.
//
// The metadata bytes used in Carbon's hashtable implementation are designed to
// support being manipulated as groups, either using architecture specific SIMD
// code sequences or using portable SIMD-in-an-integer-register code sequences.
// These operations are unusually performance sensitive and in sometimes
// surprising ways. The implementations here are crafted specifically to
// optimize the particular usages in Carbon's hashtable and should not be
// expected to be reusable in any other context.
//
// Throughout the functions operating on this type we try to use patterns with a
// fallback portable implementation which can be directly used in the absence of
// a SIMD implementation, but is also used (with the same code) to check that
// any SIMD implementation produces the same result as the portable one. These
// patterns help minimize un-compiled or un-tested paths through either portable
// or SIMD code, regardless of which path is actually *used* on a particular
// platform. To illustrate a common version of this pattern, we might have code
// like:
//
// ```cpp
// auto MetadataGroup::Operation(...) -> ... {
//   ... portable_result;
//   ... simd_result;
//   if constexpr (!UseSimd || DebugSimd) {
//     portable_result = PortableOperation(...);
//   }
//   if constexpr (UseSimd || DebugSimd) {
//     simd_result = SimdOperation(...)
//     CARBON_DCHECK(result == portable_result, "{0}", ...);
//   }
//   return UseSimd ? simd_result : portable_result;
// }
// ```
class MetadataGroup : public Printable<MetadataGroup> {
 public:
  // The number of bytes in a group: 16 when x86 SIMD is available, otherwise 8
  // so the whole group fits in a single 64-bit integer register.
  static constexpr ssize_t Size =
#if CARBON_X86_SIMD_SUPPORT
      16;
#else
      8;
#endif
  static_assert(Size >= 8);
  static_assert(Size % 8 == 0);
  static_assert(Size <= MaxGroupSize);
  static_assert(MaxGroupSize % Size == 0);
  static_assert(llvm::isPowerOf2_64(Size),
                "The group size must be a constant power of two so dividing by "
                "it is a simple shift.");
  static constexpr ssize_t Mask = Size - 1;

  // Each control byte can have special values. All special values have the
  // most significant bit cleared to distinguish them from the seven hash bits
  // stored when the control byte represents a full bucket.
  //
  // Otherwise, their values are chosen primarily to provide efficient SIMD
  // implementations of the common operations on an entire control group.
  static constexpr uint8_t Empty = 0;
  static constexpr uint8_t Deleted = 1;
  static constexpr uint8_t PresentMask = 0b1000'0000;

  // Whether to use a SIMD implementation. Even when we *support* a SIMD
  // implementation, we do not always have to use it in the event that it is
  // less efficient than the portable version.
  static constexpr bool UseSimd =
#if CARBON_X86_SIMD_SUPPORT
      true;
#else
      false;
#endif

  // Some architectures make it much more efficient to build the match indices
  // in a byte-encoded form rather than a bit-encoded form. This encoding
  // changes verification and other aspects of our algorithms.
  static constexpr bool ByteEncoding =
#if CARBON_X86_SIMD_SUPPORT
      false;
#else
      true;
#endif
  static_assert(!ByteEncoding || Size == 8,
                "We can only support byte encoding with a group size of 8.");

  // We need to indicate to users of the metadata group when they can hold a
  // group value in a "register" (local variable) across clearing of individual
  // bytes in the group efficiently. If the entire group can fit in an integer
  // register, this works well and clients of the group should work to use the
  // already-loaded value when clearing bytes. But when we have a larger group
  // size, clearing the byte will typically require storing a byte to memory and
  // re-loading the group. The usage patterns that need to clear bytes can in
  // those cases avoid clearing a loaded group, and clear the byte directly in
  // the larger metadata array.
  static constexpr bool FastByteClear = Size == 8;

  // Most and least significant bits set in each byte of a 64-bit word.
  static constexpr uint64_t Msbs = 0x8080'8080'8080'8080ULL;
  static constexpr uint64_t Lsbs = 0x0101'0101'0101'0101ULL;

  // The index type: 64-bit byte-encoded when not using x86 SIMD, otherwise a
  // 32-bit bit-encoded index whose bits above `Size` are known-zero.
  using MatchIndex =
      BitIndex<std::conditional_t<ByteEncoding, uint64_t, uint32_t>,
               ByteEncoding,
               /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>;

  // Only one kind of portable matched range is needed.
  using PortableMatchRange = BitIndexRange<MatchIndex>;

  // We use specialized match range types for SIMD implementations to allow
  // deferring the masking operation where useful. When that optimization
  // doesn't apply, these will be the same type.
  using SimdMatchRange =
      BitIndexRange<MatchIndex, /*ByteEncodingMask=*/ByteEncoding ? Msbs : 0>;
  using SimdMatchPresentRange = BitIndexRange<MatchIndex>;

  // The public API range types can be either the portable or SIMD variations,
  // selected here.
  using MatchRange =
      std::conditional_t<UseSimd, SimdMatchRange, PortableMatchRange>;
  using MatchPresentRange =
      std::conditional_t<UseSimd, SimdMatchPresentRange, PortableMatchRange>;

  // The group's storage, aliased between byte, integer, and (when available)
  // vector views so each implementation can use its natural representation.
  union {
    uint8_t metadata_bytes[Size];
    uint64_t metadata_ints[Size / 8];
#if CARBON_NEON_SIMD_SUPPORT
    uint8x8_t metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#elif CARBON_X86_SIMD_SUPPORT
    __m128i metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#endif
  };

  auto Print(llvm::raw_ostream& out) const -> void;

  friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool {
    return CompareEqual(lhs, rhs);
  }

  // The main API for this class. This API will switch between a portable and
  // SIMD implementation based on what is most efficient, but in debug builds
  // will cross check that the implementations do not diverge.

  // Load and return a group of metadata bytes out of the main metadata array at
  // a particular `index`. The index must be a multiple of `GroupSize`. This
  // will arrange for the load to place the group into the correct structure for
  // efficient register-based processing.
  static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup;

  // Store this metadata group into the main metadata array at the provided
  // `index`. The index must be a multiple of `GroupSize`.
  auto Store(uint8_t* metadata, ssize_t index) const -> void;

  // Clear a byte of this group's metadata at the provided `byte_index` to the
  // empty value.
  //
  // Note that this must only be called when `FastByteClear` is true -- in all
  // other cases users of this class should arrange to clear individual bytes in
  // the underlying array rather than using the group API. This is checked by a
  // static_assert, and the function is templated so that it is not instantiated
  // in the cases where it would not be valid.
  template <bool IsCalled = true>
  auto ClearByte(ssize_t byte_index) -> void;

  // Clear all of this group's metadata bytes that indicate a deleted slot to
  // the empty value.
  auto ClearDeleted() -> void;

  // Find all of the bytes of metadata in this group that are present and whose
  // low 7 bits match the provided `tag`. The `tag` byte must have a clear high
  // bit, only 7 bits of tag are used. Note that this means the provided tag is
  // *not* the actual present metadata byte -- this function is responsible for
  // mapping the tag into that form as it can do so more efficiently in some
  // cases. A range over all of the byte indices which matched is returned.
  auto Match(uint8_t tag) const -> MatchRange;

  // Find all of the present bytes of metadata in this group. A range over all
  // of the byte indices which are present is returned.
  auto MatchPresent() const -> MatchPresentRange;

  // Find the first byte of the metadata group that is empty and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first", any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no bytes
  // match the empty metadata.
  auto MatchEmpty() const -> MatchIndex;

  // Find the first byte of the metadata group that is deleted and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first", any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no bytes
  // match the deleted metadata.
  auto MatchDeleted() const -> MatchIndex;

 private:
  // Two classes only defined in the benchmark code are allowed to directly call
  // the portable and SIMD implementations for benchmarking purposes.
  friend class BenchmarkPortableMetadataGroup;
  friend class BenchmarkSimdMetadataGroup;

  // All SIMD variants that we have an implementation for should be enabled for
  // debugging and fuzzing. This lets us maintain a SIMD implementation even if
  // it is not used due to performance reasons, and easily re-enable it if the
  // performance changes.
  static constexpr bool DebugSimd =
#if (!defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)) && \
    (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT)
      true;
#else
      false;
#endif

  using MatchBitsT = MatchIndex::BitsT;

  // A helper function to allow deducing the return type from the selected arm
  // of a `constexpr` ternary.
  template <bool Condition, typename LeftT, typename RightT>
  static auto ConstexprTernary(LeftT lhs, RightT rhs) {
    if constexpr (Condition) {
      return lhs;
    } else {
      return rhs;
    }
  }

  static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

  // Functions for validating the returned matches agree with what is predicted
  // by the `byte_match` function. These either `CHECK`-fail or return true. To
  // pass validation, the `*_bits` argument must have `0x80` for those bytes
  // where `byte_match` returns true, and `0` for the rest.

  // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only
  // promise to return accurate information up to the first match.
  auto VerifyIndexBits(
      MatchBitsT index_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // `VerifyPortableRangeBits` is for functions that return `MatchRange`, and so
  // it validates all the bytes of `range_bits`.
  auto VerifyPortableRangeBits(
      MatchBitsT range_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // Portable implementations of each operation. These are used on platforms
  // without SIMD support or where the portable implementation is faster than
  // SIMD. They are heavily optimized even though they are not SIMD because we
  // expect there to be platforms where the portable implementation can
  // outperform SIMD. Their behavior and semantics exactly match the
  // documentation for the un-prefixed functions.
  //
  // In debug builds, these also directly verify their results to help establish
  // baseline functionality.
  static auto PortableLoad(const uint8_t* metadata, ssize_t index)
      -> MetadataGroup;
  auto PortableStore(uint8_t* metadata, ssize_t index) const -> void;

  auto PortableClearDeleted() -> void;

  auto PortableMatch(uint8_t tag) const -> PortableMatchRange;
  auto PortableMatchPresent() const -> PortableMatchRange;

  auto PortableMatchEmpty() const -> MatchIndex;
  auto PortableMatchDeleted() const -> MatchIndex;

  static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs)
      -> bool;

  // SIMD implementations of each operation. We minimize platform-specific APIs
  // to reduce the scope of errors that can only be discovered building on one
  // platform, so the bodies of these contain the platform specific code. Their
  // behavior and semantics exactly match the documentation for the un-prefixed
  // functions.
  //
  // These routines don't directly verify their results as we can build simpler
  // debug checks by comparing them against the verified portable results.
  static auto SimdLoad(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
  auto SimdStore(uint8_t* metadata, ssize_t index) const -> void;

  auto SimdClearDeleted() -> void;

  auto SimdMatch(uint8_t tag) const -> SimdMatchRange;
  auto SimdMatchPresent() const -> SimdMatchPresentRange;

  auto SimdMatchEmpty() const -> MatchIndex;
  auto SimdMatchDeleted() const -> MatchIndex;

  static auto SimdCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

#if CARBON_X86_SIMD_SUPPORT
  // A common routine for x86 SIMD matching that can be used for matching
  // present, empty, and deleted bytes with equal efficiency.
  auto X86SimdMatch(uint8_t match_byte) const -> SimdMatchRange;
#endif
};
// Promote the size and mask to top-level constants as we'll need to operate on
// the grouped structure outside of the metadata bytes.
inline constexpr ssize_t GroupSize = MetadataGroup::Size;
inline constexpr ssize_t GroupMask = MetadataGroup::Mask;
// Loads a group of metadata bytes from `metadata` at `index`, using the SIMD
// load when enabled and cross-checking it against the portable load in debug
// builds.
inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup portable_g;
  if constexpr (!UseSimd || DebugSimd) {
    portable_g = PortableLoad(metadata, index);
    if constexpr (!UseSimd) {
      return portable_g;
    }
  }
  MetadataGroup g = SimdLoad(metadata, index);
  // When `UseSimd` is true but `DebugSimd` is false, `portable_g` stays
  // default-constructed here. That relies on `DebugSimd` only being false in
  // builds where `CARBON_DCHECK` compiles away -- NOTE(review): presumably the
  // `NDEBUG`/fuzzing condition on `DebugSimd` guarantees this; confirm.
  CARBON_DCHECK(g == portable_g);
  return g;
}
  512. inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const
  513. -> void {
  514. if constexpr (!UseSimd) {
  515. std::memcpy(metadata + index, &metadata_bytes, Size);
  516. } else {
  517. SimdStore(metadata, index);
  518. }
  519. CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size));
  520. }
// Clears the byte at `byte_index` to the empty value by masking it out of the
// single 64-bit integer view of the group. `Empty` is zero, so masking the
// byte to zero suffices.
//
// The `IsCalled` template parameter exists so the `static_assert`s only fire
// when the function is actually instantiated on a platform where byte clearing
// is not fast.
template <bool IsCalled>
inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void {
  static_assert(!IsCalled || FastByteClear,
                "Only use byte clearing when fast!");
  static_assert(!IsCalled || Size == 8,
                "The clear implementation assumes an 8-byte group.");

  metadata_ints[0] &= ~(static_cast<uint64_t>(0xff) << (byte_index * 8));
}
  529. inline auto MetadataGroup::ClearDeleted() -> void {
  530. MetadataGroup portable_g = *this;
  531. MetadataGroup simd_g = *this;
  532. if constexpr (!UseSimd || DebugSimd) {
  533. portable_g.PortableClearDeleted();
  534. }
  535. if constexpr (UseSimd || DebugSimd) {
  536. simd_g.SimdClearDeleted();
  537. CARBON_DCHECK(
  538. simd_g == portable_g,
  539. "SIMD cleared group '{0}' doesn't match portable cleared group '{1}'",
  540. simd_g, portable_g);
  541. }
  542. *this = UseSimd ? simd_g : portable_g;
  543. }
// Finds all present bytes whose low 7 bits equal `tag`, dispatching between
// the portable and SIMD implementations and cross-checking them in debug
// builds.
inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange {
  // The caller should provide us with the present byte hash, and not set any
  // present bit tag on it so that this layer can manage tagging the high bit of
  // a present byte.
  CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);

  PortableMatchRange portable_result;
  SimdMatchRange simd_result;
  if constexpr (!UseSimd || DebugSimd) {
    portable_result = PortableMatch(tag);
  }
  if constexpr (UseSimd || DebugSimd) {
    simd_result = SimdMatch(tag);
    CARBON_DCHECK(simd_result == portable_result,
                  "SIMD result '{0}' doesn't match portable result '{1}'",
                  simd_result, portable_result);
  }
  // Return whichever result we're using. The two ranges can be distinct types
  // (the SIMD range may defer masking), so the `constexpr` ternary helper is
  // needed to select the returned arm's type.
  return ConstexprTernary<UseSimd>(simd_result, portable_result);
}
  563. inline auto MetadataGroup::MatchPresent() const -> MatchPresentRange {
  564. PortableMatchRange portable_result;
  565. SimdMatchPresentRange simd_result;
  566. if constexpr (!UseSimd || DebugSimd) {
  567. portable_result = PortableMatchPresent();
  568. }
  569. if constexpr (UseSimd || DebugSimd) {
  570. simd_result = SimdMatchPresent();
  571. CARBON_DCHECK(simd_result == portable_result,
  572. "SIMD result '{0}' doesn't match portable result '{1}'",
  573. simd_result, portable_result);
  574. }
  575. // Return whichever result we're using.
  576. return ConstexprTernary<UseSimd>(simd_result, portable_result);
  577. }
  578. inline auto MetadataGroup::MatchEmpty() const -> MatchIndex {
  579. MatchIndex portable_result;
  580. MatchIndex simd_result;
  581. if constexpr (!UseSimd || DebugSimd) {
  582. portable_result = PortableMatchEmpty();
  583. }
  584. if constexpr (UseSimd || DebugSimd) {
  585. simd_result = SimdMatchEmpty();
  586. CARBON_DCHECK(simd_result == portable_result,
  587. "SIMD result '{0}' doesn't match portable result '{1}'",
  588. simd_result, portable_result);
  589. }
  590. return UseSimd ? simd_result : portable_result;
  591. }
  592. inline auto MetadataGroup::MatchDeleted() const -> MatchIndex {
  593. MatchIndex portable_result;
  594. MatchIndex simd_result;
  595. if constexpr (!UseSimd || DebugSimd) {
  596. portable_result = PortableMatchDeleted();
  597. }
  598. if constexpr (UseSimd || DebugSimd) {
  599. simd_result = SimdMatchDeleted();
  600. CARBON_DCHECK(simd_result == portable_result,
  601. "SIMD result '{0}' doesn't match portable result '{1}'",
  602. simd_result, portable_result);
  603. }
  604. return UseSimd ? simd_result : portable_result;
  605. }
  606. inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs)
  607. -> bool {
  608. bool portable_result;
  609. bool simd_result;
  610. if constexpr (!UseSimd || DebugSimd) {
  611. portable_result = PortableCompareEqual(lhs, rhs);
  612. }
  613. if constexpr (UseSimd || DebugSimd) {
  614. simd_result = SimdCompareEqual(lhs, rhs);
  615. CARBON_DCHECK(simd_result == portable_result);
  616. }
  617. return UseSimd ? simd_result : portable_result;
  618. }
// Debug helper: verifies that `index_bits` correctly encodes the *first*
// group byte for which `byte_match` returns true. Depending on
// `ByteEncoding`, matches are represented either as one bit per byte or as
// the high bit of a byte-wide lane. Always returns true so it can be wrapped
// directly in a `CARBON_DCHECK`.
inline auto MetadataGroup::VerifyIndexBits(
    MatchBitsT index_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((index_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
        // Only the first match is needed, so stop scanning once found; bits
        // past the first match are unconstrained.
        break;
      }
      CARBON_CHECK(((index_bits >> byte_index) & 1) == 0,
                   "Bit set at non-matching byte index: {0}", byte_index);
    } else {
      // `index_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        // Only the high bit is required; low bits of the lane may carry noise
        // that is lazily masked elsewhere.
        CARBON_CHECK(
            (index_byte & 0x80) == 0x80,
            "Should have the high bit set for a matching byte, found: {0:x}",
            index_byte);
        // Only the first match is needed so stop scanning once found.
        break;
      }
      CARBON_CHECK(
          index_byte == 0,
          "Should have no bits set for an unmatched byte, found: {0:x}",
          index_byte);
    }
  }
  return true;
}
// Debug helper: verifies that `range_bits` encodes *every* group byte for
// which `byte_match` returns true — one bit per byte, or exactly `0x80` per
// byte-wide lane under `ByteEncoding`. Unlike `VerifyIndexBits`, all matches
// must be represented, not just the first. Always returns true so it can be
// wrapped directly in a `CARBON_DCHECK`.
inline auto MetadataGroup::VerifyPortableRangeBits(
    MatchBitsT range_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 1,
                     "Bit not set at matching byte index: {0}", byte_index);
      } else {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 0,
                     "Bit set at non-matching byte index: {0}", byte_index);
      }
    } else {
      // `range_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        // Exactly the high bit must be set here — the portable range encoding
        // produces clean 0x80/0x00 lanes with no low-bit noise.
        CARBON_CHECK(range_byte == 0x80,
                     "Should just have the high bit set for a matching byte, "
                     "found: {0:x}",
                     range_byte);
      } else {
        CARBON_CHECK(
            range_byte == 0,
            "Should have no bits set for an unmatched byte, found: {0:x}",
            range_byte);
      }
    }
  }
  return true;
}
  683. inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index)
  684. -> MetadataGroup {
  685. MetadataGroup g;
  686. static_assert(sizeof(g) == Size);
  687. std::memcpy(&g.metadata_bytes, metadata + index, Size);
  688. return g;
  689. }
  690. inline auto MetadataGroup::PortableStore(uint8_t* metadata, ssize_t index) const
  691. -> void {
  692. std::memcpy(metadata + index, &metadata_bytes, Size);
  693. }
  694. inline auto MetadataGroup::PortableClearDeleted() -> void {
  695. for (uint64_t& metadata_int : metadata_ints) {
  696. // Deleted bytes have only the least significant bits set, so to clear them
  697. // we only need to clear the least significant bit. And empty bytes already
  698. // have a clear least significant bit, so the only least significant bits we
  699. // need to preserve are those of present bytes. The most significant bit of
  700. // every present byte is set, so we take the most significant bit of each
  701. // byte, shift it into the least significant bit position, and bit-or it
  702. // with the compliment of `Lsbs`. This will have ones for every bit but the
  703. // least significant bits, and ones for the least significant bits of every
  704. // present byte.
  705. metadata_int &= (~Lsbs | metadata_int >> 7);
  706. }
  707. }
// Portable tag matching using a branch-free byte-broadcast bit trick.
inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange {
  // The caller should provide us with the present byte hash, and not set any
  // present bit tag on it so that this layer can manage tagging the high bit of
  // a present byte.
  CARBON_DCHECK((tag & PresentMask) == 0, "{0:x}", tag);

  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    // A stored match is the tag with the present bit set.
    uint8_t present_byte = tag | PresentMask;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == present_byte) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }

  // This algorithm only works for matching *present* bytes. We leverage the
  // set high bit in the present case as part of the algorithm. The whole
  // algorithm has a critical path height of 4 operations, and does 6
  // operations total on AArch64. The operation dependency graph is:
  //
  //   group | Msbs        Lsbs * match_byte + Msbs
  //        \             /
  //      match_bits ^ broadcast
  //              |
  // group & Msbs   Msbs - match_bits
  //        \      /
  //   group_Msbs & match_bits
  //
  // This diagram and the operation count are specific to AArch64 where we have
  // a fused *integer* multiply-add operation.
  //
  // While it is superficially similar to the "find zero bytes in a word" bit
  // math trick, it is different because this is designed to have no false
  // positives and perfectly produce 0x80 for matching bytes and 0x00 for
  // non-matching bytes. This is do-able because we constrain to only handle
  // present matches which only require testing 7 bits and have a particular
  // layout.

  // Set the high bit of every byte to `1`. Any matching byte is a present byte
  // and so always has this bit set as well, which means the xor below, in
  // addition to zeroing the low 7 bits of any byte that matches the tag, also
  // clears the high bit of every byte.
  uint64_t match_bits = metadata_ints[0] | Msbs;
  // Broadcast the match byte to all bytes, and mask in the present bits in the
  // Msbs of each byte. We structure this as a multiply and an add because we
  // know that the add cannot carry (the tag's high bit is clear), and this way
  // it can be lowered using combined multiply-add instructions if available.
  uint64_t broadcast = Lsbs * tag + Msbs;
  CARBON_DCHECK(broadcast == (Lsbs * tag | Msbs),
                "Unexpected carry from addition!");

  // Xor the broadcast byte pattern. This makes bytes with matches become 0, and
  // clears the high-bits of non-matches. Note that if we are looking for a tag
  // with the same value as `Empty` or `Deleted`, those bytes will be zero as
  // well.
  match_bits = match_bits ^ broadcast;
  // Subtract each byte of `match_bits` from `0x80` bytes. After this, the high
  // bit will be set only for those bytes that were zero; since every byte is
  // <= 0x7f after the xor cleared the high bits, no borrow crosses byte lanes.
  match_bits = Msbs - match_bits;
  // Zero everything but the high bits, and also zero the high bits of any bytes
  // for "not present" slots in the original group. This avoids false positives
  // for `Empty` and `Deleted` bytes in the metadata.
  match_bits &= (metadata_ints[0] & Msbs);

  // At this point, `match_bits` has the high bit set for bytes where the
  // original group byte equals `tag` plus the high bit.
  CARBON_DCHECK(VerifyPortableRangeBits(
      match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); }));
  return MatchRange(match_bits);
}
  782. inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange {
  783. // Use a simple fallback approach for sizes beyond 8.
  784. // TODO: Instead of a simple fallback, we should generalize the below
  785. // algorithm for sizes above 8, even if to just exercise the same code on
  786. // more platforms.
  787. if constexpr (Size > 8) {
  788. static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
  789. uint32_t match_bits = 0;
  790. uint32_t bit = 1;
  791. for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
  792. if (metadata_bytes[i] & PresentMask) {
  793. match_bits |= bit;
  794. }
  795. bit <<= 1;
  796. }
  797. return MatchRange(match_bits);
  798. }
  799. // Want to keep the high bit of each byte, which indicates whether that byte
  800. // represents a present slot.
  801. uint64_t match_bits = metadata_ints[0] & Msbs;
  802. CARBON_DCHECK(VerifyPortableRangeBits(
  803. match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; }));
  804. return MatchRange(match_bits);
  805. }
  806. inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex {
  807. // Use a simple fallback approach for sizes beyond 8.
  808. // TODO: Instead of a simple fallback, we should generalize the below
  809. // algorithm for sizes above 8, even if to just exercise the same code on
  810. // more platforms.
  811. if constexpr (Size > 8) {
  812. static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
  813. uint32_t bit = 1;
  814. for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
  815. if (metadata_bytes[i] == Empty) {
  816. return MatchIndex(bit);
  817. }
  818. bit <<= 1;
  819. }
  820. return MatchIndex(0);
  821. }
  822. // This sets the high bit of every byte in `match_bits` unless the
  823. // corresponding metadata byte is 0. We take advantage of the fact that
  824. // the metadata bytes in are non-zero only if they are either:
  825. // - present: in which case the high bit of the byte will already be set; or
  826. // - deleted: in which case the byte will be 1, and shifting it left by 7 will
  827. // cause the high bit to be set.
  828. uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7);
  829. // This inverts the high bits of the bytes, and clears the remaining bits.
  830. match_bits = ~match_bits & Msbs;
  831. // The high bits of the bytes of `match_bits` are set if the corresponding
  832. // metadata byte is `Empty`.
  833. CARBON_DCHECK(
  834. VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; }));
  835. return MatchIndex(match_bits);
  836. }
  837. inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex {
  838. // Use a simple fallback approach for sizes beyond 8.
  839. // TODO: Instead of a simple fallback, we should generalize the below
  840. // algorithm for sizes above 8, even if to just exercise the same code on
  841. // more platforms.
  842. if constexpr (Size > 8) {
  843. static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
  844. uint32_t bit = 1;
  845. for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
  846. if (metadata_bytes[i] == Deleted) {
  847. return MatchIndex(bit);
  848. }
  849. bit <<= 1;
  850. }
  851. return MatchIndex(0);
  852. }
  853. // This sets the high bit of every byte in `match_bits` unless the
  854. // corresponding metadata byte is 1. We take advantage of the fact that the
  855. // metadata bytes are not 1 only if they are either:
  856. // - present: in which case the high bit of the byte will already be set; or
  857. // - empty: in which case the byte will be 0, and in that case inverting and
  858. // shifting left by 7 will have the high bit set.
  859. uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7);
  860. // This inverts the high bits of the bytes, and clears the remaining bits.
  861. match_bits = ~match_bits & Msbs;
  862. // The high bits of the bytes of `match_bits` are set if the corresponding
  863. // metadata byte is `Deleted`.
  864. CARBON_DCHECK(VerifyIndexBits(match_bits,
  865. [](uint8_t byte) { return byte == Deleted; }));
  866. return MatchIndex(match_bits);
  867. }
  868. inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs,
  869. MetadataGroup rhs) -> bool {
  870. return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes);
  871. }
  872. inline auto MetadataGroup::SimdLoad(const uint8_t* metadata, ssize_t index)
  873. -> MetadataGroup {
  874. MetadataGroup g;
  875. #if CARBON_NEON_SIMD_SUPPORT
  876. g.metadata_vec = vld1_u8(metadata + index);
  877. #elif CARBON_X86_SIMD_SUPPORT
  878. g.metadata_vec =
  879. _mm_load_si128(reinterpret_cast<const __m128i*>(metadata + index));
  880. #else
  881. static_assert(!UseSimd, "Unimplemented SIMD operation");
  882. static_cast<void>(metadata);
  883. static_cast<void>(index);
  884. #endif
  885. return g;
  886. }
// Stores the group's SIMD vector back into the metadata array at `index`.
// NOTE(review): the x86 path uses `_mm_store_si128`, an *aligned* store, so
// `metadata + index` must be 16-byte aligned — the matching aligned load in
// `SimdLoad` implies the table layout guarantees this; confirm at call sites.
inline auto MetadataGroup::SimdStore(uint8_t* metadata, ssize_t index) const
    -> void {
#if CARBON_NEON_SIMD_SUPPORT
  vst1_u8(metadata + index, metadata_vec);
#elif CARBON_X86_SIMD_SUPPORT
  _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec);
#else
  // No SIMD implementation for this target; this path must never be
  // instantiated with SIMD enabled.
  static_assert(!UseSimd, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
}
// SIMD implementation of `ClearDeleted`: rewrites `Deleted` bytes to `Empty`.
inline auto MetadataGroup::SimdClearDeleted() -> void {
#if CARBON_NEON_SIMD_SUPPORT
  // There is no good Neon operation to implement this, so do it using integer
  // code. This is reasonably fast, but unfortunate because it forces the group
  // out of a SIMD register and into a general purpose register, which can have
  // high latency. (Same bit trick as `PortableClearDeleted`.)
  metadata_ints[0] &= (~Lsbs | metadata_ints[0] >> 7);
#elif CARBON_X86_SIMD_SUPPORT
  // For each byte, use `metadata_vec` if the byte's high bit is set (indicating
  // it is present), otherwise (it is empty or deleted) replace it with zero
  // (representing empty). `_mm_blendv_epi8` selects per byte based on the high
  // bit of its third (mask) operand — here the metadata itself.
  metadata_vec =
      _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec);
#else
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
#endif
}
  916. inline auto MetadataGroup::SimdMatch(uint8_t tag) const -> SimdMatchRange {
  917. SimdMatchRange result;
  918. #if CARBON_NEON_SIMD_SUPPORT
  919. // Broadcast byte we want to match to every byte in the vector.
  920. auto match_byte_vec = vdup_n_u8(tag | PresentMask);
  921. // Result bytes have all bits set for the bytes that match, so we have to
  922. // clear everything but Msbs next.
  923. auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec);
  924. uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0];
  925. // Note that the range will lazily mask to the Msbs as part of incrementing.
  926. result = SimdMatchRange(match_bits);
  927. #elif CARBON_X86_SIMD_SUPPORT
  928. result = X86SimdMatch(tag | PresentMask);
  929. #else
  930. static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
  931. static_cast<void>(tag);
  932. #endif
  933. return result;
  934. }
  935. inline auto MetadataGroup::SimdMatchPresent() const -> SimdMatchPresentRange {
  936. SimdMatchPresentRange result;
  937. #if CARBON_NEON_SIMD_SUPPORT
  938. // Just extract the metadata directly.
  939. uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0];
  940. // Even though the Neon SIMD range will do its own masking, we have to mask
  941. // here so that `empty` is correct.
  942. result = SimdMatchPresentRange(match_bits & Msbs);
  943. #elif CARBON_X86_SIMD_SUPPORT
  944. // We arranged the byte vector so that present bytes have the high bit set,
  945. // which this instruction extracts.
  946. result = SimdMatchPresentRange(_mm_movemask_epi8(metadata_vec));
  947. #else
  948. static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
  949. #endif
  950. return result;
  951. }
  952. inline auto MetadataGroup::SimdMatchEmpty() const -> MatchIndex {
  953. MatchIndex result;
  954. #if CARBON_NEON_SIMD_SUPPORT
  955. // Compare all bytes with zero, as that is the empty byte value. Result will
  956. // have all bits set for any input zero byte, so we zero all but the high bits
  957. // below.
  958. auto cmp_vec = vceqz_u8(metadata_vec);
  959. uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0];
  960. // The matched range is likely to be tested for zero by the caller, and that
  961. // test can often be folded into masking the bits with `Msbs` when we do that
  962. // mask in the scalar domain rather than the SIMD domain. So we do the mask
  963. // here rather than above prior to extracting the match bits.
  964. result = MatchIndex(metadata_bits & Msbs);
  965. #elif CARBON_X86_SIMD_SUPPORT
  966. // Even though we only need the first match rather than all matches, we don't
  967. // have a more efficient way to compute this on x86 and so we reuse the
  968. // general match infrastructure that computes all matches in a bit-encoding.
  969. // We then convert it into a `MatchIndex` that just finds the first one.
  970. result = static_cast<MatchIndex>(X86SimdMatch(Empty));
  971. #else
  972. static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
  973. #endif
  974. return result;
  975. }
  976. inline auto MetadataGroup::SimdMatchDeleted() const -> MatchIndex {
  977. MatchIndex result;
  978. #if CARBON_NEON_SIMD_SUPPORT
  979. // Broadcast the `Deleted` byte across the vector and compare the bytes of
  980. // that with the metadata vector. The result will have all bits set for any
  981. // input zero byte, so we zero all but the high bits below.
  982. auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted));
  983. uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0];
  984. // The matched range is likely to be tested for zero by the caller, and that
  985. // test can often be folded into masking the bits with `Msbs` when we do that
  986. // mask in the scalar domain rather than the SIMD domain. So we do the mask
  987. // here rather than above prior to extracting the match bits.
  988. result = MatchIndex(match_bits & Msbs);
  989. #elif CARBON_X86_SIMD_SUPPORT
  990. // Even though we only need the first match rather than all matches, we don't
  991. // have a more efficient way to compute this on x86 and so we reuse the
  992. // general match infrastructure that computes all matches in a bit-encoding.
  993. // We then convert it into a `MatchIndex` that just finds the first one.
  994. result = static_cast<MatchIndex>(X86SimdMatch(Deleted));
  995. #else
  996. static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
  997. #endif
  998. return result;
  999. }
// SIMD implementation of whole-group equality comparison.
inline auto MetadataGroup::SimdCompareEqual(MetadataGroup lhs,
                                            MetadataGroup rhs) -> bool {
#if CARBON_NEON_SIMD_SUPPORT
  // Byte-wise compare: equal lanes are all-ones, so the two groups are equal
  // exactly when the extracted 64-bit result is all ones.
  return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] ==
         static_cast<uint64_t>(-1LL);
#elif CARBON_X86_SIMD_SUPPORT
  // Different x86 SIMD extensions provide different comparison functionality
  // available.
#if __SSE4_2__
  // With SSE 4.2, we can directly test and branch in the SIMD domain on whether
  // the two metadata vectors are equal. `_mm_testc_si128(a, ones)` returns 1
  // exactly when every bit of `a` is set, i.e. every byte compared equal.
  return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec),
                         _mm_set1_epi8(0xff)) == 1;
#else
  // With older versions of SSE we have to extract the result of the comparison,
  // much like we do when matching. That will have the usual bitmask
  // representing equal bytes, and test for that exact bitmask in scalar code.
  return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec,
                                          rhs.metadata_vec)) == 0x0000'ffffU;
#endif
#else
  // No SIMD implementation for this target; unreachable, but keep the function
  // well-formed with a return.
  static_assert(!UseSimd && !DebugSimd, "Unimplemented SIMD operation");
  static_cast<void>(lhs);
  static_cast<void>(rhs);
  return false;
#endif
}
  1027. #if CARBON_X86_SIMD_SUPPORT
  1028. inline auto MetadataGroup::X86SimdMatch(uint8_t match_byte) const
  1029. -> MatchRange {
  1030. // Broadcast the byte we're matching against to all bytes in a vector, and
  1031. // compare those bytes with the metadata vector bytes.
  1032. auto match_byte_vec = _mm_set1_epi8(match_byte);
  1033. auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec);
  1034. // Extract the result of each byte-wise comparison into the low bits of an
  1035. // integer.
  1036. uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec);
  1037. return MatchRange(match_bits);
  1038. }
  1039. #endif
  1040. } // namespace Carbon::RawHashtable
  1041. #endif // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_