// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_
#define CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>

#include "common/check.h"
#include "common/ostream.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"

// Detect whether we can use SIMD accelerated implementations of the control
// groups, and include the relevant platform specific APIs for the SIMD
// implementations.
//
// Reference documentation for the SIMD APIs used here:
// - https://arm-software.github.io/acle/neon_intrinsics/advsimd.html
// - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
#if defined(__SSSE3__)
#include <x86intrin.h>
#define CARBON_X86_SIMD_SUPPORT 1
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#define CARBON_NEON_SIMD_SUPPORT 1
#endif
// This namespace collects low-level utilities for implementing hashtable
// data structures. This file only provides one of them:
//
// - Primitives to manage "groups" of hashtable entries that have densely
//   packed control bytes we can scan rapidly as a group, often using SIMD
//   facilities to process the entire group at once.
namespace Carbon::RawHashtable {

// We define a constant max group size. The particular group size used in
// practice may vary, but we want to have some upper bound used to ensure
// memory allocation is done consistently across different architectures.
constexpr ssize_t MaxGroupSize = 16;
// This takes a collection of bits representing the results of looking for a
// particular tag in this metadata group and determines the first position with
// a match. The position is represented by either the least significant set bit
// or the least significant non-zero byte, depending on `ByteEncoding`. When
// represented with a non-zero byte, that byte must have at least its most
// significant bit set, but may have other bits set to any value. Bits more
// significant than the match may have any value provided there is at least one
// match. Zero matches must be represented by a zero input.
//
// Some bits of the underlying value may be known-zero, which can optimize
// various operations. These can be represented as a `ZeroMask`.
template <typename BitsInputT, bool ByteEncoding, BitsInputT ZeroMask = 0>
class BitIndex
    : public Printable<BitIndex<BitsInputT, ByteEncoding, ZeroMask>> {
 public:
  using BitsT = BitsInputT;

  BitIndex() = default;
  explicit BitIndex(BitsT bits) : bits_(bits) {}

  friend auto operator==(BitIndex lhs, BitIndex rhs) -> bool {
    if (lhs.empty() || rhs.empty()) {
      return lhs.empty() == rhs.empty();
    }
    // For non-empty bit indices, compare the indices directly to ignore other
    // (extraneous) parts of the incoming bits.
    return lhs.index() == rhs.index();
  }

  auto Print(llvm::raw_ostream& out) const -> void {
    out << llvm::formatv("{0:x}", bits_);
  }

  explicit operator bool() const { return !empty(); }

  // Returns true when there are no matches for the tag.
  auto empty() const -> bool {
    CARBON_DCHECK((bits_ & ZeroMask) == 0) << "Unexpected non-zero bits!";
    __builtin_assume((bits_ & ZeroMask) == 0);
    return bits_ == 0;
  }

  // Returns the index of the first matched tag.
  auto index() -> ssize_t {
    CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
    __builtin_assume(bits_ != 0);
    ssize_t index = unscaled_index();
    if constexpr (ByteEncoding) {
      // Shift to scale out of the byte encoding.
      index >>= ByteEncodingShift;
    }
    return index;
  }

  // Optimized tool to index a pointer `p` by `index()`.
  template <typename T>
  auto index_ptr(T* pointer) -> T* {
    CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
    __builtin_assume(bits_ != 0);
    if constexpr (!ByteEncoding) {
      return &pointer[unscaled_index()];
    }
    ssize_t index = unscaled_index();
    // Scale the index as we counted zero *bits* and not zero *bytes*.
    // However, we can fold that scale with the size of `T` when it is a power
    // of two or divisible by 8.
    CARBON_DCHECK(
        (index & ((static_cast<size_t>(1) << ByteEncodingShift) - 1)) == 0);
    if constexpr (sizeof(T) % 8 == 0) {
      constexpr size_t FoldedScale = sizeof(T) / 8;
      index *= FoldedScale;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    } else if constexpr (llvm::isPowerOf2_64(sizeof(T))) {
      constexpr size_t ScaleShift = llvm::CTLog2<sizeof(T)>();
      static_assert(ScaleShift <= ByteEncodingShift,
                    "Scaling by >=8 should be handled above!");
      constexpr size_t FoldedShift = ByteEncodingShift - ScaleShift;
      index >>= FoldedShift;
      return reinterpret_cast<T*>(
          &reinterpret_cast<std::byte*>(pointer)[index]);
    }
    // Nothing we can fold here.
    return &pointer[index >> ByteEncodingShift];
  }

 private:
  // When using a byte encoding, we'll need to shift any index by this amount.
  static constexpr size_t ByteEncodingShift = 3;

  auto unscaled_index() -> ssize_t {
    if constexpr (!ByteEncoding) {
      // Note the cast to `size_t` to force zero extending the result.
      return static_cast<size_t>(llvm::countr_zero(bits_));
    } else {
      // The index is encoded in the high bit of each byte. We compute the
      // index by counting the number of low zero bytes there are before the
      // first byte with its high bit set. Rather than shifting the high bit
      // to be the low bit and counting the trailing (least significant) zero
      // bits directly, we instead byte-reverse the bits and count the
      // *leading* (most significant) zero bits. While this may be a wash on
      // CPUs with direct support for counting the trailing zero bits, AArch64
      // only supports counting the leading zero bits and requires a
      // bit-reverse to count the trailing zero bits. Doing the byte-reverse
      // approach essentially combines moving the high bit into the low bit
      // and the reverse necessary for counting the zero bits. While this only
      // removes one instruction, it is an instruction on the critical path of
      // the hottest part of table lookup, and that critical path is short
      // enough that removing even one instruction significantly impacts
      // latency.
      //
      // We also cast to `size_t` to clearly zero-extend the result.
      return static_cast<size_t>(llvm::countl_zero(llvm::byteswap(bits_)));
    }
  }

  BitsT bits_ = 0;
};
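
// As a small worked illustration of the two encodings (illustrative only, not
// part of the API contract): with bit encoding, the bits 0b0000'0100 yield
// `index() == 2`; with byte encoding, the 64-bit value 0x0000'0000'0080'0000,
// which has the high bit of byte 2 set, also yields `index() == 2` after the
// `>> 3` scaling by `ByteEncodingShift`.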
// This is like `BitIndex`, but allows iterating through all of the matches.
//
// A key requirement for efficient iteration is that all of the matches are
// represented with a single bit and there are no other bits set. For example,
// with byte-encoded bit indices, exactly the high bit and no other bit of each
// matching byte must be set. This is a stricter constraint than what
// `BitIndex` alone would impose on any one of the matches.
template <typename BitIndexT>
class BitIndexRange : public Printable<BitIndexRange<BitIndexT>> {
 public:
  using BitsT = BitIndexT::BitsT;

  class Iterator
      : public llvm::iterator_facade_base<Iterator, std::forward_iterator_tag,
                                          ssize_t, ssize_t> {
   public:
    Iterator() = default;
    explicit Iterator(BitsT bits) : bits_(bits) {}

    auto operator==(const Iterator& rhs) const -> bool {
      return bits_ == rhs.bits_;
    }

    auto operator*() -> ssize_t& {
      CARBON_DCHECK(bits_ != 0) << "Cannot get an index from zero bits!";
      __builtin_assume(bits_ != 0);
      index_ = BitIndexT(bits_).index();
      // Note that we store the index in a member so we can return a reference
      // to it here as required to be a forward iterator.
      return index_;
    }

    template <typename T>
    auto index_ptr(T* pointer) -> T* {
      return BitIndexT(bits_).index_ptr(pointer);
    }

    auto operator++() -> Iterator& {
      CARBON_DCHECK(bits_ != 0) << "Must not increment past the end!";
      __builtin_assume(bits_ != 0);
      // Clears the least significant set bit, effectively stepping to the
      // next match.
      bits_ &= (bits_ - 1);
      return *this;
    }

   private:
    ssize_t index_;
    BitsT bits_ = 0;
  };

  BitIndexRange() = default;
  explicit BitIndexRange(BitsT bits) : bits_(bits) {}

  explicit operator bool() const { return !empty(); }
  auto empty() const -> bool { return BitIndexT(bits_).empty(); }

  auto begin() const -> Iterator { return Iterator(bits_); }
  auto end() const -> Iterator { return Iterator(); }

  friend auto operator==(BitIndexRange lhs, BitIndexRange rhs) -> bool {
    return lhs.bits_ == rhs.bits_;
  }

  auto Print(llvm::raw_ostream& out) const -> void {
    out << llvm::formatv("{0:x}", bits_);
  }

  explicit operator BitsT() const { return bits_; }
  explicit operator BitIndexT() const { return BitIndexT(bits_); }

 private:
  BitsT bits_ = 0;
};
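
// A minimal usage sketch (assuming a `BitIndexRange` value `matches` produced
// by one of the `Match*` operations below):
//
// ```cpp
// for (ssize_t byte_index : matches) {
//   // Visits each matching byte index, least significant first.
// }
// ```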
// A group of metadata bytes that can be manipulated together.
//
// The metadata bytes used in Carbon's hashtable implementation are designed to
// support being manipulated as groups, either using architecture specific SIMD
// code sequences or using portable SIMD-in-an-integer-register code sequences.
// These operations are unusually performance sensitive, sometimes in
// surprising ways. The implementations here are crafted specifically to
// optimize the particular usages in Carbon's hashtable and should not be
// expected to be reusable in any other context.
//
// Throughout the functions operating on this type we try to use patterns with
// a fallback portable implementation which can be directly used in the absence
// of a SIMD implementation, but is also used (with the same code) to check
// that any SIMD implementation produces the same result as the portable one.
// These patterns help minimize un-compiled or un-tested paths through either
// portable or SIMD code, regardless of which path is actually *used* on a
// particular platform. To illustrate a common version of this pattern, we
// might have code like:
//
// ```cpp
// auto MetadataGroup::Operation(...) -> ... {
//   ... portable_result;
//   ... simd_result;
//   if constexpr (!UseSIMD || DebugSIMD) {
//     portable_result = PortableOperation(...);
//   }
//   if constexpr (UseSIMD || DebugSIMD) {
//     simd_result = SIMDOperation(...);
//     CARBON_DCHECK(simd_result == portable_result) << ...;
//   }
//   return UseSIMD ? simd_result : portable_result;
// }
// ```
class MetadataGroup : public Printable<MetadataGroup> {
 public:
  static constexpr ssize_t Size =
#if CARBON_X86_SIMD_SUPPORT
      16;
#else
      8;
#endif
  static_assert(Size >= 8);
  static_assert(Size % 8 == 0);
  static_assert(Size <= MaxGroupSize);
  static_assert(MaxGroupSize % Size == 0);
  static_assert(llvm::isPowerOf2_64(Size),
                "The group size must be a constant power of two so dividing "
                "by it is a simple shift.");
  static constexpr ssize_t Mask = Size - 1;

  // Each control byte can have special values. All special values have the
  // most significant bit cleared to distinguish them from the seven hash bits
  // stored when the control byte represents a full bucket.
  //
  // Otherwise, their values are chosen primarily to provide efficient SIMD
  // implementations of the common operations on an entire control group.
  static constexpr uint8_t Empty = 0;
  static constexpr uint8_t Deleted = 1;
  static constexpr uint8_t PresentMask = 0b1000'0000;
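
  // For example, a 7-bit tag of 0x2f stored into a full bucket becomes the
  // metadata byte 0xaf (`0x2f | PresentMask`), while 0x00 and 0x01 remain
  // reserved for `Empty` and `Deleted` respectively.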

  // Some architectures make it much more efficient to build the match indices
  // in a byte-encoded form rather than a bit-encoded form. This encoding
  // changes verification and other aspects of our algorithms.
  static constexpr bool ByteEncoding =
#if CARBON_X86_SIMD_SUPPORT
      false;
#else
      true;
#endif
  static_assert(!ByteEncoding || Size == 8,
                "We can only support byte encoding with a group size of 8.");

  // We need to indicate to users of the metadata group when they can hold a
  // group value in a "register" (local variable) across clearing of individual
  // bytes in the group efficiently. If the entire group can fit in an integer
  // register, this works well and clients of the group should work to use the
  // already-loaded value when clearing bytes. But when we have a larger group
  // size, clearing the byte will typically require storing a byte to memory
  // and re-loading the group. The usage patterns that need to clear bytes can
  // in those cases avoid clearing a loaded group, and clear the byte directly
  // in the larger metadata array. See the sketch just below.
  static constexpr bool FastByteClear = Size == 8;
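
  // A caller-side sketch of that choice (the names outside this class are
  // assumptions, not part of this API):
  //
  // ```cpp
  // if constexpr (MetadataGroup::FastByteClear) {
  //   group.ClearByte(byte_index);
  //   group.Store(metadata, group_index);
  // } else {
  //   metadata[group_index + byte_index] = MetadataGroup::Empty;
  // }
  // ```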

  using MatchIndex =
      BitIndex<std::conditional_t<ByteEncoding, uint64_t, uint32_t>,
               ByteEncoding,
               /*ZeroMask=*/ByteEncoding ? 0 : (~0U << Size)>;
  using MatchRange = BitIndexRange<MatchIndex>;

  union {
    uint8_t metadata_bytes[Size];
    uint64_t metadata_ints[Size / 8];
#if CARBON_NEON_SIMD_SUPPORT
    uint8x8_t metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#elif CARBON_X86_SIMD_SUPPORT
    __m128i metadata_vec = {};
    static_assert(sizeof(metadata_vec) == Size);
#endif
  };

  auto Print(llvm::raw_ostream& out) const -> void;

  friend auto operator==(MetadataGroup lhs, MetadataGroup rhs) -> bool {
    return CompareEqual(lhs, rhs);
  }

  // The main API for this class. This API will switch between a portable and
  // SIMD implementation based on what is most efficient, but in debug builds
  // will cross check that the implementations do not diverge.

  // Load and return a group of metadata bytes out of the main metadata array
  // at a particular `index`. The index must be a multiple of `GroupSize`. This
  // will arrange for the load to place the group into the correct structure
  // for efficient register-based processing.
  static auto Load(const uint8_t* metadata, ssize_t index) -> MetadataGroup;

  // Store this metadata group into the main metadata array at the provided
  // `index`. The index must be a multiple of `GroupSize`.
  auto Store(uint8_t* metadata, ssize_t index) const -> void;

  // Clear a byte of this group's metadata at the provided `byte_index` to the
  // empty value. Note that this must only be called when `FastByteClear` is
  // true -- in all other cases users of this class should arrange to clear
  // individual bytes in the underlying array rather than using the group API.
  auto ClearByte(ssize_t byte_index) -> void;

  // Clear all of this group's metadata bytes that indicate a deleted slot to
  // the empty value.
  auto ClearDeleted() -> void;

  // Find all of the bytes of metadata in this group that are present and whose
  // low 7 bits match the provided `tag`. The `tag` byte must have a clear high
  // bit, as only 7 bits of tag are used. Note that this means the provided tag
  // is *not* the actual present metadata byte -- this function is responsible
  // for mapping the tag into that form as it can do so more efficiently in
  // some cases. A range over all of the byte indices which matched is
  // returned.
  auto Match(uint8_t tag) const -> MatchRange;

  // Find all of the present bytes of metadata in this group. A range over all
  // of the byte indices which are present is returned.
  auto MatchPresent() const -> MatchRange;

  // Find the first byte of the metadata group that is empty and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first"; any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no
  // bytes match the empty metadata.
  auto MatchEmpty() const -> MatchIndex;

  // Find the first byte of the metadata group that is deleted and return that
  // index. There is no order or position required for which of the bytes of
  // metadata is considered "first"; any model will do that makes it efficient
  // to produce the matching index. Must return an empty match index if no
  // bytes match the deleted metadata.
  auto MatchDeleted() const -> MatchIndex;
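
  // A typical lookup over one group combines these operations roughly as
  // follows (a sketch only; `metadata`, `group_index`, `entries`, and `key`
  // are assumed caller-side names, not part of this API):
  //
  // ```cpp
  // MetadataGroup g = MetadataGroup::Load(metadata, group_index);
  // for (ssize_t byte_index : g.Match(tag)) {
  //   if (entries[group_index + byte_index].key == key) {
  //     // Found the entry.
  //   }
  // }
  // if (g.MatchEmpty()) {
  //   // An empty slot terminates the probe sequence: the key is absent.
  // }
  // ```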

 private:
  // Two classes only defined in the benchmark code are allowed to directly
  // call the portable and SIMD implementations for benchmarking purposes.
  friend class BenchmarkPortableMetadataGroup;
  friend class BenchmarkSIMDMetadataGroup;

  // Whether to use a SIMD implementation. Even when we *support* a SIMD
  // implementation, we do not always have to use it in the event that it is
  // less efficient than the portable version.
  static constexpr bool UseSIMD =
#if CARBON_X86_SIMD_SUPPORT
      true;
#else
      false;
#endif

  // All SIMD variants that we have an implementation for should be enabled for
  // debugging. This lets us maintain a SIMD implementation even if it is not
  // used due to performance reasons, and easily re-enable it if the
  // performance changes.
  static constexpr bool DebugSIMD =
#if !defined(NDEBUG) && (CARBON_NEON_SIMD_SUPPORT || CARBON_X86_SIMD_SUPPORT)
      true;
#else
      false;
#endif

  // Masks with the most significant bit and the least significant bit of
  // every byte set, respectively.
  static constexpr uint64_t MSBs = 0x8080'8080'8080'8080ULL;
  static constexpr uint64_t LSBs = 0x0101'0101'0101'0101ULL;

  using MatchBitsT = MatchIndex::BitsT;

  static auto CompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

  // Functions for validating the returned matches agree with what is predicted
  // by the `byte_match` function. These either `CHECK`-fail or return true. To
  // pass validation, the `*_bits` argument must have `0x80` for those bytes
  // where `byte_match` returns true, and `0` for the rest.

  // `VerifyIndexBits` is for functions that return `MatchIndex`, as they only
  // promise to return accurate information up to the first match.
  auto VerifyIndexBits(
      MatchBitsT index_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // `VerifyRangeBits` is for functions that return `MatchRange`, and so it
  // validates all the bytes of `range_bits`.
  auto VerifyRangeBits(
      MatchBitsT range_bits,
      llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool;

  // Portable implementations of each operation. These are used on platforms
  // without SIMD support or where the portable implementation is faster than
  // SIMD. They are heavily optimized even though they are not SIMD because we
  // expect there to be platforms where the portable implementation can
  // outperform SIMD. Their behavior and semantics exactly match the
  // documentation for the un-prefixed functions.
  //
  // In debug builds, these also directly verify their results to help
  // establish baseline functionality.
  static auto PortableLoad(const uint8_t* metadata, ssize_t index)
      -> MetadataGroup;
  auto PortableStore(uint8_t* metadata, ssize_t index) const -> void;

  auto PortableClearDeleted() -> void;

  auto PortableMatch(uint8_t tag) const -> MatchRange;
  auto PortableMatchPresent() const -> MatchRange;
  auto PortableMatchEmpty() const -> MatchIndex;
  auto PortableMatchDeleted() const -> MatchIndex;

  static auto PortableCompareEqual(MetadataGroup lhs, MetadataGroup rhs)
      -> bool;

  // SIMD implementations of each operation. We minimize platform-specific APIs
  // to reduce the scope of errors that can only be discovered building on one
  // platform, so the bodies of these contain the platform specific code. Their
  // behavior and semantics exactly match the documentation for the un-prefixed
  // functions.
  //
  // These routines don't directly verify their results as we can build simpler
  // debug checks by comparing them against the verified portable results.
  static auto SIMDLoad(const uint8_t* metadata, ssize_t index) -> MetadataGroup;
  auto SIMDStore(uint8_t* metadata, ssize_t index) const -> void;

  auto SIMDClearDeleted() -> void;

  auto SIMDMatch(uint8_t tag) const -> MatchRange;
  auto SIMDMatchPresent() const -> MatchRange;
  auto SIMDMatchEmpty() const -> MatchIndex;
  auto SIMDMatchDeleted() const -> MatchIndex;

  static auto SIMDCompareEqual(MetadataGroup lhs, MetadataGroup rhs) -> bool;

#if CARBON_X86_SIMD_SUPPORT
  // A common routine for x86 SIMD matching that can be used for matching
  // present, empty, and deleted bytes with equal efficiency.
  auto X86SIMDMatch(uint8_t match_byte) const -> MatchRange;
#endif
};

// Promote the size and mask to top-level constants as we'll need to operate on
// the grouped structure outside of the metadata bytes.
inline constexpr ssize_t GroupSize = MetadataGroup::Size;
inline constexpr ssize_t GroupMask = MetadataGroup::Mask;

inline auto MetadataGroup::Load(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup portable_g;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_g = PortableLoad(metadata, index);
    if constexpr (!UseSIMD) {
      return portable_g;
    }
  }
  MetadataGroup g = SIMDLoad(metadata, index);
  CARBON_DCHECK(g == portable_g);
  return g;
}

inline auto MetadataGroup::Store(uint8_t* metadata, ssize_t index) const
    -> void {
  if constexpr (!UseSIMD) {
    std::memcpy(metadata + index, &metadata_bytes, Size);
  } else {
    SIMDStore(metadata, index);
  }
  CARBON_DCHECK(0 == std::memcmp(metadata + index, &metadata_bytes, Size));
}

inline auto MetadataGroup::ClearByte(ssize_t byte_index) -> void {
  CARBON_DCHECK(FastByteClear) << "Only use byte clearing when fast!";
  CARBON_DCHECK(Size == 8)
      << "The clear implementation assumes an 8-byte group.";

  metadata_ints[0] &= ~(static_cast<uint64_t>(0xff) << (byte_index * 8));
}
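
// For example, `ClearByte(2)` builds the mask ~0x0000'0000'00ff'0000, so byte
// 2 of the group is cleared to `Empty` (zero) while the other seven bytes are
// left untouched.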

inline auto MetadataGroup::ClearDeleted() -> void {
  MetadataGroup portable_g = *this;
  MetadataGroup simd_g = *this;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_g.PortableClearDeleted();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_g.SIMDClearDeleted();
    CARBON_DCHECK(simd_g == portable_g)
        << "SIMD cleared group '" << simd_g
        << "' doesn't match portable cleared group '" << portable_g << "'";
  }
  *this = UseSIMD ? simd_g : portable_g;
}

inline auto MetadataGroup::Match(uint8_t tag) const -> MatchRange {
  // The caller should provide just the 7-bit tag from the hash, without the
  // present bit set, so that this layer can manage tagging the high bit of a
  // present byte.
  CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag);

  MatchRange portable_result;
  MatchRange simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatch(tag);
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatch(tag);
    CARBON_DCHECK(simd_result == portable_result)
        << "SIMD result '" << simd_result
        << "' doesn't match portable result '" << portable_result << "'";
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::MatchPresent() const -> MatchRange {
  MatchRange portable_result;
  MatchRange simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatchPresent();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatchPresent();
    CARBON_DCHECK(simd_result == portable_result)
        << "SIMD result '" << simd_result
        << "' doesn't match portable result '" << portable_result << "'";
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::MatchEmpty() const -> MatchIndex {
  MatchIndex portable_result;
  MatchIndex simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatchEmpty();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatchEmpty();
    CARBON_DCHECK(simd_result == portable_result)
        << "SIMD result '" << simd_result
        << "' doesn't match portable result '" << portable_result << "'";
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::MatchDeleted() const -> MatchIndex {
  MatchIndex portable_result;
  MatchIndex simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableMatchDeleted();
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDMatchDeleted();
    CARBON_DCHECK(simd_result == portable_result)
        << "SIMD result '" << simd_result
        << "' doesn't match portable result '" << portable_result << "'";
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::CompareEqual(MetadataGroup lhs, MetadataGroup rhs)
    -> bool {
  bool portable_result;
  bool simd_result;
  if constexpr (!UseSIMD || DebugSIMD) {
    portable_result = PortableCompareEqual(lhs, rhs);
  }
  if constexpr (UseSIMD || DebugSIMD) {
    simd_result = SIMDCompareEqual(lhs, rhs);
    CARBON_DCHECK(simd_result == portable_result);
  }
  return UseSIMD ? simd_result : portable_result;
}

inline auto MetadataGroup::VerifyIndexBits(
    MatchBitsT index_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((index_bits >> byte_index) & 1) == 1)
            << "Bit not set at matching byte index: " << byte_index;
        // Only the first match is needed, so stop scanning once found.
        break;
      }
      CARBON_CHECK(((index_bits >> byte_index) & 1) == 0)
          << "Bit set at non-matching byte index: " << byte_index;
    } else {
      // `index_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t index_byte = (index_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK((index_byte & 0x80) == 0x80)
            << "Should have the high bit set for a matching byte, found: "
            << llvm::formatv("{0:x}", index_byte);
        // Only the first match is needed, so stop scanning once found.
        break;
      }
      CARBON_CHECK(index_byte == 0)
          << "Should have no bits set for an unmatched byte, found: "
          << llvm::formatv("{0:x}", index_byte);
    }
  }
  return true;
}

inline auto MetadataGroup::VerifyRangeBits(
    MatchBitsT range_bits,
    llvm::function_ref<auto(uint8_t byte)->bool> byte_match) const -> bool {
  for (ssize_t byte_index : llvm::seq<ssize_t>(0, Size)) {
    if constexpr (!ByteEncoding) {
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 1)
            << "Bit not set at matching byte index: " << byte_index;
      } else {
        CARBON_CHECK(((range_bits >> byte_index) & 1) == 0)
            << "Bit set at non-matching byte index: " << byte_index;
      }
    } else {
      // `range_bits` is byte-encoded rather than bit encoded, so extract a
      // byte.
      uint8_t range_byte = (range_bits >> (byte_index * 8)) & 0xFF;
      if (byte_match(metadata_bytes[byte_index])) {
        CARBON_CHECK(range_byte == 0x80)
            << "Should just have the high bit set for a matching byte, found: "
            << llvm::formatv("{0:x}", range_byte);
      } else {
        CARBON_CHECK(range_byte == 0)
            << "Should have no bits set for an unmatched byte, found: "
            << llvm::formatv("{0:x}", range_byte);
      }
    }
  }
  return true;
}

inline auto MetadataGroup::PortableLoad(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup g;
  static_assert(sizeof(g) == Size);
  std::memcpy(&g.metadata_bytes, metadata + index, Size);
  return g;
}

inline auto MetadataGroup::PortableStore(uint8_t* metadata, ssize_t index) const
    -> void {
  std::memcpy(metadata + index, &metadata_bytes, Size);
}

inline auto MetadataGroup::PortableClearDeleted() -> void {
  for (uint64_t& metadata_int : metadata_ints) {
    // Deleted bytes have only the least significant bit set, so to clear them
    // we only need to clear that bit. And empty bytes already have a clear
    // least significant bit, so the only least significant bits we need to
    // preserve are those of present bytes. The most significant bit of every
    // present byte is set, so we take the most significant bit of each byte,
    // shift it into the least significant bit position, and bit-or it with
    // the complement of `LSBs`. The resulting mask has ones for every bit
    // except the least significant bit of each byte, and a one in the least
    // significant bit of every present byte.
    metadata_int &= (~LSBs | metadata_int >> 7);
  }
}
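
// A worked example on single byte values (illustrative): a deleted byte 0x01
// has a clear high bit, so `metadata_int >> 7` contributes a zero to its mask
// byte, which becomes 0xfe and clears the byte to 0x00 (`Empty`). A present
// byte such as 0xa3 contributes a one, making its mask byte 0xff and
// preserving the byte unchanged.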

inline auto MetadataGroup::PortableMatch(uint8_t tag) const -> MatchRange {
  // The caller should provide just the 7-bit tag from the hash, without the
  // present bit set, so that this layer can manage tagging the high bit of a
  // present byte.
  CARBON_DCHECK((tag & PresentMask) == 0) << llvm::formatv("{0:x}", tag);

  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    uint8_t present_byte = tag | PresentMask;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == present_byte) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }

  // This algorithm only works for matching *present* bytes. We leverage the
  // set high bit in the present case as part of the algorithm. The whole
  // algorithm has a critical path height of 4 operations, and does 6
  // operations total on AArch64. The operation dependency graph is:
  //
  //     group | MSBs      LSBs * tag + MSBs
  //            \             /
  //         match_bits ^ broadcast
  //                  |
  //   group & MSBs   MSBs - match_bits
  //            \      /
  //     group_MSBs & match_bits
  //
  // This diagram and the operation count are specific to AArch64 where we
  // have a fused *integer* multiply-add operation.
  //
  // While it is superficially similar to the "find zero bytes in a word" bit
  // math trick, it is different because this is designed to have no false
  // positives and perfectly produce 0x80 for matching bytes and 0x00 for
  // non-matching bytes. This is doable because we constrain to only handle
  // present matches which only require testing 7 bits and have a particular
  // layout.

  // Set the high bit of every byte to `1`. Any matching byte is a present
  // byte and so always has this bit set as well, which means the xor below,
  // in addition to zeroing the low 7 bits of any byte that matches the tag,
  // also clears the high bit of every byte.
  uint64_t match_bits = metadata_ints[0] | MSBs;
  // Broadcast the match byte to all bytes, and mask in the present bits in
  // the MSBs of each byte. We structure this as a multiply and an add because
  // we know that the add cannot carry, and this way it can be lowered using
  // combined multiply-add instructions if available.
  uint64_t broadcast = LSBs * tag + MSBs;
  CARBON_DCHECK(broadcast == (LSBs * tag | MSBs))
      << "Unexpected carry from addition!";

  // Xor the broadcast byte pattern. This makes bytes with matches become 0,
  // and clears the high-bits of non-matches. Note that if we are looking for
  // a tag with the same value as `Empty` or `Deleted`, those bytes will be
  // zero as well.
  match_bits = match_bits ^ broadcast;
  // Subtract each byte of `match_bits` from `0x80` bytes. After this, the
  // high bit will be set only for those bytes that were zero.
  match_bits = MSBs - match_bits;
  // Zero everything but the high bits, and also zero the high bits of any
  // bytes for "not present" slots in the original group. This avoids false
  // positives for `Empty` and `Deleted` bytes in the metadata.
  match_bits &= (metadata_ints[0] & MSBs);

  // At this point, `match_bits` has the high bit set for bytes where the
  // original group byte equals `tag` plus the high bit.
  CARBON_DCHECK(VerifyRangeBits(
      match_bits, [&](uint8_t byte) { return byte == (tag | PresentMask); }));
  return MatchRange(match_bits);
}
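
// Tracing the algorithm above on a single byte position (illustrative):
// matching tag 0x23 against the present byte 0xa3 (`0x23 | PresentMask`):
//
//   match_bits byte: 0xa3 | 0x80          == 0xa3
//   broadcast byte:  0x23 + 0x80          == 0xa3
//   xor:             0xa3 ^ 0xa3          == 0x00
//   subtract:        0x80 - 0x00          == 0x80
//   mask:            0x80 & (0xa3 & 0x80) == 0x80   (a match)
//
// A non-matching present byte such as 0xa5 xors to 0x06, and 0x80 - 0x06 ==
// 0x7a has a clear high bit, so the final mask yields 0x00. Because every
// byte of the xor result has a clear high bit, the per-byte subtractions
// never borrow across bytes.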

inline auto MetadataGroup::PortableMatchPresent() const -> MatchRange {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t match_bits = 0;
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] & PresentMask) {
        match_bits |= bit;
      }
      bit <<= 1;
    }
    return MatchRange(match_bits);
  }

  // We want to keep the high bit of each byte, which indicates whether that
  // byte represents a present slot.
  uint64_t match_bits = metadata_ints[0] & MSBs;

  CARBON_DCHECK(VerifyRangeBits(
      match_bits, [&](uint8_t byte) { return (byte & PresentMask) != 0; }));
  return MatchRange(match_bits);
}

inline auto MetadataGroup::PortableMatchEmpty() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Empty) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }

  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 0. We take advantage of the fact that the
  // metadata bytes are non-zero only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - deleted: in which case the byte will be 1, and shifting it left by 7
  //   will cause the high bit to be set.
  uint64_t match_bits = metadata_ints[0] | (metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & MSBs;

  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Empty`.
  CARBON_DCHECK(
      VerifyIndexBits(match_bits, [](uint8_t byte) { return byte == Empty; }));
  return MatchIndex(match_bits);
}
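
// For example (illustrative), with group bytes {0x00, 0x01, 0xa3, ...}: the
// deleted byte 0x01 gains its high bit from the `<< 7`, the present byte 0xa3
// already has it, and only the empty byte 0x00 is left clear, so after the
// invert-and-mask step `match_bits` has 0x80 in byte 0 only.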

inline auto MetadataGroup::PortableMatchDeleted() const -> MatchIndex {
  // Use a simple fallback approach for sizes beyond 8.
  // TODO: Instead of a simple fallback, we should generalize the below
  // algorithm for sizes above 8, even if to just exercise the same code on
  // more platforms.
  if constexpr (Size > 8) {
    static_assert(Size <= 32, "Sizes larger than 32 not yet supported!");
    uint32_t bit = 1;
    for (ssize_t i : llvm::seq<ssize_t>(0, Size)) {
      if (metadata_bytes[i] == Deleted) {
        return MatchIndex(bit);
      }
      bit <<= 1;
    }
    return MatchIndex(0);
  }

  // This sets the high bit of every byte in `match_bits` unless the
  // corresponding metadata byte is 1. We take advantage of the fact that the
  // metadata bytes are not 1 only if they are either:
  // - present: in which case the high bit of the byte will already be set; or
  // - empty: in which case the byte will be 0, and inverting it and shifting
  //   left by 7 will cause the high bit to be set.
  uint64_t match_bits = metadata_ints[0] | (~metadata_ints[0] << 7);
  // This inverts the high bits of the bytes, and clears the remaining bits.
  match_bits = ~match_bits & MSBs;

  // The high bits of the bytes of `match_bits` are set if the corresponding
  // metadata byte is `Deleted`.
  CARBON_DCHECK(VerifyIndexBits(
      match_bits, [](uint8_t byte) { return byte == Deleted; }));
  return MatchIndex(match_bits);
}

inline auto MetadataGroup::PortableCompareEqual(MetadataGroup lhs,
                                                MetadataGroup rhs) -> bool {
  return llvm::equal(lhs.metadata_bytes, rhs.metadata_bytes);
}

inline auto MetadataGroup::SIMDLoad(const uint8_t* metadata, ssize_t index)
    -> MetadataGroup {
  MetadataGroup g;
#if CARBON_NEON_SIMD_SUPPORT
  g.metadata_vec = vld1_u8(metadata + index);
#elif CARBON_X86_SIMD_SUPPORT
  g.metadata_vec =
      _mm_load_si128(reinterpret_cast<const __m128i*>(metadata + index));
#else
  static_assert(!UseSIMD, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
  return g;
}

inline auto MetadataGroup::SIMDStore(uint8_t* metadata, ssize_t index) const
    -> void {
#if CARBON_NEON_SIMD_SUPPORT
  vst1_u8(metadata + index, metadata_vec);
#elif CARBON_X86_SIMD_SUPPORT
  _mm_store_si128(reinterpret_cast<__m128i*>(metadata + index), metadata_vec);
#else
  static_assert(!UseSIMD, "Unimplemented SIMD operation");
  static_cast<void>(metadata);
  static_cast<void>(index);
#endif
}

inline auto MetadataGroup::SIMDClearDeleted() -> void {
#if CARBON_NEON_SIMD_SUPPORT
  // There is no good Neon operation to implement this, so do it using integer
  // code. This is reasonably fast, but unfortunate because it forces the
  // group out of a SIMD register and into a general purpose register, which
  // can have high latency.
  metadata_ints[0] &= (~LSBs | metadata_ints[0] >> 7);
#elif CARBON_X86_SIMD_SUPPORT
  // For each byte, use `metadata_vec` if the byte's high bit is set
  // (indicating it is present), otherwise (it is empty or deleted) replace it
  // with zero (representing empty).
  metadata_vec =
      _mm_blendv_epi8(_mm_setzero_si128(), metadata_vec, metadata_vec);
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
}

inline auto MetadataGroup::SIMDMatch(uint8_t tag) const -> MatchRange {
  MatchRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast the byte we want to match to every byte in the vector.
  auto match_byte_vec = vdup_n_u8(tag | PresentMask);
  // Result bytes have all bits set for the bytes that match, so we have to
  // clear everything but MSBs next.
  auto match_byte_cmp_vec = vceq_u8(metadata_vec, match_byte_vec);
  uint64_t match_bits = vreinterpret_u64_u8(match_byte_cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchRange(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  result = X86SIMDMatch(tag | PresentMask);
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
  static_cast<void>(tag);
#endif
  return result;
}

inline auto MetadataGroup::SIMDMatchPresent() const -> MatchRange {
  MatchRange result;
#if CARBON_NEON_SIMD_SUPPORT
  // Just extract the metadata directly.
  uint64_t match_bits = vreinterpret_u64_u8(metadata_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchRange(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // We arranged the byte vector so that present bytes have the high bit set,
  // which this instruction extracts.
  result = MatchRange(_mm_movemask_epi8(metadata_vec));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SIMDMatchEmpty() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Compare all bytes with zero, as that is the empty byte value. The result
  // will have all bits set for any input zero byte, so we zero all but the
  // high bits below.
  auto cmp_vec = vceqz_u8(metadata_vec);
  uint64_t metadata_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchIndex(metadata_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we
  // don't have a more efficient way to compute this on x86 and so we reuse
  // the general match infrastructure that computes all matches in a
  // bit-encoding. We then convert it into a `MatchIndex` that just finds the
  // first one.
  result = static_cast<MatchIndex>(X86SIMDMatch(Empty));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SIMDMatchDeleted() const -> MatchIndex {
  MatchIndex result;
#if CARBON_NEON_SIMD_SUPPORT
  // Broadcast the `Deleted` byte across a vector and compare its bytes with
  // the metadata vector. The result will have all bits set for any byte equal
  // to `Deleted`, so we zero all but the high bits below.
  auto cmp_vec = vceq_u8(metadata_vec, vdup_n_u8(Deleted));
  uint64_t match_bits = vreinterpret_u64_u8(cmp_vec)[0];
  // The matched range is likely to be tested for zero by the caller, and that
  // test can often be folded into masking the bits with `MSBs` when we do
  // that mask in the scalar domain rather than the SIMD domain. So we do the
  // mask here rather than above prior to extracting the match bits.
  result = MatchIndex(match_bits & MSBs);
#elif CARBON_X86_SIMD_SUPPORT
  // Even though we only need the first match rather than all matches, we
  // don't have a more efficient way to compute this on x86 and so we reuse
  // the general match infrastructure that computes all matches in a
  // bit-encoding. We then convert it into a `MatchIndex` that just finds the
  // first one.
  result = static_cast<MatchIndex>(X86SIMDMatch(Deleted));
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
#endif
  return result;
}

inline auto MetadataGroup::SIMDCompareEqual(MetadataGroup lhs,
                                            MetadataGroup rhs) -> bool {
#if CARBON_NEON_SIMD_SUPPORT
  return vreinterpret_u64_u8(vceq_u8(lhs.metadata_vec, rhs.metadata_vec))[0] ==
         static_cast<uint64_t>(-1LL);
#elif CARBON_X86_SIMD_SUPPORT
  // Different x86 SIMD extensions provide different comparison functionality.
#if __SSE4_1__
  // With SSE 4.1's `ptest` (`_mm_testc_si128`), we can directly test and
  // branch in the SIMD domain on whether the two metadata vectors are equal.
  return _mm_testc_si128(_mm_cmpeq_epi8(lhs.metadata_vec, rhs.metadata_vec),
                         _mm_set1_epi8(0xff)) == 1;
#else
  // With older versions of SSE we have to extract the result of the
  // comparison, much like we do when matching. That will have the usual
  // bitmask representing equal bytes, and we test for that exact bitmask in
  // scalar code.
  return _mm_movemask_epi8(_mm_cmpeq_epi8(lhs.metadata_vec,
                                          rhs.metadata_vec)) == 0x0000'ffffU;
#endif
#else
  static_assert(!UseSIMD && !DebugSIMD, "Unimplemented SIMD operation");
  static_cast<void>(lhs);
  static_cast<void>(rhs);
  return false;
#endif
}

#if CARBON_X86_SIMD_SUPPORT
inline auto MetadataGroup::X86SIMDMatch(uint8_t match_byte) const
    -> MatchRange {
  // Broadcast the byte we're matching against to all bytes in a vector, and
  // compare those bytes with the metadata vector bytes.
  auto match_byte_vec = _mm_set1_epi8(match_byte);
  auto match_byte_cmp_vec = _mm_cmpeq_epi8(metadata_vec, match_byte_vec);
  // Extract the result of each byte-wise comparison into the low bits of an
  // integer.
  uint32_t match_bits = _mm_movemask_epi8(match_byte_cmp_vec);
  return MatchRange(match_bits);
}
#endif
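
// For example (illustrative), if bytes 3 and 7 of the group equal
// `match_byte`, `_mm_movemask_epi8` produces the bit-encoded value
// 0b1000'1000 and the resulting `MatchRange` iterates the indices 3 and 7.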

}  // namespace Carbon::RawHashtable

#endif  // CARBON_COMMON_RAW_HASHTABLE_METADATA_GROUP_H_