int.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TOOLCHAIN_BASE_INT_H_
  5. #define CARBON_TOOLCHAIN_BASE_INT_H_
  6. #include "common/check.h"
  7. #include "llvm/ADT/APInt.h"
  8. #include "llvm/ADT/SmallVector.h"
  9. #include "toolchain/base/index_base.h"
  10. #include "toolchain/base/mem_usage.h"
  11. #include "toolchain/base/value_store.h"
  12. #include "toolchain/base/yaml.h"
  13. namespace Carbon {
  14. // Forward declare a testing peer so we can friend it.
  15. namespace Testing {
  16. struct IntStoreTestPeer;
  17. } // namespace Testing
  18. // Corresponds to a canonicalized integer value. This is used both for integer
  19. // literal tokens, and integer values in SemIR. These always represent the
  20. // abstract mathematical value -- signed and regardless of the needed precision.
  21. //
  22. // Small values are internalized into the ID itself. Large values are
  23. // represented as an index into an array of `APInt`s with a canonicalized bit
  24. // width. The ID itself can be queried for whether it is a value-embedded-ID or
  25. // an index ID. The ID also provides APIs for extracting either the value or an
  26. // index.
  27. //
  28. // ## Details of the encoding scheme ##
  29. //
  30. // We need all the values from a maximum to minimum, as well as a healthy range
  31. // of indices, to fit within the token ID bits.
  32. //
  33. // We represent this as a signed `TokenIdBits`-bit 2s compliment integer. The
  34. // sign extension from TokenIdBits to a register size can be folded into the
  35. // shift used to extract those bits from compressed bitfield storage.
  36. //
  37. // We then divide the smallest 1/4th of that bit width's space to represent
  38. // indices, and the larger 3/4ths to embedded values. For 23-bits total this
  39. // still gives us 2 million unique integers larger than the embedded ones, which
  40. // would be difficult to fill without exceeding the number of tokens we can lex
  41. // (8 million). For non-token based integers, the indices can continue downward
  42. // to the 32-bit signed integer minimum, supporting approximately 1.998 billion
  43. // unique larger integers.
  44. //
  45. // Note that the invalid ID can't be used with a token. This is OK as we
  46. // expect invalid tokens to be *error* tokens and not need to represent an
  47. // invalid integer.
  48. class IntId : public Printable<IntId> {
  49. public:
  50. static constexpr llvm::StringLiteral Label = "int";
  51. using ValueType = llvm::APInt;
  52. // The encoding of integer IDs ensures that valid IDs associated with tokens
  53. // during lexing can fit into a compressed storage space. We arrange for
  54. // `TokenIdBits` to be the minimum number of bits of storage for token
  55. // associated IDs. The constant is public so the lexer can ensure it reserves
  56. // adequate space.
  57. //
  58. // Note that there may still be IDs either not associated with
  59. // tokens or computed after lexing outside of this range.
  60. static constexpr int TokenIdBits = 23;
  61. static const IntId Invalid;
  62. static auto MakeFromTokenPayload(uint32_t payload) -> IntId {
  63. // Token-associated IDs are signed `TokenIdBits` integers, so force sign
  64. // extension from that bit.
  65. return IntId(static_cast<int32_t>(payload << TokenIdBitsShift) >>
  66. TokenIdBitsShift);
  67. }
  68. // Construct an ID from a raw 32-bit ID value.
  69. static constexpr auto MakeRaw(int32_t raw_id) -> IntId {
  70. return IntId(raw_id);
  71. }
  72. // Tests whether the ID is a value ID.
  73. //
  74. // Only *valid* IDs can have an embedded value, so when true this also implies
  75. // the ID is valid.
  76. constexpr auto is_value() const -> bool { return id_ > ZeroIndexId; }
  77. // Tests whether the ID is an index ID.
  78. //
  79. // Note that an invalid ID is represented as an index ID, so this is *not*
  80. // sufficient to test whether an ID is valid.
  81. constexpr auto is_index() const -> bool { return id_ <= ZeroIndexId; }
  82. // Test whether an ID is valid.
  83. //
  84. // This does not distinguish between value and index IDs, only whether valid.
  85. constexpr auto is_valid() const -> bool { return id_ != InvalidId; }
  86. // Converts an ID to the embedded value. Requires that `is_value()` is true.
  87. constexpr auto AsValue() const -> int {
  88. CARBON_DCHECK(is_value());
  89. return id_;
  90. }
  91. // Converts an ID to an index. Requires that `is_index()` is true.
  92. //
  93. // Note that this does *not* require that the ID is valid. An invalid ID will
  94. // turn into an invalid index.
  95. constexpr auto AsIndex() const -> int {
  96. CARBON_DCHECK(is_index());
  97. return ZeroIndexId - id_;
  98. }
  99. // Returns the ID formatted as a lex token payload.
  100. constexpr auto AsTokenPayload() const -> uint32_t {
  101. uint32_t payload = id_;
  102. // Ensure this ID round trips as the token payload.
  103. CARBON_DCHECK(*this == MakeFromTokenPayload(payload));
  104. return payload;
  105. }
  106. constexpr auto AsRaw() const -> int32_t { return id_; }
  107. auto Print(llvm::raw_ostream& out) const -> void {
  108. out << Label << "(";
  109. if (is_value()) {
  110. out << "value: " << AsValue();
  111. } else if (is_index()) {
  112. out << "index: " << AsIndex();
  113. } else {
  114. CARBON_CHECK(!is_valid());
  115. out << "<invalid>";
  116. }
  117. out << ")";
  118. }
  119. friend constexpr auto operator==(IntId lhs, IntId rhs) -> bool {
  120. return lhs.id_ == rhs.id_;
  121. }
  122. friend constexpr auto operator<=>(IntId lhs, IntId rhs)
  123. -> std::strong_ordering {
  124. return lhs.id_ <=> rhs.id_;
  125. }
  126. private:
  127. friend class IntStore;
  128. friend Testing::IntStoreTestPeer;
  129. // The shift needed when adjusting a between a `TokenIdBits`-width integer and
  130. // a 32-bit integer.
  131. static constexpr int TokenIdBitsShift = 32 - TokenIdBits;
  132. // The maximum embedded value in an ID.
  133. static constexpr int32_t MaxValue =
  134. std::numeric_limits<int32_t>::max() >> TokenIdBitsShift;
  135. // The ID value that represents an index of `0`. This is the first ID value
  136. // representing an index, and all indices are `<=` to this.
  137. //
  138. // `ZeroIndexId` is the first index ID, and we encode indices as successive
  139. // negative numbers counting downwards. The setup allows us to both use a
  140. // comparison with this ID to distinguish value and index IDs, and to compute
  141. // the actual index from the ID.
  142. //
  143. // The computation of an index in fact is just a subtraction:
  144. // `ZeroIndexId - id_`. Subtraction is *also* how most CPUs implement the
  145. // comparison, and so all of this ends up carefully constructed to enable very
  146. // small code size when testing for an embedded value and when that test fails
  147. // computing and using the index.
  148. static constexpr int32_t ZeroIndexId = std::numeric_limits<int32_t>::min() >>
  149. (TokenIdBitsShift + 1);
  150. // The minimum embedded value in an ID.
  151. static constexpr int32_t MinValue = ZeroIndexId + 1;
  152. // The invalid ID, which needs to be placed after the largest index, which
  153. // count downwards as IDs so below the smallest index ID, in order to optimize
  154. // the code sequence needed to distinguish between integer and value IDs and
  155. // to convert index IDs into actual indices small.
  156. static constexpr int32_t InvalidId = std::numeric_limits<int32_t>::min();
  157. // The invalid index. This is the result of converting an invalid ID into an
  158. // index. We ensure that conversion can be done so that we can simplify the
  159. // code that first tries to use an embedded value, then converts to an index
  160. // and checks that the index is valid.
  161. static const int32_t InvalidIndex;
  162. // Document the specific values of some of these constants to help visualize
  163. // how the bit patterns map from the above computations.
  164. //
  165. // clang-format off: visualizing bit positions
  166. //
  167. // Each bit is either `T` for part of the token or `P` as part
  168. // of the available payload that we use for the ID:
  169. //
  170. // 0bTTTT'TTTT'TPPP'PPPP'PPPP'PPPP'PPPP'PPPP
  171. static_assert(MaxValue == 0b0000'0000'0011'1111'1111'1111'1111'1111);
  172. static_assert(ZeroIndexId == 0b1111'1111'1110'0000'0000'0000'0000'0000);
  173. static_assert(MinValue == 0b1111'1111'1110'0000'0000'0000'0000'0001);
  174. static_assert(InvalidId == 0b1000'0000'0000'0000'0000'0000'0000'0000);
  175. // clang-format on
  176. constexpr explicit IntId(int32_t id) : id_(id) {}
  177. int32_t id_;
  178. };
  179. constexpr IntId IntId::Invalid(IntId::InvalidId);
  180. // Note that we initialize the invalid index in a constexpr context which
  181. // ensures there is no UB in forming it. This helps ensure all the ID -> index
  182. // conversions are correct because the invalid ID is at the limit of that range.
  183. constexpr int32_t IntId::InvalidIndex = Invalid.AsIndex();
  184. // A canonicalizing value store with deep optimizations for integers.
  185. //
  186. // This stores integers as abstract, signed mathematical integers. The bit width
  187. // of specific `APInt` values, either as inputs or outputs, is disregarded for
  188. // the purpose of canonicalization and the returned integer may use a very
  189. // different bit width `APInt` than was used when adding. There are also
  190. // optimized paths for adding integer values representable using native integer
  191. // types.
  192. //
  193. // Because the integers in the store are canonicalized with only a minimum bit
  194. // width, there are helper functions to coerce them to a specific desired bit
  195. // width for use.
  196. //
  197. // This leverages a significant optimization for small integer values -- rather
  198. // than canonicalizing and making them unique in a `ValueStore`, they are
  199. // directly embedded in the `IntId` itself. Only larger integers are stored in
  200. // an array of `APInt` values and represented as an index in the ID.
  201. class IntStore {
  202. public:
  203. // The maximum supported bit width of an integer type.
  204. // TODO: Pick a maximum size and document it in the design. For now
  205. // we use 2^^23, because that's the largest size that LLVM supports.
  206. static constexpr int MaxIntWidth = 1 << 23;
  207. // Pick a canonical bit width for the provided number of significant bits.
  208. static auto CanonicalBitWidth(int significant_bits) -> int;
  209. // Accepts a signed `int64_t` and uses the mathematical signed integer value
  210. // of it as the added integer value.
  211. //
  212. // Returns the ID corresponding to this integer value, storing an `APInt` if
  213. // necessary to represent it.
  214. auto Add(int64_t value) -> IntId {
  215. // First try directly making this into an ID.
  216. if (IntId id = TryMakeValue(value); id.is_valid()) [[likely]] {
  217. return id;
  218. }
  219. // Fallback for larger values.
  220. return AddLarge(value);
  221. }
  222. // Returns the ID corresponding to this signed integer value, storing an
  223. // `APInt` if necessary to represent it.
  224. auto AddSigned(llvm::APInt value) -> IntId {
  225. // First try directly making this into an ID.
  226. if (IntId id = TryMakeSignedValue(value); id.is_valid()) [[likely]] {
  227. return id;
  228. }
  229. // Fallback for larger values.
  230. return AddSignedLarge(std::move(value));
  231. }
  232. // Returns the ID corresponding to an equivalent signed integer value for the
  233. // provided unsigned integer value, storing an `APInt` if necessary to
  234. // represent it.
  235. auto AddUnsigned(llvm::APInt value) -> IntId {
  236. // First try directly making this into an ID.
  237. if (IntId id = TryMakeUnsignedValue(value); id.is_valid()) [[likely]] {
  238. return id;
  239. }
  240. // Fallback for larger values.
  241. return AddUnsignedLarge(std::move(value));
  242. }
  243. // Returns the value for an ID.
  244. //
  245. // This will always be a signed `APInt` with a canonical bit width for the
  246. // specific integer value in question.
  247. auto Get(IntId id) const -> llvm::APInt {
  248. if (id.is_value()) [[likely]] {
  249. return llvm::APInt(MinAPWidth, id.AsValue(), /*isSigned=*/true);
  250. }
  251. return values_.Get(APIntId(id.AsIndex()));
  252. }
  253. // Returns the value for an ID adjusted to a specific bit width.
  254. //
  255. // Note that because we store canonical mathematical integers as signed
  256. // integers, this always sign extends or truncates to the target width. The
  257. // caller can then use that as a signed or unsigned integer as needed.
  258. auto GetAtWidth(IntId id, int bit_width) const -> llvm::APInt {
  259. llvm::APInt value = Get(id);
  260. if (static_cast<int>(value.getBitWidth()) != bit_width) {
  261. value = value.sextOrTrunc(bit_width);
  262. }
  263. return value;
  264. }
  265. // Returns the value for an ID adjusted to the bit width specified with
  266. // another integer ID.
  267. //
  268. // This simply looks up the width integer ID, and then calls the above
  269. // `GetAtWidth` overload using the value found for it. See that overload for
  270. // more details.
  271. auto GetAtWidth(IntId id, IntId bit_width_id) const -> llvm::APInt {
  272. const llvm::APInt bit_width = Get(bit_width_id);
  273. CARBON_CHECK(
  274. bit_width.isStrictlyPositive() && bit_width.isSignedIntN(MinAPWidth),
  275. "Invalid bit width value: {0}", bit_width);
  276. return GetAtWidth(id, bit_width.getSExtValue());
  277. }
  278. // Accepts a signed `int64_t` and uses the mathematical signed integer value
  279. // of it as the integer value to lookup. Returns the canonical ID for that
  280. // value or returns invalid if not in the store.
  281. auto Lookup(int64_t value) const -> IntId {
  282. if (IntId id = TryMakeValue(value); id.is_valid()) [[likely]] {
  283. return id;
  284. }
  285. // Fallback for larger values.
  286. return LookupLarge(value);
  287. }
  288. // Looks up the canonical ID for this signed integer value, or returns invalid
  289. // if not in the store.
  290. auto LookupSigned(llvm::APInt value) const -> IntId {
  291. if (IntId id = TryMakeSignedValue(value); id.is_valid()) [[likely]] {
  292. return id;
  293. }
  294. // Fallback for larger values.
  295. return LookupSignedLarge(std::move(value));
  296. }
  297. // Output a YAML description of this data structure. Note that this will only
  298. // include the integers that required storing, not those successfully embedded
  299. // into the ID space.
  300. auto OutputYaml() const -> Yaml::OutputMapping;
  301. auto array_ref() const -> llvm::ArrayRef<llvm::APInt> {
  302. return values_.array_ref();
  303. }
  304. auto size() const -> size_t { return values_.size(); }
  305. // Collects the memory usage of the separately stored integers.
  306. auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
  307. -> void;
  308. private:
  309. friend struct Testing::IntStoreTestPeer;
  310. // Used for `values_`; tracked using `IntId`'s index range.
  311. struct APIntId : IdBase<APIntId> {
  312. static constexpr llvm::StringLiteral Label = "ap_int";
  313. using ValueType = llvm::APInt;
  314. static const APIntId Invalid;
  315. using IdBase::IdBase;
  316. };
  317. static constexpr int MinAPWidth = 64;
  318. static auto MakeIndexOrInvalid(int index) -> IntId {
  319. CARBON_DCHECK(index >= 0 && index <= IntId::InvalidIndex);
  320. return IntId(IntId::ZeroIndexId - index);
  321. }
  322. // Tries to make a signed 64-bit integer into an embedded value in the ID, and
  323. // if unable to do that returns the `Invalid` ID.
  324. static auto TryMakeValue(int64_t value) -> IntId {
  325. if (IntId::MinValue <= value && value <= IntId::MaxValue) {
  326. return IntId(value);
  327. }
  328. return IntId::Invalid;
  329. }
  330. // Tries to make a signed APInt into an embedded value in the ID, and if
  331. // unable to do that returns the `Invalid` ID.
  332. static auto TryMakeSignedValue(llvm::APInt value) -> IntId {
  333. if (value.sge(IntId::MinValue) && value.sle(IntId::MaxValue)) {
  334. return IntId(value.getSExtValue());
  335. }
  336. return IntId::Invalid;
  337. }
  338. // Tries to make an unsigned APInt into an embedded value in the ID, and if
  339. // unable to do that returns the `Invalid` ID.
  340. static auto TryMakeUnsignedValue(llvm::APInt value) -> IntId {
  341. if (value.ule(IntId::MaxValue)) {
  342. return IntId(value.getZExtValue());
  343. }
  344. return IntId::Invalid;
  345. }
  346. // Canonicalize an incoming signed APInt to the correct bit width.
  347. static auto CanonicalizeSigned(llvm::APInt value) -> llvm::APInt;
  348. // Canonicalize an incoming unsigned APInt to the correct bit width.
  349. static auto CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt;
  350. // Helper functions for handling values that are large enough to require an
  351. // allocated `APInt` for storage. Creating or manipulating that storage is
  352. // only a few lines of code, but we move these out-of-line because the
  353. // generated code is big and harms performance for the non-`Large` common
  354. // case.
  355. auto AddLarge(int64_t value) -> IntId;
  356. auto AddSignedLarge(llvm::APInt value) -> IntId;
  357. auto AddUnsignedLarge(llvm::APInt value) -> IntId;
  358. auto LookupLarge(int64_t value) const -> IntId;
  359. auto LookupSignedLarge(llvm::APInt value) const -> IntId;
  360. // Stores values which don't fit in an IntId. These are always signed.
  361. CanonicalValueStore<APIntId> values_;
  362. };
  363. constexpr IntStore::APIntId IntStore::APIntId::Invalid(
  364. IntId::Invalid.AsIndex());
  365. } // namespace Carbon
  366. #endif // CARBON_TOOLCHAIN_BASE_INT_H_