|
|
@@ -0,0 +1,428 @@
|
|
|
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
|
|
|
+// Exceptions. See /LICENSE for license information.
|
|
|
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
+
|
|
|
+#ifndef CARBON_TOOLCHAIN_BASE_INT_STORE_H_
|
|
|
+#define CARBON_TOOLCHAIN_BASE_INT_STORE_H_
|
|
|
+
|
|
|
+#include "common/check.h"
|
|
|
+#include "llvm/ADT/APFloat.h"
|
|
|
+#include "llvm/ADT/APInt.h"
|
|
|
+#include "llvm/ADT/SmallVector.h"
|
|
|
+#include "toolchain/base/index_base.h"
|
|
|
+#include "toolchain/base/mem_usage.h"
|
|
|
+#include "toolchain/base/value_store.h"
|
|
|
+#include "toolchain/base/yaml.h"
|
|
|
+
|
|
|
+namespace Carbon {
|
|
|
+
|
|
|
+// Forward declare a testing peer so we can friend it.
|
|
|
+namespace Testing {
|
|
|
+struct IntStoreTestPeer;
|
|
|
+} // namespace Testing
|
|
|
+
|
|
|
+// Corresponds to a canonicalized integer value. This is used both for integer
|
|
|
+// literal tokens, and integer values in SemIR. These always represent the
|
|
|
+// abstract mathematical value -- signed and regardless of the needed precision.
|
|
|
+//
|
|
|
+// Small values are internalized into the ID itself. Large values are
|
|
|
+// represented as an index into an array of `APInt`s with a canonicalized bit
|
|
|
+// width. The ID itself can be queried for whether it is a value-embedded-ID or
|
|
|
+// an index ID. The ID also provides APIs for extracting either the value or an
|
|
|
+// index.
|
|
|
+//
|
|
|
+// ## Details of the encoding scheme ##
|
|
|
+//
|
|
|
+// We need all the values from a maximum to minimum, as well as a healthy range
|
|
|
+// of indices, to fit within the token ID bits.
|
|
|
+//
|
|
|
+// We represent this as a signed `TokenIdBits`-bit 2s compliment integer. The
|
|
|
+// sign extension from TokenIdBits to a register size can be folded into the
|
|
|
+// shift used to extract those bits from compressed bitfield storage.
|
|
|
+//
|
|
|
+// We then divide the smallest 1/4th of that bit width's space to represent
|
|
|
+// indices, and the larger 3/4ths to embedded values. For 23-bits total this
|
|
|
+// still gives us 2 million unique integers larger than the embedded ones, which
|
|
|
+// would be difficult to fill without exceeding the number of tokens we can lex
|
|
|
+// (8 million). For non-token based integers, the indices can continue downward
|
|
|
+// to the 32-bit signed integer minimum, supporting approximately 1.998 billion
|
|
|
+// unique larger integers.
|
|
|
+//
|
|
|
+// Note that the invalid ID can't be used with a token. This is OK as we
|
|
|
+// expect invalid tokens to be *error* tokens and not need to represent an
|
|
|
+// invalid integer.
|
|
|
+class IntId : public Printable<IntId> {
|
|
|
+ public:
|
|
|
+ using ValueType = llvm::APInt;
|
|
|
+
|
|
|
+ // The encoding of integer IDs ensures that valid IDs associated with tokens
|
|
|
+ // during lexing can fit into a compressed storage space. We arrange for
|
|
|
+ // `TokenIdBits` to be the minimum number of bits of storage for token
|
|
|
+ // associated IDs. The constant is public so the lexer can ensure it reserves
|
|
|
+ // adequate space.
|
|
|
+ //
|
|
|
+ // Note that there may still be IDs either not associated with
|
|
|
+ // tokens or computed after lexing outside of this range.
|
|
|
+ static constexpr int TokenIdBits = 23;
|
|
|
+
|
|
|
+ static const IntId Invalid;
|
|
|
+
|
|
|
+ static auto MakeFromTokenPayload(uint32_t payload) -> IntId {
|
|
|
+ // Token-associated IDs are signed `TokenIdBits` integers, so force sign
|
|
|
+ // extension from that bit.
|
|
|
+ return IntId(static_cast<int32_t>(payload << TokenIdBitsShift) >>
|
|
|
+ TokenIdBitsShift);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Construct an ID from a raw 32-bit ID value.
|
|
|
+ static constexpr auto MakeRaw(int32_t raw_id) -> IntId {
|
|
|
+ return IntId(raw_id);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Tests whether the ID is a value ID.
|
|
|
+ //
|
|
|
+ // Only *valid* IDs can have an embedded value, so when true this also implies
|
|
|
+ // the ID is valid.
|
|
|
+ constexpr auto is_value() const -> bool { return id_ > ZeroIndexId; }
|
|
|
+
|
|
|
+ // Tests whether the ID is an index ID.
|
|
|
+ //
|
|
|
+ // Note that an invalid ID is represented as an index ID, so this is *not*
|
|
|
+ // sufficient to test whether an ID is valid.
|
|
|
+ constexpr auto is_index() const -> bool { return id_ <= ZeroIndexId; }
|
|
|
+
|
|
|
+ // Test whether an ID is valid.
|
|
|
+ //
|
|
|
+ // This does not distinguish between value and index IDs, only whether valid.
|
|
|
+ constexpr auto is_valid() const -> bool { return id_ != InvalidId; }
|
|
|
+
|
|
|
+ // Converts an ID to the embedded value. Requires that `is_value()` is true.
|
|
|
+ constexpr auto AsValue() const -> int {
|
|
|
+ CARBON_DCHECK(is_value());
|
|
|
+ return id_;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Converts an ID to an index. Requires that `is_index()` is true.
|
|
|
+ //
|
|
|
+ // Note that this does *not* require that the ID is valid. An invalid ID will
|
|
|
+ // turn into an invalid index.
|
|
|
+ constexpr auto AsIndex() const -> int {
|
|
|
+ CARBON_DCHECK(is_index());
|
|
|
+ return ZeroIndexId - id_;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Returns the ID formatted as a lex token payload.
|
|
|
+ constexpr auto AsTokenPayload() const -> uint32_t {
|
|
|
+ uint32_t payload = id_;
|
|
|
+ // Ensure this ID round trips as the token payload.
|
|
|
+ CARBON_DCHECK(*this == MakeFromTokenPayload(payload));
|
|
|
+ return payload;
|
|
|
+ }
|
|
|
+
|
|
|
+ constexpr auto AsRaw() const -> int32_t { return id_; }
|
|
|
+
|
|
|
+ auto Print(llvm::raw_ostream& out) const -> void {
|
|
|
+ out << "int [";
|
|
|
+ if (is_value()) {
|
|
|
+ out << "value: " << AsValue() << "]";
|
|
|
+ } else if (is_index()) {
|
|
|
+ out << "index: " << AsIndex() << "]";
|
|
|
+ } else {
|
|
|
+ CARBON_CHECK(!is_valid());
|
|
|
+ out << "invalid]";
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ friend constexpr auto operator==(IntId lhs, IntId rhs) -> bool {
|
|
|
+ return lhs.id_ == rhs.id_;
|
|
|
+ }
|
|
|
+ friend constexpr auto operator<=>(IntId lhs, IntId rhs)
|
|
|
+ -> std::strong_ordering {
|
|
|
+ return lhs.id_ <=> rhs.id_;
|
|
|
+ }
|
|
|
+
|
|
|
+ private:
|
|
|
+ friend class IntStore;
|
|
|
+ friend Testing::IntStoreTestPeer;
|
|
|
+
|
|
|
+ // The shift needed when adjusting a between a `TokenIdBits`-width integer and
|
|
|
+ // a 32-bit integer.
|
|
|
+ static constexpr int TokenIdBitsShift = 32 - TokenIdBits;
|
|
|
+
|
|
|
+ // The maximum embedded value in an ID.
|
|
|
+ static constexpr int32_t MaxValue =
|
|
|
+ std::numeric_limits<int32_t>::max() >> TokenIdBitsShift;
|
|
|
+
|
|
|
+ // The ID value that represents an index of `0`. This is the first ID value
|
|
|
+ // representing an index, and all indices are `<=` to this.
|
|
|
+ //
|
|
|
+ // `ZeroIndexId` is the first index ID, and we encode indices as successive
|
|
|
+ // negative numbers counting downwards. The setup allows us to both use a
|
|
|
+ // comparison with this ID to distinguish value and index IDs, and to compute
|
|
|
+ // the actual index from the ID.
|
|
|
+ //
|
|
|
+ // The computation of an index in fact is just a subtraction:
|
|
|
+ // `ZeroIndexId - id_`. Subtraction is *also* how most CPUs implement the
|
|
|
+ // comparison, and so all of this ends up carefully constructed to enable very
|
|
|
+ // small code size when testing for an embedded value and when that test fails
|
|
|
+ // computing and using the index.
|
|
|
+ static constexpr int32_t ZeroIndexId = std::numeric_limits<int32_t>::min() >>
|
|
|
+ (TokenIdBitsShift + 1);
|
|
|
+
|
|
|
+ // The minimum embedded value in an ID.
|
|
|
+ static constexpr int32_t MinValue = ZeroIndexId + 1;
|
|
|
+
|
|
|
+ // The invalid ID, which needs to be placed after the largest index, which
|
|
|
+ // count downwards as IDs so below the smallest index ID, in order to optimize
|
|
|
+ // the code sequence needed to distinguish between integer and value IDs and
|
|
|
+ // to convert index IDs into actual indices small.
|
|
|
+ static constexpr int32_t InvalidId = std::numeric_limits<int32_t>::min();
|
|
|
+
|
|
|
+ // The invalid index. This is the result of converting an invalid ID into an
|
|
|
+ // index. We ensure that conversion can be done so that we can simplify the
|
|
|
+ // code that first tries to use an embedded value, then converts to an index
|
|
|
+ // and checks that the index is valid.
|
|
|
+ static const int32_t InvalidIndex;
|
|
|
+
|
|
|
+ // Document the specific values of some of these constants to help visualize
|
|
|
+ // how the bit patterns map from the above computations.
|
|
|
+ //
|
|
|
+ // clang-format off: visualizing bit positions
|
|
|
+ //
|
|
|
+ // Each bit is either `T` for part of the token or `P` as part
|
|
|
+ // of the available payload that we use for the ID:
|
|
|
+ //
|
|
|
+ // 0bTTTT'TTTT'TPPP'PPPP'PPPP'PPPP'PPPP'PPPP
|
|
|
+ static_assert(MaxValue == 0b0000'0000'0011'1111'1111'1111'1111'1111);
|
|
|
+ static_assert(ZeroIndexId == 0b1111'1111'1110'0000'0000'0000'0000'0000);
|
|
|
+ static_assert(MinValue == 0b1111'1111'1110'0000'0000'0000'0000'0001);
|
|
|
+ static_assert(InvalidId == 0b1000'0000'0000'0000'0000'0000'0000'0000);
|
|
|
+ // clang-format on
|
|
|
+
|
|
|
+ constexpr explicit IntId(int32_t id) : id_(id) {}
|
|
|
+
|
|
|
+ int32_t id_;
|
|
|
+};
|
|
|
+
|
|
|
+constexpr IntId IntId::Invalid(IntId::InvalidId);
|
|
|
+
|
|
|
+// Note that we initialize the invalid index in a constexpr context which
|
|
|
+// ensures there is no UB in forming it. This helps ensure all the ID -> index
|
|
|
+// conversions are correct because the invalid ID is at the limit of that range.
|
|
|
+constexpr int32_t IntId::InvalidIndex = Invalid.AsIndex();
|
|
|
+
|
|
|
+// A canonicalizing value store with deep optimizations for integers.
|
|
|
+//
|
|
|
+// This stores integers as abstract, signed mathematical integers. The bit width
|
|
|
+// of specific `APInt` values, either as inputs or outputs, is disregarded for
|
|
|
+// the purpose of canonicalization and the returned integer may use a very
|
|
|
+// different bit width `APInt` than was used when adding. There are also
|
|
|
+// optimized paths for adding integer values representable using native integer
|
|
|
+// types.
|
|
|
+//
|
|
|
+// Because the integers in the store are canonicalized with only a minimum bit
|
|
|
+// width, there are helper functions to coerce them to a specific desired bit
|
|
|
+// width for use.
|
|
|
+//
|
|
|
+// This leverages a significant optimization for small integer values -- rather
|
|
|
+// than canonicalizing and making them unique in a `ValueStore`, they are
|
|
|
+// directly embedded in the `IntId` itself. Only larger integers are stored in
|
|
|
+// an array of `APInt` values and represented as an index in the ID.
|
|
|
+class IntStore {
|
|
|
+ public:
|
|
|
+ // Accepts a signed `int64_t` and uses the mathematical signed integer value
|
|
|
+ // of it as the added integer value.
|
|
|
+ //
|
|
|
+ // Returns the ID corresponding to this integer value, storing an `APInt` if
|
|
|
+ // necessary to represent it.
|
|
|
+ auto Add(int64_t value) -> IntId {
|
|
|
+ // First try directly making this into an ID.
|
|
|
+ if (IntId id = TryMakeValue(value); id.is_valid()) [[likely]] {
|
|
|
+ return id;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fallback for larger values.
|
|
|
+ return AddLarge(value);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Returns the ID corresponding to this signed integer value, storing an
|
|
|
+ // `APInt` if necessary to represent it.
|
|
|
+ auto AddSigned(llvm::APInt value) -> IntId {
|
|
|
+ // First try directly making this into an ID.
|
|
|
+ if (IntId id = TryMakeSignedValue(value); id.is_valid()) [[likely]] {
|
|
|
+ return id;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fallback for larger values.
|
|
|
+ return AddSignedLarge(std::move(value));
|
|
|
+ }
|
|
|
+
|
|
|
+ // Returns the ID corresponding to an equivalent signed integer value for the
|
|
|
+ // provided unsigned integer value, storing an `APInt` if necessary to
|
|
|
+ // represent it.
|
|
|
+ auto AddUnsigned(llvm::APInt value) -> IntId {
|
|
|
+ // First try directly making this into an ID.
|
|
|
+ if (IntId id = TryMakeUnsignedValue(value); id.is_valid()) [[likely]] {
|
|
|
+ return id;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fallback for larger values.
|
|
|
+ return AddUnsignedLarge(std::move(value));
|
|
|
+ }
|
|
|
+
|
|
|
+ // Returns the value for an ID.
|
|
|
+ //
|
|
|
+ // This will always be a signed `APInt` with a canonical bit width for the
|
|
|
+ // specific integer value in question.
|
|
|
+ auto Get(IntId id) const -> llvm::APInt {
|
|
|
+ if (id.is_value()) [[likely]] {
|
|
|
+ return llvm::APInt(MinAPWidth, id.AsValue(), /*isSigned=*/true);
|
|
|
+ }
|
|
|
+ return values_.Get(APIntId(id.AsIndex()));
|
|
|
+ }
|
|
|
+
|
|
|
+ // Returns the value for an ID adjusted to a specific bit width.
|
|
|
+ //
|
|
|
+ // Note that because we store canonical mathematical integers as signed
|
|
|
+ // integers, this always sign extends or truncates to the target width. The
|
|
|
+ // caller can then use that as a signed or unsigned integer as needed.
|
|
|
+ auto GetAtWidth(IntId id, int bit_width) const -> llvm::APInt {
|
|
|
+ llvm::APInt value = Get(id);
|
|
|
+ if (static_cast<int>(value.getBitWidth()) != bit_width) {
|
|
|
+ value = value.sextOrTrunc(bit_width);
|
|
|
+ }
|
|
|
+ return value;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Returns the value for an ID adjusted to the bit width specified with
|
|
|
+ // another integer ID.
|
|
|
+ //
|
|
|
+ // This simply looks up the width integer ID, and then calls the above
|
|
|
+ // `GetAtWidth` overload using the value found for it. See that overload for
|
|
|
+ // more details.
|
|
|
+ auto GetAtWidth(IntId id, IntId bit_width_id) const -> llvm::APInt {
|
|
|
+ const llvm::APInt bit_width = Get(bit_width_id);
|
|
|
+ CARBON_CHECK(
|
|
|
+ bit_width.isStrictlyPositive() && bit_width.isSignedIntN(MinAPWidth),
|
|
|
+ "Invalid bit width value: {0}", bit_width);
|
|
|
+ return GetAtWidth(id, bit_width.getSExtValue());
|
|
|
+ }
|
|
|
+
|
|
|
+ // Accepts a signed `int64_t` and uses the mathematical signed integer value
|
|
|
+ // of it as the integer value to lookup. Returns the canonical ID for that
|
|
|
+ // value or returns invalid if not in the store.
|
|
|
+ auto Lookup(int64_t value) const -> IntId {
|
|
|
+ if (IntId id = TryMakeValue(value); id.is_valid()) [[likely]] {
|
|
|
+ return id;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fallback for larger values.
|
|
|
+ return LookupLarge(value);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Looks up the canonical ID for this signed integer value, or returns invalid
|
|
|
+ // if not in the store.
|
|
|
+ auto LookupSigned(llvm::APInt value) const -> IntId {
|
|
|
+ if (IntId id = TryMakeSignedValue(value); id.is_valid()) [[likely]] {
|
|
|
+ return id;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fallback for larger values.
|
|
|
+ return LookupSignedLarge(std::move(value));
|
|
|
+ }
|
|
|
+
|
|
|
+ // Output a YAML description of this data structure. Note that this will only
|
|
|
+ // include the integers that required storing, not those successfully embedded
|
|
|
+ // into the ID space.
|
|
|
+ auto OutputYaml() const -> Yaml::OutputMapping;
|
|
|
+
|
|
|
+ auto array_ref() const -> llvm::ArrayRef<llvm::APInt> {
|
|
|
+ return values_.array_ref();
|
|
|
+ }
|
|
|
+ auto size() const -> size_t { return values_.size(); }
|
|
|
+
|
|
|
+ // Collects the memory usage of the separately stored integers.
|
|
|
+ auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
|
|
|
+ -> void;
|
|
|
+
|
|
|
+ private:
|
|
|
+ friend struct Testing::IntStoreTestPeer;
|
|
|
+
|
|
|
+ // Used for `values_`; tracked using `IntId`'s index range.
|
|
|
+ struct APIntId : IdBase, Printable<APIntId> {
|
|
|
+ using ValueType = llvm::APInt;
|
|
|
+ static const APIntId Invalid;
|
|
|
+ using IdBase::IdBase;
|
|
|
+ auto Print(llvm::raw_ostream& out) const -> void {
|
|
|
+ out << "ap_int";
|
|
|
+ IdBase::Print(out);
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ static constexpr int MinAPWidth = 64;
|
|
|
+
|
|
|
+ static auto MakeIndexOrInvalid(int index) -> IntId {
|
|
|
+ CARBON_DCHECK(index >= 0 && index <= IntId::InvalidIndex);
|
|
|
+ return IntId(IntId::ZeroIndexId - index);
|
|
|
+ }
|
|
|
+
|
|
|
+ // Tries to make a signed 64-bit integer into an embedded value in the ID, and
|
|
|
+ // if unable to do that returns the `Invalid` ID.
|
|
|
+ static auto TryMakeValue(int64_t value) -> IntId {
|
|
|
+ if (IntId::MinValue <= value && value <= IntId::MaxValue) {
|
|
|
+ return IntId(value);
|
|
|
+ }
|
|
|
+
|
|
|
+ return IntId::Invalid;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Tries to make a signed APInt into an embedded value in the ID, and if
|
|
|
+ // unable to do that returns the `Invalid` ID.
|
|
|
+ static auto TryMakeSignedValue(llvm::APInt value) -> IntId {
|
|
|
+ if (value.sge(IntId::MinValue) && value.sle(IntId::MaxValue)) {
|
|
|
+ return IntId(value.getSExtValue());
|
|
|
+ }
|
|
|
+
|
|
|
+ return IntId::Invalid;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Tries to make an unsigned APInt into an embedded value in the ID, and if
|
|
|
+ // unable to do that returns the `Invalid` ID.
|
|
|
+ static auto TryMakeUnsignedValue(llvm::APInt value) -> IntId {
|
|
|
+ if (value.ule(IntId::MaxValue)) {
|
|
|
+ return IntId(value.getZExtValue());
|
|
|
+ }
|
|
|
+
|
|
|
+ return IntId::Invalid;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Pick a canonical bit width for the provided number of significant bits.
|
|
|
+ static auto CanonicalBitWidth(int significant_bits) -> int;
|
|
|
+
|
|
|
+ // Canonicalize an incoming signed APInt to the correct bit width.
|
|
|
+ static auto CanonicalizeSigned(llvm::APInt value) -> llvm::APInt;
|
|
|
+
|
|
|
+ // Canonicalize an incoming unsigned APInt to the correct bit width.
|
|
|
+ static auto CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt;
|
|
|
+
|
|
|
+ // Helper functions for handling values that are large enough to require an
|
|
|
+ // allocated `APInt` for storage. Creating or manipulating that storage is
|
|
|
+ // only a few lines of code, but we move these out-of-line because the
|
|
|
+ // generated code is big and harms performance for the non-`Large` common
|
|
|
+ // case.
|
|
|
+ auto AddLarge(int64_t value) -> IntId;
|
|
|
+ auto AddSignedLarge(llvm::APInt value) -> IntId;
|
|
|
+ auto AddUnsignedLarge(llvm::APInt value) -> IntId;
|
|
|
+ auto LookupLarge(int64_t value) const -> IntId;
|
|
|
+ auto LookupSignedLarge(llvm::APInt value) const -> IntId;
|
|
|
+
|
|
|
+ // Stores values which don't fit in an IntId. These are always signed.
|
|
|
+ CanonicalValueStore<APIntId> values_;
|
|
|
+};
|
|
|
+
|
|
|
+constexpr IntStore::APIntId IntStore::APIntId::Invalid(
|
|
|
+ IntId::Invalid.AsIndex());
|
|
|
+
|
|
|
+} // namespace Carbon
|
|
|
+
|
|
|
+#endif // CARBON_TOOLCHAIN_BASE_INT_STORE_H_
|