tomteb
/
carbon-lang


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
							// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_BASE_INT_H_
#define CARBON_TOOLCHAIN_BASE_INT_H_

#include "common/check.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "toolchain/base/canonical_value_store.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/value_store.h"
#include "toolchain/base/yaml.h"

namespace Carbon {

// Forward declare a testing peer so we can friend it.
namespace Testing {
struct IntStoreTestPeer;
}  // namespace Testing

// Corresponds to a canonicalized integer value. This is used both for integer
// literal tokens, and integer values in SemIR. These always represent the
// abstract mathematical value -- signed and regardless of the needed precision.
//
// Small values are internalized into the ID itself. Large values are
// represented as an index into an array of `APInt`s with a canonicalized bit
// width. The ID itself can be queried for whether it is a value-embedded-ID or
// an index ID. The ID also provides APIs for extracting either the value or an
// index.
//
// ## Details of the encoding scheme ##
//
// We need all the values from a maximum to minimum, as well as a healthy range
// of indices, to fit within the token ID bits.
//
// We represent this as a signed `TokenIdBits`-bit 2s compliment integer. The
// sign extension from TokenIdBits to a register size can be folded into the
// shift used to extract those bits from compressed bitfield storage.
//
// We then divide the smallest 1/4th of that bit width's space to represent
// indices, and the larger 3/4ths to embedded values. For 23-bits total this
// still gives us 2 million unique integers larger than the embedded ones, which
// would be difficult to fill without exceeding the number of tokens we can lex
// (8 million). For non-token based integers, the indices can continue downward
// to the 32-bit signed integer minimum, supporting approximately 1.998 billion
// unique larger integers.
//
// Note the `None` ID can't be used with a token. This is OK as we expect
// invalid tokens to be *error* tokens and not need to represent an invalid
// integer.
class IntId : public Printable<IntId> {
 public:
  static constexpr llvm::StringLiteral Label = "int";

  // The encoding of integer IDs ensures that IDs associated with tokens during
  // lexing can fit into a compressed storage space. We arrange for
  // `TokenIdBits` to be the minimum number of bits of storage for token
  // associated IDs. The constant is public so the lexer can ensure it reserves
  // adequate space.
  //
  // Note that there may still be IDs either not associated with
  // tokens or computed after lexing outside of this range.
  static constexpr int TokenIdBits = 23;

  static const IntId None;

  static auto MakeFromTokenPayload(uint32_t payload) -> IntId {
    // Token-associated IDs are signed `TokenIdBits` integers, so force sign
    // extension from that bit.
    return IntId(static_cast<int32_t>(payload << TokenIdBitsShift) >>
                 TokenIdBitsShift);
  }

  // Construct an ID from a raw 32-bit ID value.
  static constexpr auto MakeRaw(int32_t raw_id) -> IntId {
    return IntId(raw_id);
  }

  // Tests whether the ID is an embedded value ID.
  //
  // Note `None` is not an embedded value, so this implies `has_value()` is
  // true.
  constexpr auto is_embedded_value() const -> bool { return id_ > ZeroIndexId; }

  // Tests whether the ID is an index ID.
  //
  // Note `None` is represented as an index ID, so this is *not* sufficient to
  // test `has_value()`.
  constexpr auto is_index() const -> bool { return id_ <= ZeroIndexId; }

  // Test whether a value is present.
  //
  // This does not distinguish between embedded values and index IDs, only
  // whether some value is present.
  constexpr auto has_value() const -> bool { return id_ != NoneId; }

  // Converts an ID to the embedded value. Requires that `is_embedded_value()`
  // is true.
  constexpr auto AsValue() const -> int {
    CARBON_DCHECK(is_embedded_value());
    return id_;
  }

  // Converts an ID to an index. Requires that `is_index()` is true.
  //
  // Note `None` is represented as an index ID, and can be converted here.
  constexpr auto AsIndex() const -> int {
    CARBON_DCHECK(is_index());
    return ZeroIndexId - id_;
  }

  // Returns the ID formatted as a lex token payload.
  constexpr auto AsTokenPayload() const -> uint32_t {
    uint32_t payload = id_;
    // Ensure this ID round trips as the token payload.
    CARBON_DCHECK(*this == MakeFromTokenPayload(payload));
    return payload;
  }

  constexpr auto AsRaw() const -> int32_t { return id_; }

  auto Print(llvm::raw_ostream& out) const -> void {
    out << Label << "(";
    if (is_embedded_value()) {
      out << "value: " << AsValue();
    } else if (is_index()) {
      out << "index: " << AsIndex();
    } else {
      CARBON_CHECK(!has_value());
      out << "<none>";
    }
    out << ")";
  }

  friend constexpr auto operator==(IntId lhs, IntId rhs) -> bool {
    return lhs.id_ == rhs.id_;
  }
  friend constexpr auto operator<=>(IntId lhs, IntId rhs)
      -> std::strong_ordering {
    return lhs.id_ <=> rhs.id_;
  }

 private:
  friend class IntStore;
  friend Testing::IntStoreTestPeer;

  // The shift needed when adjusting a between a `TokenIdBits`-width integer and
  // a 32-bit integer.
  static constexpr int TokenIdBitsShift = 32 - TokenIdBits;

  // The maximum embedded value in an ID.
  static constexpr int32_t MaxValue =
      std::numeric_limits<int32_t>::max() >> TokenIdBitsShift;

  // The ID value that represents an index of `0`. This is the first ID value
  // representing an index, and all indices are `<=` to this.
  //
  // `ZeroIndexId` is the first index ID, and we encode indices as successive
  // negative numbers counting downwards. The setup allows us to both use a
  // comparison with this ID to distinguish value and index IDs, and to compute
  // the actual index from the ID.
  //
  // The computation of an index in fact is just a subtraction:
  // `ZeroIndexId - id_`. Subtraction is *also* how most CPUs implement the
  // comparison, and so all of this ends up carefully constructed to enable very
  // small code size when testing for an embedded value and when that test fails
  // computing and using the index.
  static constexpr int32_t ZeroIndexId =
      std::numeric_limits<int32_t>::min() >> (TokenIdBitsShift + 1);

  // The minimum embedded value in an ID.
  static constexpr int32_t MinValue = ZeroIndexId + 1;

  // The `None` ID, which needs to be placed after the largest index, which
  // count downwards as IDs so below the smallest index ID, in order to optimize
  // the code sequence needed to distinguish between integer and value IDs and
  // to convert index IDs into actual indices small.
  static constexpr int32_t NoneId = std::numeric_limits<int32_t>::min();

  // The `None` index. This is the result of converting a `None` ID into an
  // index. We ensure that conversion can be done so that we can simplify the
  // code that first tries to use an embedded value, then converts to an index
  // and checks that the index is still `None`.
  static const int32_t NoneIndex;

  // Document the specific values of some of these constants to help visualize
  // how the bit patterns map from the above computations.
  //
  // clang-format off: visualizing bit positions
  //
  // Each bit is either `T` for part of the token or `P` as part
  // of the available payload that we use for the ID:
  //
  //                           0bTTTT'TTTT'TPPP'PPPP'PPPP'PPPP'PPPP'PPPP
  static_assert(MaxValue    == 0b0000'0000'0011'1111'1111'1111'1111'1111);
  static_assert(ZeroIndexId == 0b1111'1111'1110'0000'0000'0000'0000'0000);
  static_assert(MinValue    == 0b1111'1111'1110'0000'0000'0000'0000'0001);
  static_assert(NoneId      == 0b1000'0000'0000'0000'0000'0000'0000'0000);
  // clang-format on

  constexpr explicit IntId(int32_t id) : id_(id) {}

  int32_t id_;
};

inline constexpr IntId IntId::None(IntId::NoneId);

// Note that we initialize the `None` index in a constexpr context which
// ensures there is no UB in forming it. This helps ensure all the ID -> index
// conversions are correct because the `None` ID is at the limit of that range.
inline constexpr int32_t IntId::NoneIndex = None.AsIndex();

// A canonicalizing value store with deep optimizations for integers.
//
// This stores integers as abstract, signed mathematical integers. The bit width
// of specific `APInt` values, either as inputs or outputs, is disregarded for
// the purpose of canonicalization and the returned integer may use a very
// different bit width `APInt` than was used when adding. There are also
// optimized paths for adding integer values representable using native integer
// types.
//
// Because the integers in the store are canonicalized with only a minimum bit
// width, there are helper functions to coerce them to a specific desired bit
// width for use.
//
// This leverages a significant optimization for small integer values -- rather
// than canonicalizing and making them unique in a `ValueStore`, they are
// directly embedded in the `IntId` itself. Only larger integers are stored in
// an array of `APInt` values and represented as an index in the ID.
class IntStore {
 public:
  // We rely on `APInt` values having a minimum bit width.
  static constexpr int MinAPWidth = 64;

  // The maximum supported bit width of an integer type.
  // TODO: Pick a maximum size and document it in the design. For now
  // we use 2^^23, because that's the largest size that LLVM supports.
  static constexpr int MaxIntWidth = 1 << 23;

  // Pick a canonical bit width for the provided number of significant bits.
  static auto CanonicalBitWidth(int significant_bits) -> int;

  // Accepts a signed `int64_t` and uses the mathematical signed integer value
  // of it as the added integer value.
  //
  // Returns the ID corresponding to this integer value, storing an `APInt` if
  // necessary to represent it.
  auto Add(int64_t value) -> IntId {
    // First try directly making this into an ID.
    if (IntId id = TryMakeValue(value); id.has_value()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return AddLarge(value);
  }

  // Returns the ID corresponding to this integer value.
  auto Add(llvm::APSInt value) -> IntId {
    return value.isSigned() ? AddSigned(value) : AddUnsigned(value);
  }

  // Returns the ID corresponding to this signed integer value, storing an
  // `APInt` if necessary to represent it.
  auto AddSigned(llvm::APInt value) -> IntId {
    // First try directly making this into an ID.
    if (IntId id = TryMakeSignedValue(value); id.has_value()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return AddSignedLarge(std::move(value));
  }

  // Returns the ID corresponding to an equivalent signed integer value for the
  // provided unsigned integer value, storing an `APInt` if necessary to
  // represent it.
  auto AddUnsigned(llvm::APInt value) -> IntId {
    // First try directly making this into an ID.
    if (IntId id = TryMakeUnsignedValue(value); id.has_value()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return AddUnsignedLarge(std::move(value));
  }

  // Returns the value for an ID.
  //
  // This will always be a signed `APInt` with a canonical bit width for the
  // specific integer value in question.
  auto Get(IntId id) const -> llvm::APInt {
    if (id.is_embedded_value()) [[likely]] {
      return llvm::APInt(MinAPWidth, id.AsValue(), /*isSigned=*/true);
    }
    return values_.Get(APIntId(id.AsIndex()));
  }

  // Returns the value for an ID adjusted to a specific bit width.
  //
  // Note that because we store canonical mathematical integers as signed
  // integers, this always sign extends or truncates to the target width. The
  // caller can then use that as a signed or unsigned integer as needed.
  auto GetAtWidth(IntId id, int bit_width) const -> llvm::APInt {
    llvm::APInt value = Get(id);
    if (static_cast<int>(value.getBitWidth()) != bit_width) {
      value = value.sextOrTrunc(bit_width);
    }
    return value;
  }

  // Returns the value for an ID adjusted to the bit width specified with
  // another integer ID.
  //
  // This simply looks up the width integer ID, and then calls the above
  // `GetAtWidth` overload using the value found for it. See that overload for
  // more details.
  auto GetAtWidth(IntId id, IntId bit_width_id) const -> llvm::APInt {
    const llvm::APInt bit_width = Get(bit_width_id);
    CARBON_CHECK(
        bit_width.isStrictlyPositive() && bit_width.isSignedIntN(MinAPWidth),
        "Invalid bit width value: {0}", bit_width);
    return GetAtWidth(id, bit_width.getSExtValue());
  }

  // Accepts a signed `int64_t` and uses the mathematical signed integer value
  // of it as the integer value to lookup. Returns the canonical ID for that
  // value or returns `None` if not in the store.
  auto Lookup(int64_t value) const -> IntId {
    if (IntId id = TryMakeValue(value); id.has_value()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return LookupLarge(value);
  }

  // Looks up the canonical ID for this signed integer value, or returns `None`
  // if not in the store.
  auto LookupSigned(llvm::APInt value) const -> IntId {
    if (IntId id = TryMakeSignedValue(value); id.has_value()) [[likely]] {
      return id;
    }

    // Fallback for larger values.
    return LookupSignedLarge(std::move(value));
  }

  // Output a YAML description of this data structure. Note that this will only
  // include the integers that required storing, not those successfully embedded
  // into the ID space.
  auto OutputYaml() const -> Yaml::OutputMapping;

  auto values() const [[clang::lifetimebound]] -> auto {
    return values_.values();
  }
  auto size() const -> size_t { return values_.size(); }

  // Collects the memory usage of the separately stored integers.
  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
      -> void;

 private:
  friend struct Testing::IntStoreTestPeer;

  // Used for `values_`; tracked using `IntId`'s index range.
  struct APIntId : IdBase<APIntId> {
    static constexpr llvm::StringLiteral Label = "ap_int";
    static const APIntId None;
    using IdBase::IdBase;
  };

  static auto MakeIndexOrNone(int index) -> IntId {
    CARBON_DCHECK(index >= 0 && index <= IntId::NoneIndex);
    return IntId(IntId::ZeroIndexId - index);
  }

  // Tries to make a signed 64-bit integer into an embedded value in the ID, and
  // if unable to do that returns the `None` ID.
  static auto TryMakeValue(int64_t value) -> IntId {
    if (IntId::MinValue <= value && value <= IntId::MaxValue) {
      return IntId(value);
    }

    return IntId::None;
  }

  // Tries to make a signed APInt into an embedded value in the ID, and if
  // unable to do that returns the `None` ID.
  static auto TryMakeSignedValue(llvm::APInt value) -> IntId {
    if (value.sge(IntId::MinValue) && value.sle(IntId::MaxValue)) {
      return IntId(value.getSExtValue());
    }

    return IntId::None;
  }

  // Tries to make an unsigned APInt into an embedded value in the ID, and if
  // unable to do that returns the `None` ID.
  static auto TryMakeUnsignedValue(llvm::APInt value) -> IntId {
    if (value.ule(IntId::MaxValue)) {
      return IntId(value.getZExtValue());
    }

    return IntId::None;
  }

  // Canonicalize an incoming signed APInt to the correct bit width.
  static auto CanonicalizeSigned(llvm::APInt value) -> llvm::APInt;

  // Canonicalize an incoming unsigned APInt to the correct bit width.
  static auto CanonicalizeUnsigned(llvm::APInt value) -> llvm::APInt;

  // Helper functions for handling values that are large enough to require an
  // allocated `APInt` for storage. Creating or manipulating that storage is
  // only a few lines of code, but we move these out-of-line because the
  // generated code is big and harms performance for the non-`Large` common
  // case.
  auto AddLarge(int64_t value) -> IntId;
  auto AddSignedLarge(llvm::APInt value) -> IntId;
  auto AddUnsignedLarge(llvm::APInt value) -> IntId;
  auto LookupLarge(int64_t value) const -> IntId;
  auto LookupSignedLarge(llvm::APInt value) const -> IntId;

  // Stores values which don't fit in an IntId. These are always signed.
  CanonicalValueStore<APIntId, llvm::APInt> values_;
};

inline constexpr IntStore::APIntId IntStore::APIntId::None(
    IntId::None.AsIndex());

}  // namespace Carbon

#endif  // CARBON_TOOLCHAIN_BASE_INT_H_