| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194 |
- // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
- // Exceptions. See /LICENSE for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- #ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
- #define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
- #include "common/check.h"
- #include "toolchain/base/int.h"
- #include "toolchain/base/value_ids.h"
- #include "toolchain/lex/token_index.h"
- #include "toolchain/lex/token_kind.h"
- namespace Carbon::Lex {
// A character as a Unicode code point.
//
// Unicode requires 21 bits, which should fit inside `TokenInfo::PayloadBits`,
// so we store the value directly rather than through an ID into a side table.
//
// This is intentionally kept as a plain aggregate with a single field so it
// can be built directly from a token payload.
struct CharLiteralValue {
  // The code point of the character literal.
  int32_t value;
};
- // Storage for the information about a specific token, as an implementation
- // detail of `TokenizedBuffer`.
- //
- // This provides a friendly accessor API to the carefully space-optimized
- // storage model of the information we associated with each token.
- //
- // There are four pieces of information stored here:
- // - The kind of the token.
- // - Whether that token has leading whitespace before it.
- // - A kind-specific payload that can be compressed into a small integer.
- // - This class provides dedicated accessors for each different form of
- // payload that check the kind and payload correspond correctly.
- // - A 32-bit byte offset of the token within the source text.
- //
- // These are compressed and stored in 8-bytes for each token.
- //
- // Note that while the class provides some limited setters for payloads and
- // mutating methods, setters on this type may be unexpectedly expensive due to
- // the bit-packed representation and should be avoided. As such, only the
- // minimal necessary setters are provided.
- //
- // TODO: It might be worth considering a struct-of-arrays data layout in order
- // to move the byte offset to a separate array from the rest as it is only hot
- // during lexing, and then cold during parsing and semantic analysis. However,
- // a trivial approach to that adds more overhead than it saves due to tracking
- // two separate vectors and their growth. Making this profitable would likely
- // at least require a highly specialized single vector that manages the growth
- // once and then provides separate storage areas for the two arrays.
- class TokenInfo {
- public:
- // The kind for this token.
- auto kind() const -> TokenKind { return kind_; }
- // Whether this token is preceded by whitespace. We only store the preceding
- // state, and look at the next token to check for trailing whitespace.
- auto has_leading_space() const -> bool { return has_leading_space_; }
- // A collection of methods to access the specific payload included with
- // particular kinds of tokens. Only the specific payload accessor below may
- // be used for an info entry of a token with a particular kind, and these
- // check that the kind is valid. Some tokens do not include a payload at all
- // and none of these methods may be called.
- auto ident_id() const -> IdentifierId {
- CARBON_DCHECK(kind() == TokenKind::Identifier);
- return IdentifierId(token_payload_);
- }
- auto set_ident_id(IdentifierId ident_id) -> void {
- CARBON_DCHECK(kind() == TokenKind::Identifier);
- token_payload_ = ident_id.index;
- }
- auto string_literal_id() const -> StringLiteralValueId {
- CARBON_DCHECK(kind() == TokenKind::StringLiteral);
- return StringLiteralValueId(token_payload_);
- }
- auto char_literal() const -> CharLiteralValue {
- CARBON_DCHECK(kind() == TokenKind::CharLiteral);
- return CharLiteralValue(token_payload_);
- }
- auto int_id() const -> IntId {
- CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
- kind() == TokenKind::IntTypeLiteral ||
- kind() == TokenKind::UnsignedIntTypeLiteral ||
- kind() == TokenKind::FloatTypeLiteral);
- return IntId::MakeFromTokenPayload(token_payload_);
- }
- auto real_id() const -> RealId {
- CARBON_DCHECK(kind() == TokenKind::RealLiteral);
- return RealId(token_payload_);
- }
- auto closing_token_index() const -> TokenIndex {
- CARBON_DCHECK(kind().is_opening_symbol());
- return TokenIndex(token_payload_);
- }
- auto set_closing_token_index(TokenIndex closing_index) -> void {
- CARBON_DCHECK(kind().is_opening_symbol());
- token_payload_ = closing_index.index;
- }
- auto opening_token_index() const -> TokenIndex {
- CARBON_DCHECK(kind().is_closing_symbol());
- return TokenIndex(token_payload_);
- }
- auto set_opening_token_index(TokenIndex opening_index) -> void {
- CARBON_DCHECK(kind().is_closing_symbol());
- token_payload_ = opening_index.index;
- }
- auto error_length() const -> int {
- CARBON_DCHECK(kind() == TokenKind::Error);
- return token_payload_;
- }
- // Zero-based byte offset of the token within the file. This can be combined
- // with the buffer's line information to locate the line and column of the
- // token as well.
- auto byte_offset() const -> int32_t { return byte_offset_; }
- // Transforms the token into an error token of the given length but at its
- // original position and with the same whitespace adjacency.
- auto ResetAsError(int error_length) -> void {
- // Construct a fresh token to establish any needed invariants and replace
- // this token with it.
- TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
- byte_offset());
- *this = error;
- }
- private:
- friend class Lexer;
- static constexpr int PayloadBits = 23;
- // Make sure we have enough payload bits to represent token-associated IDs.
- static_assert(PayloadBits >= IntId::TokenIdBits);
- static_assert(PayloadBits >= TokenIndex::Bits);
- // Constructor for a TokenKind that carries no payload, or where the payload
- // will be set later.
- //
- // Only used by the lexer which enforces only the correct kinds are used.
- //
- // When the payload is not being set, we leave it uninitialized. At least in
- // some cases, this will allow MSan to correctly detect erroneous attempts
- // to access the payload, as it works to track uninitialized memory
- // bit-for-bit specifically to handle complex cases like bitfields.
- TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
- : kind_(kind),
- has_leading_space_(has_leading_space),
- byte_offset_(byte_offset) {}
- // Constructor for a TokenKind that carries a payload.
- //
- // Only used by the lexer which enforces the correct kind and payload types.
- TokenInfo(TokenKind kind, bool has_leading_space, int payload,
- int32_t byte_offset)
- : kind_(kind),
- has_leading_space_(has_leading_space),
- token_payload_(payload),
- byte_offset_(byte_offset) {}
- // A bitfield that encodes the token's kind, the leading space flag, and the
- // remaining bits in a payload. These are encoded together as a bitfield for
- // density and because these are the hottest fields of tokens for consumers
- // after lexing.
- //
- // Payload values are typically ID types for which we create at most one per
- // token, so we ensure that `token_payload_` is large enough to fit any
- // token index. Stores to this field may overflow, but we produce an error
- // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
- // so this value never overflows if lexing succeeds.
- TokenKind kind_;
- static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
- bool has_leading_space_ : 1;
- unsigned token_payload_ : PayloadBits;
- // Separate storage for the byte offset, this is hot while lexing but then
- // generally cold.
- int32_t byte_offset_;
- };
- static_assert(sizeof(TokenInfo) == 8,
- "Expected `TokenInfo` to pack to an 8-byte structure.");
- } // namespace Carbon::Lex
- #endif // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
|