token_info.h 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
  5. #define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
  6. #include "common/check.h"
  7. #include "toolchain/base/int.h"
  8. #include "toolchain/base/value_ids.h"
  9. #include "toolchain/lex/token_index.h"
  10. #include "toolchain/lex/token_kind.h"
  11. namespace Carbon::Lex {
  12. // Storage for the information about a specific token, as an implementation
  13. // detail of `TokenizedBuffer`.
  14. //
  15. // This provides a friendly accessor API to the carefully space-optimized
  16. // storage model of the information we associated with each token.
  17. //
  18. // There are four pieces of information stored here:
  19. // - The kind of the token.
  20. // - Whether that token has leading whitespace before it.
  21. // - A kind-specific payload that can be compressed into a small integer.
  22. // - This class provides dedicated accessors for each different form of
  23. // payload that check the kind and payload correspond correctly.
  24. // - A 32-bit byte offset of the token within the source text.
  25. //
  26. // These are compressed and stored in 8-bytes for each token.
  27. //
  28. // Note that while the class provides some limited setters for payloads and
  29. // mutating methods, setters on this type may be unexpectedly expensive due to
  30. // the bit-packed representation and should be avoided. As such, only the
  31. // minimal necessary setters are provided.
  32. //
  33. // TODO: It might be worth considering a struct-of-arrays data layout in order
  34. // to move the byte offset to a separate array from the rest as it is only hot
  35. // during lexing, and then cold during parsing and semantic analysis. However,
  36. // a trivial approach to that adds more overhead than it saves due to tracking
  37. // two separate vectors and their growth. Making this profitable would likely
  38. // at least require a highly specialized single vector that manages the growth
  39. // once and then provides separate storage areas for the two arrays.
  40. class TokenInfo {
  41. public:
  42. // The kind for this token.
  43. auto kind() const -> TokenKind { return kind_; }
  44. // Whether this token is preceded by whitespace. We only store the preceding
  45. // state, and look at the next token to check for trailing whitespace.
  46. auto has_leading_space() const -> bool { return has_leading_space_; }
  47. // A collection of methods to access the specific payload included with
  48. // particular kinds of tokens. Only the specific payload accessor below may
  49. // be used for an info entry of a token with a particular kind, and these
  50. // check that the kind is valid. Some tokens do not include a payload at all
  51. // and none of these methods may be called.
  52. auto ident_id() const -> IdentifierId {
  53. CARBON_DCHECK(kind() == TokenKind::Identifier);
  54. return IdentifierId(token_payload_);
  55. }
  56. auto set_ident_id(IdentifierId ident_id) -> void {
  57. CARBON_DCHECK(kind() == TokenKind::Identifier);
  58. token_payload_ = ident_id.index;
  59. }
  60. auto string_literal_id() const -> StringLiteralValueId {
  61. CARBON_DCHECK(kind() == TokenKind::StringLiteral);
  62. return StringLiteralValueId(token_payload_);
  63. }
  64. auto int_id() const -> IntId {
  65. CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
  66. kind() == TokenKind::IntTypeLiteral ||
  67. kind() == TokenKind::UnsignedIntTypeLiteral ||
  68. kind() == TokenKind::FloatTypeLiteral);
  69. return IntId::MakeFromTokenPayload(token_payload_);
  70. }
  71. auto real_id() const -> RealId {
  72. CARBON_DCHECK(kind() == TokenKind::RealLiteral);
  73. return RealId(token_payload_);
  74. }
  75. auto closing_token_index() const -> TokenIndex {
  76. CARBON_DCHECK(kind().is_opening_symbol());
  77. return TokenIndex(token_payload_);
  78. }
  79. auto set_closing_token_index(TokenIndex closing_index) -> void {
  80. CARBON_DCHECK(kind().is_opening_symbol());
  81. token_payload_ = closing_index.index;
  82. }
  83. auto opening_token_index() const -> TokenIndex {
  84. CARBON_DCHECK(kind().is_closing_symbol());
  85. return TokenIndex(token_payload_);
  86. }
  87. auto set_opening_token_index(TokenIndex opening_index) -> void {
  88. CARBON_DCHECK(kind().is_closing_symbol());
  89. token_payload_ = opening_index.index;
  90. }
  91. auto error_length() const -> int {
  92. CARBON_DCHECK(kind() == TokenKind::Error);
  93. return token_payload_;
  94. }
  95. // Zero-based byte offset of the token within the file. This can be combined
  96. // with the buffer's line information to locate the line and column of the
  97. // token as well.
  98. auto byte_offset() const -> int32_t { return byte_offset_; }
  99. // Transforms the token into an error token of the given length but at its
  100. // original position and with the same whitespace adjacency.
  101. auto ResetAsError(int error_length) -> void {
  102. // Construct a fresh token to establish any needed invariants and replace
  103. // this token with it.
  104. TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
  105. byte_offset());
  106. *this = error;
  107. }
  108. private:
  109. friend class Lexer;
  110. static constexpr int PayloadBits = 23;
  111. // Make sure we have enough payload bits to represent token-associated IDs.
  112. static_assert(PayloadBits >= IntId::TokenIdBits);
  113. static_assert(PayloadBits >= TokenIndex::Bits);
  114. // Constructor for a TokenKind that carries no payload, or where the payload
  115. // will be set later.
  116. //
  117. // Only used by the lexer which enforces only the correct kinds are used.
  118. //
  119. // When the payload is not being set, we leave it uninitialized. At least in
  120. // some cases, this will allow MSan to correctly detect erroneous attempts
  121. // to access the payload, as it works to track uninitialized memory
  122. // bit-for-bit specifically to handle complex cases like bitfields.
  123. TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
  124. : kind_(kind),
  125. has_leading_space_(has_leading_space),
  126. byte_offset_(byte_offset) {}
  127. // Constructor for a TokenKind that carries a payload.
  128. //
  129. // Only used by the lexer which enforces the correct kind and payload types.
  130. TokenInfo(TokenKind kind, bool has_leading_space, int payload,
  131. int32_t byte_offset)
  132. : kind_(kind),
  133. has_leading_space_(has_leading_space),
  134. token_payload_(payload),
  135. byte_offset_(byte_offset) {}
  136. // A bitfield that encodes the token's kind, the leading space flag, and the
  137. // remaining bits in a payload. These are encoded together as a bitfield for
  138. // density and because these are the hottest fields of tokens for consumers
  139. // after lexing.
  140. //
  141. // Payload values are typically ID types for which we create at most one per
  142. // token, so we ensure that `token_payload_` is large enough to fit any
  143. // token index. Stores to this field may overflow, but we produce an error
  144. // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
  145. // so this value never overflows if lexing succeeds.
  146. TokenKind kind_;
  147. static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
  148. bool has_leading_space_ : 1;
  149. unsigned token_payload_ : PayloadBits;
  150. // Separate storage for the byte offset, this is hot while lexing but then
  151. // generally cold.
  152. int32_t byte_offset_;
  153. };
  154. static_assert(sizeof(TokenInfo) == 8,
  155. "Expected `TokenInfo` to pack to an 8-byte structure.");
  156. } // namespace Carbon::Lex
  157. #endif // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_