// toolchain/lex/token_info.h
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
  5. #define CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_
  6. #include "common/check.h"
  7. #include "toolchain/base/int.h"
  8. #include "toolchain/base/value_ids.h"
  9. #include "toolchain/lex/token_index.h"
  10. #include "toolchain/lex/token_kind.h"
  11. namespace Carbon::Lex {
  12. // A character as a unicode code point.
  13. //
  14. // Unicode requires 21 bits, which should fit inside `TokenInfo::PayloadBits`,
  15. // so we store the value directly.
  16. struct CharLiteralValue {
  17. int32_t value;
  18. };
  19. // Storage for the information about a specific token, as an implementation
  20. // detail of `TokenizedBuffer`.
  21. //
  22. // This provides a friendly accessor API to the carefully space-optimized
  23. // storage model of the information we associated with each token.
  24. //
  25. // There are four pieces of information stored here:
  26. // - The kind of the token.
  27. // - Whether that token has leading whitespace before it.
  28. // - A kind-specific payload that can be compressed into a small integer.
  29. // - This class provides dedicated accessors for each different form of
  30. // payload that check the kind and payload correspond correctly.
  31. // - A 32-bit byte offset of the token within the source text.
  32. //
  33. // These are compressed and stored in 8-bytes for each token.
  34. //
  35. // Note that while the class provides some limited setters for payloads and
  36. // mutating methods, setters on this type may be unexpectedly expensive due to
  37. // the bit-packed representation and should be avoided. As such, only the
  38. // minimal necessary setters are provided.
  39. //
  40. // TODO: It might be worth considering a struct-of-arrays data layout in order
  41. // to move the byte offset to a separate array from the rest as it is only hot
  42. // during lexing, and then cold during parsing and semantic analysis. However,
  43. // a trivial approach to that adds more overhead than it saves due to tracking
  44. // two separate vectors and their growth. Making this profitable would likely
  45. // at least require a highly specialized single vector that manages the growth
  46. // once and then provides separate storage areas for the two arrays.
  47. class TokenInfo {
  48. public:
  49. // The kind for this token.
  50. auto kind() const -> TokenKind { return kind_; }
  51. // Whether this token is preceded by whitespace. We only store the preceding
  52. // state, and look at the next token to check for trailing whitespace.
  53. auto has_leading_space() const -> bool { return has_leading_space_; }
  54. // A collection of methods to access the specific payload included with
  55. // particular kinds of tokens. Only the specific payload accessor below may
  56. // be used for an info entry of a token with a particular kind, and these
  57. // check that the kind is valid. Some tokens do not include a payload at all
  58. // and none of these methods may be called.
  59. auto ident_id() const -> IdentifierId {
  60. CARBON_DCHECK(kind() == TokenKind::Identifier);
  61. return IdentifierId(token_payload_);
  62. }
  63. auto set_ident_id(IdentifierId ident_id) -> void {
  64. CARBON_DCHECK(kind() == TokenKind::Identifier);
  65. token_payload_ = ident_id.index;
  66. }
  67. auto string_literal_id() const -> StringLiteralValueId {
  68. CARBON_DCHECK(kind() == TokenKind::StringLiteral);
  69. return StringLiteralValueId(token_payload_);
  70. }
  71. auto char_literal() const -> CharLiteralValue {
  72. CARBON_DCHECK(kind() == TokenKind::CharLiteral);
  73. return CharLiteralValue(token_payload_);
  74. }
  75. auto int_id() const -> IntId {
  76. CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
  77. kind() == TokenKind::IntTypeLiteral ||
  78. kind() == TokenKind::UnsignedIntTypeLiteral ||
  79. kind() == TokenKind::FloatTypeLiteral);
  80. return IntId::MakeFromTokenPayload(token_payload_);
  81. }
  82. auto real_id() const -> RealId {
  83. CARBON_DCHECK(kind() == TokenKind::RealLiteral);
  84. return RealId(token_payload_);
  85. }
  86. auto closing_token_index() const -> TokenIndex {
  87. CARBON_DCHECK(kind().is_opening_symbol());
  88. return TokenIndex(token_payload_);
  89. }
  90. auto set_closing_token_index(TokenIndex closing_index) -> void {
  91. CARBON_DCHECK(kind().is_opening_symbol());
  92. token_payload_ = closing_index.index;
  93. }
  94. auto opening_token_index() const -> TokenIndex {
  95. CARBON_DCHECK(kind().is_closing_symbol());
  96. return TokenIndex(token_payload_);
  97. }
  98. auto set_opening_token_index(TokenIndex opening_index) -> void {
  99. CARBON_DCHECK(kind().is_closing_symbol());
  100. token_payload_ = opening_index.index;
  101. }
  102. auto error_length() const -> int {
  103. CARBON_DCHECK(kind() == TokenKind::Error);
  104. return token_payload_;
  105. }
  106. // Zero-based byte offset of the token within the file. This can be combined
  107. // with the buffer's line information to locate the line and column of the
  108. // token as well.
  109. auto byte_offset() const -> int32_t { return byte_offset_; }
  110. // Transforms the token into an error token of the given length but at its
  111. // original position and with the same whitespace adjacency.
  112. auto ResetAsError(int error_length) -> void {
  113. // Construct a fresh token to establish any needed invariants and replace
  114. // this token with it.
  115. TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
  116. byte_offset());
  117. *this = error;
  118. }
  119. private:
  120. friend class Lexer;
  121. static constexpr int PayloadBits = 23;
  122. // Make sure we have enough payload bits to represent token-associated IDs.
  123. static_assert(PayloadBits >= IntId::TokenIdBits);
  124. static_assert(PayloadBits >= TokenIndex::Bits);
  125. // Constructor for a TokenKind that carries no payload, or where the payload
  126. // will be set later.
  127. //
  128. // Only used by the lexer which enforces only the correct kinds are used.
  129. //
  130. // When the payload is not being set, we leave it uninitialized. At least in
  131. // some cases, this will allow MSan to correctly detect erroneous attempts
  132. // to access the payload, as it works to track uninitialized memory
  133. // bit-for-bit specifically to handle complex cases like bitfields.
  134. TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
  135. : kind_(kind),
  136. has_leading_space_(has_leading_space),
  137. byte_offset_(byte_offset) {}
  138. // Constructor for a TokenKind that carries a payload.
  139. //
  140. // Only used by the lexer which enforces the correct kind and payload types.
  141. TokenInfo(TokenKind kind, bool has_leading_space, int payload,
  142. int32_t byte_offset)
  143. : kind_(kind),
  144. has_leading_space_(has_leading_space),
  145. token_payload_(payload),
  146. byte_offset_(byte_offset) {}
  147. // A bitfield that encodes the token's kind, the leading space flag, and the
  148. // remaining bits in a payload. These are encoded together as a bitfield for
  149. // density and because these are the hottest fields of tokens for consumers
  150. // after lexing.
  151. //
  152. // Payload values are typically ID types for which we create at most one per
  153. // token, so we ensure that `token_payload_` is large enough to fit any
  154. // token index. Stores to this field may overflow, but we produce an error
  155. // in `Lexer::Finalize` if the file has more than `TokenIndex::Max` tokens,
  156. // so this value never overflows if lexing succeeds.
  157. TokenKind kind_;
  158. static_assert(sizeof(kind_) == 1, "TokenKind must pack to 8 bits");
  159. bool has_leading_space_ : 1;
  160. unsigned token_payload_ : PayloadBits;
  161. // Separate storage for the byte offset, this is hot while lexing but then
  162. // generally cold.
  163. int32_t byte_offset_;
  164. };
  165. static_assert(sizeof(TokenInfo) == 8,
  166. "Expected `TokenInfo` to pack to an 8-byte structure.");
  167. } // namespace Carbon::Lex
  168. #endif // CARBON_TOOLCHAIN_LEX_TOKEN_INFO_H_