// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
#define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_

#include <compare>
#include <cstdint>
#include <iterator>
#include <utility>

#include "common/check.h"
#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/value_store.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/source/source_buffer.h"

namespace Carbon::Lex {

class TokenizedBuffer;

// A lightweight handle to a lexed line in a `TokenizedBuffer`.
//
// `LineIndex` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// Each `LineIndex` object refers to a specific line in the source code that
// was lexed. They can be compared directly to establish that they refer to the
// same line or the relative position of different lines within the source.
//
// All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
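//
// A hypothetical sketch of direct comparison, assuming `line_a` and `line_b`
// are valid handles into the same buffer:
//
// ```
// if (line_a < line_b) {
//   // `line_a` is an earlier line in the source than `line_b`.
// }
// ```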
struct LineIndex : public IndexBase {
  static const LineIndex Invalid;
  using IndexBase::IndexBase;
};

constexpr LineIndex LineIndex::Invalid(LineIndex::InvalidIndex);

// Random-access iterator over tokens within the buffer.
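//
// A hypothetical sketch of the random-access operations, assuming the buffer
// holds at least three tokens:
//
// ```
// TokenIterator it(TokenIndex(0));
// TokenIterator end(TokenIndex(3));
// int distance = end - it;  // 3
// it += 2;                  // `*it` is now `TokenIndex(2)`.
// ```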
class TokenIterator
    : public llvm::iterator_facade_base<TokenIterator,
                                        std::random_access_iterator_tag,
                                        const TokenIndex, int>,
      public Printable<TokenIterator> {
 public:
  TokenIterator() = delete;

  explicit TokenIterator(TokenIndex token) : token_(token) {}

  auto operator==(const TokenIterator& rhs) const -> bool {
    return token_ == rhs.token_;
  }
  auto operator<=>(const TokenIterator& rhs) const -> std::strong_ordering {
    return token_ <=> rhs.token_;
  }

  auto operator*() const -> const TokenIndex& { return token_; }

  using iterator_facade_base::operator-;
  auto operator-(const TokenIterator& rhs) const -> int {
    return token_.index - rhs.token_.index;
  }

  auto operator+=(int n) -> TokenIterator& {
    token_.index += n;
    return *this;
  }
  auto operator-=(int n) -> TokenIterator& {
    token_.index -= n;
    return *this;
  }

  // Prints the raw token index.
  auto Print(llvm::raw_ostream& output) const -> void;

 private:
  friend class TokenizedBuffer;

  TokenIndex token_;
};

// A diagnostic location converter that maps token locations into source
// buffer locations.
class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
 public:
  explicit TokenDiagnosticConverter(const TokenizedBuffer* buffer)
      : buffer_(buffer) {}

  // Map the given token into a diagnostic location.
  auto ConvertLoc(TokenIndex token, ContextFnT context_fn) const
      -> DiagnosticLoc override;

 private:
  const TokenizedBuffer* buffer_;
};

// A buffer of tokenized Carbon source code.
//
// This is constructed by lexing the source code text into a series of tokens.
// The buffer provides lightweight handles to tokens and other lexed entities,
// as well as iterators to walk the sequence of tokens found in the buffer.
//
// Lexing errors result in a potentially incomplete sequence of tokens and
// `has_errors()` returning true.
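//
// A hypothetical usage sketch, assuming `buffer` is a valid `TokenizedBuffer`
// produced by `Lex::Lex` (see the constructor comment below):
//
// ```
// for (TokenIndex token : buffer.tokens()) {
//   if (buffer.GetKind(token) == TokenKind::Identifier) {
//     llvm::errs() << buffer.GetTokenText(token) << " at line "
//                  << buffer.GetLineNumber(token) << "\n";
//   }
// }
// ```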
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // Returns the kind of the given token.
  auto GetKind(TokenIndex token) const -> TokenKind;
  // Returns the line on which the given token starts.
  auto GetLine(TokenIndex token) const -> LineIndex;

  // Returns the 1-based line number.
  auto GetLineNumber(TokenIndex token) const -> int;
  // Returns the 1-based column number.
  auto GetColumnNumber(TokenIndex token) const -> int;
  // Returns the line and 1-based column number of the first character after
  // this token.
  auto GetEndLoc(TokenIndex token) const -> std::pair<LineIndex, int>;

  // Returns the source text lexed into this token.
  auto GetTokenText(TokenIndex token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must be
  // an `Identifier`.
  auto GetIdentifier(TokenIndex token) const -> IdentifierId;

  // Returns the value of an `IntLiteral()` token.
  auto GetIntLiteral(TokenIndex token) const -> IntId;

  // Returns the value of a `RealLiteral()` token.
  auto GetRealLiteral(TokenIndex token) const -> RealId;

  // Returns the value of a `StringLiteral()` token.
  auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;

  // Returns the size specified in a `*TypeLiteral()` token.
  auto GetTypeLiteralSize(TokenIndex token) const -> IntId;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  auto GetMatchedClosingToken(TokenIndex opening_token) const -> TokenIndex;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  auto GetMatchedOpeningToken(TokenIndex closing_token) const -> TokenIndex;

  // Returns whether the given token has leading whitespace.
  auto HasLeadingWhitespace(TokenIndex token) const -> bool;
  // Returns whether the given token has trailing whitespace.
  auto HasTrailingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the token was created as part of an error recovery effort.
  //
  // For example, a closing paren inserted to match an unmatched paren.
  auto IsRecoveryToken(TokenIndex token) const -> bool;

  // Returns the 1-based line number.
  auto GetLineNumber(LineIndex line) const -> int;
  // Returns the 1-based indentation column number.
  auto GetIndentColumnNumber(LineIndex line) const -> int;
  // Returns the next line handle.
  auto GetNextLine(LineIndex line) const -> LineIndex;
  // Returns the previous line handle.
  auto GetPrevLine(LineIndex line) const -> LineIndex;

  // Prints a description of the tokenized stream to the provided
  // `raw_ostream`.
  //
  // It prints one line of information for each token in the buffer, including
  // the kind of token, where it occurs within the source file, indentation for
  // the associated line, the spelling of the token in source, and any
  // additional information tracked such as which unique identifier it is or
  // any matched grouping token.
  //
  // Each line is formatted as a YAML record:
  //
  // clang-format off
  // ```
  // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  // ```
  // clang-format on
  //
  // This can be parsed as YAML using tools like `python-yq` combined with `jq`
  // on the command line. The format is also reasonably amenable to other
  // line-oriented shell tools from `grep` to `awk`.
  auto Print(llvm::raw_ostream& output_stream) const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
      -> void;

  // Collects memory usage of members.
  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
      -> void;

  // Returns true if the buffer has errors that were detected at lexing time.
  auto has_errors() const -> bool { return has_errors_; }
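
  // Returns an iterator range over all tokens in the buffer, in source order.
  // A hypothetical sketch, assuming `buffer` is a lexed `TokenizedBuffer`:
  //
  // ```
  // int semis = 0;
  // for (TokenIndex token : buffer.tokens()) {
  //   semis += buffer.GetKind(token) == TokenKind::Semi ? 1 : 0;
  // }
  // ```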
  auto tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(TokenIndex(0)),
                            TokenIterator(TokenIndex(token_infos_.size())));
  }

  // Returns the number of tokens in the buffer.
  auto size() const -> int { return token_infos_.size(); }

  // This is an upper bound on the number of output parse nodes in the absence
  // of errors.
  auto expected_max_parse_tree_size() const -> int {
    return expected_max_parse_tree_size_;
  }

  auto source() const -> const SourceBuffer& { return *source_; }

 private:
  friend class Lexer;
  friend class TokenDiagnosticConverter;

  // A diagnostic location converter that maps positions within the source
  // buffer's text into diagnostic locations.
  class SourceBufferDiagnosticConverter
      : public DiagnosticConverter<const char*> {
   public:
    explicit SourceBufferDiagnosticConverter(const TokenizedBuffer* buffer)
        : buffer_(buffer) {}

    // Map the given position within the source buffer into a diagnostic
    // location.
    auto ConvertLoc(const char* loc, ContextFnT context_fn) const
        -> DiagnosticLoc override;

   private:
    const TokenizedBuffer* buffer_;
  };

  // Specifies minimum widths to use when printing a token's fields via
  // `PrintToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `widths` for each dimension.
    auto Widen(const PrintWidths& widths) -> void;

    int index;
    int kind;
    int line;
    int column;
    int indent;
  };

  // Storage for the information about a specific token in the buffer.
  //
  // This provides a friendly accessor API to the carefully space-optimized
  // storage model of the information we associate with each token.
  //
  // There are four pieces of information stored here:
  // - The kind of the token.
  // - Whether that token has leading whitespace before it.
  // - A kind-specific payload that can be compressed into a small integer.
  //   - This class provides dedicated accessors for each different form of
  //     payload that check the kind and payload correspond correctly.
  // - A 32-bit byte offset of the token within the source text.
  //
  // These are compressed and stored in 8 bytes for each token.
  //
  // Note that while the class provides some limited setters for payloads and
  // mutating methods, setters on this type may be unexpectedly expensive due
  // to the bit-packed representation and should be avoided. As such, only the
  // minimal necessary setters are provided.
  //
  // TODO: It might be worth considering a struct-of-arrays data layout in
  // order to move the byte offset to a separate array from the rest as it is
  // only hot during lexing, and then cold during parsing and semantic
  // analysis. However, a trivial approach to that adds more overhead than it
  // saves due to tracking two separate vectors and their growth. Making this
  // profitable would likely at least require a highly specialized single
  // vector that manages the growth once and then provides separate storage
  // areas for the two arrays.
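  //
  // A worked sketch of the packing, assuming `TokenKind` occupies a single
  // byte (which the 8-byte `static_assert` below implies): the kind takes
  // `sizeof(TokenKind) * 8` = 8 bits, the leading-space flag takes 1 bit, and
  // the payload takes `PayloadBits` = 23 bits, so 8 + 1 + 23 = 32 bits fill
  // the first word; the 32-bit `byte_offset_` fills the second, for 8 bytes
  // per token.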
  class TokenInfo {
   public:
    // The kind for this token.
    auto kind() const -> TokenKind { return TokenKind::Make(kind_); }

    // Whether this token is preceded by whitespace. We only store the
    // preceding state, and look at the next token to check for trailing
    // whitespace.
    auto has_leading_space() const -> bool { return has_leading_space_; }

    // A collection of methods to access the specific payload included with
    // particular kinds of tokens. Only the specific payload accessor below may
    // be used for an info entry of a token with a particular kind, and these
    // check that the kind is valid. Some tokens do not include a payload at
    // all, and for those none of these methods may be called.
    auto ident_id() const -> IdentifierId {
      CARBON_DCHECK(kind() == TokenKind::Identifier);
      return IdentifierId(token_payload_);
    }
    auto set_ident_id(IdentifierId ident_id) -> void {
      CARBON_DCHECK(kind() == TokenKind::Identifier);
      CARBON_DCHECK(ident_id.index < (2 << PayloadBits));
      token_payload_ = ident_id.index;
    }

    auto string_literal_id() const -> StringLiteralValueId {
      CARBON_DCHECK(kind() == TokenKind::StringLiteral);
      return StringLiteralValueId(token_payload_);
    }

    auto int_id() const -> IntId {
      CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
                    kind() == TokenKind::IntTypeLiteral ||
                    kind() == TokenKind::UnsignedIntTypeLiteral ||
                    kind() == TokenKind::FloatTypeLiteral);
      return IntId(token_payload_);
    }

    auto real_id() const -> RealId {
      CARBON_DCHECK(kind() == TokenKind::RealLiteral);
      return RealId(token_payload_);
    }

    auto closing_token_index() const -> TokenIndex {
      CARBON_DCHECK(kind().is_opening_symbol());
      return TokenIndex(token_payload_);
    }
    auto set_closing_token_index(TokenIndex closing_index) -> void {
      CARBON_DCHECK(kind().is_opening_symbol());
      CARBON_DCHECK(closing_index.index < (2 << PayloadBits));
      token_payload_ = closing_index.index;
    }

    auto opening_token_index() const -> TokenIndex {
      CARBON_DCHECK(kind().is_closing_symbol());
      return TokenIndex(token_payload_);
    }
    auto set_opening_token_index(TokenIndex opening_index) -> void {
      CARBON_DCHECK(kind().is_closing_symbol());
      CARBON_DCHECK(opening_index.index < (2 << PayloadBits));
      token_payload_ = opening_index.index;
    }

    auto error_length() const -> int {
      CARBON_DCHECK(kind() == TokenKind::Error);
      return token_payload_;
    }

    // Zero-based byte offset of the token within the file. This can be
    // combined with the buffer's line information to locate the line and
    // column of the token as well.
    auto byte_offset() const -> int32_t { return byte_offset_; }

    // Transforms the token into an error token of the given length, but at its
    // original position and with the same whitespace adjacency.
    auto ResetAsError(int error_length) -> void {
      // Construct a fresh token to establish any needed invariants and replace
      // this token with it.
      TokenInfo error(TokenKind::Error, has_leading_space(), error_length,
                      byte_offset());
      *this = error;
    }

   private:
    friend class Lexer;

    static constexpr int PayloadBits = 23;

    // Constructor for a TokenKind that carries no payload, or where the
    // payload will be set later.
    //
    // Only used by the lexer, which enforces that only the correct kinds are
    // used.
    //
    // When the payload is not being set, we leave it uninitialized. At least
    // in some cases, this will allow MSan to correctly detect erroneous
    // attempts to access the payload, as it works to track uninitialized
    // memory bit-for-bit specifically to handle complex cases like bitfields.
    TokenInfo(TokenKind kind, bool has_leading_space, int32_t byte_offset)
        : kind_(kind),
          has_leading_space_(has_leading_space),
          byte_offset_(byte_offset) {}

    // Constructor for a TokenKind that carries a payload.
    //
    // Only used by the lexer, which enforces the correct kind and payload
    // types.
    TokenInfo(TokenKind kind, bool has_leading_space, int payload,
              int32_t byte_offset)
        : kind_(kind),
          has_leading_space_(has_leading_space),
          token_payload_(payload),
          byte_offset_(byte_offset) {
      CARBON_DCHECK(payload >= 0 && payload < (2 << PayloadBits),
                    "Payload won't fit into unsigned bit pack: {0}", payload);
    }

    // A bitfield that encodes the token's kind, the leading space flag, and
    // the remaining bits in a payload. These are encoded together as a
    // bitfield for density and because these are the hottest fields of tokens
    // for consumers after lexing.
    TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
    bool has_leading_space_ : 1;
    unsigned token_payload_ : PayloadBits;

    // Separate storage for the byte offset; this is hot while lexing but then
    // generally cold.
    int32_t byte_offset_;
  };

  static_assert(sizeof(TokenInfo) == 8,
                "Expected `TokenInfo` to pack to an 8-byte structure.");

  struct LineInfo {
    explicit LineInfo(int32_t start) : start(start), indent(0) {}

    // Zero-based byte offset of the start of the line within the source
    // buffer provided.
    int32_t start;

    // The byte offset from the start of the line of the first non-whitespace
    // character.
    int32_t indent;
  };

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with `Lex::Lex` so that
  // its return can indicate if an error was encountered while lexing.
  explicit TokenizedBuffer(SharedValueStores& value_stores,
                           SourceBuffer& source)
      : value_stores_(&value_stores), source_(&source) {}

  auto FindLineIndex(int32_t byte_offset) const -> LineIndex;

  auto GetLineInfo(LineIndex line) -> LineInfo&;
  auto GetLineInfo(LineIndex line) const -> const LineInfo&;
  auto AddLine(LineInfo info) -> LineIndex;

  auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
  auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
  auto AddToken(TokenInfo info) -> TokenIndex;

  auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                  PrintWidths widths) const -> void;

  // Used to allocate computed string literals.
  llvm::BumpPtrAllocator allocator_;

  SharedValueStores* value_stores_;
  SourceBuffer* source_;

  llvm::SmallVector<TokenInfo> token_infos_;
  llvm::SmallVector<LineInfo> line_infos_;

  // An upper bound on the number of parse tree nodes that we expect to be
  // created for the tokens in this buffer.
  int expected_max_parse_tree_size_ = 0;

  bool has_errors_ = false;

  // A vector of flags for recovery tokens. If empty, there are none. When
  // doing token recovery, this will be extended to be indexable by token
  // indices and contain true for the tokens that were synthesized for
  // recovery.
  llvm::BitVector recovery_tokens_;
};

// A diagnostic emitter that uses positions within a source buffer's text as
// its source of location information.
using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;

// A diagnostic emitter that uses tokens as its source of location information.
using TokenDiagnosticEmitter = DiagnosticEmitter<TokenIndex>;

inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
  return GetTokenInfo(token).kind();
}

inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
    -> bool {
  return GetTokenInfo(token).has_leading_space();
}

inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
    -> bool {
  TokenIterator it(token);
  ++it;
  return it != tokens().end() && GetTokenInfo(*it).has_leading_space();
}

inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) -> TokenInfo& {
  return token_infos_[token.index];
}

inline auto TokenizedBuffer::GetTokenInfo(TokenIndex token) const
    -> const TokenInfo& {
  return token_infos_[token.index];
}

inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
  TokenIndex index(token_infos_.size());
  token_infos_.push_back(info);
  expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
  return index;
}

}  // namespace Carbon::Lex

#endif  // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_