// toolchain/lexer/tokenized_buffer.h
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef CARBON_TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_
#define CARBON_TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_

#include <cstdint>
#include <iterator>
#include <optional>
#include <string>

#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lexer/token_kind.h"
#include "toolchain/source/source_buffer.h"
  21. namespace Carbon {
  22. class TokenizedBuffer;
  23. // A buffer of tokenized Carbon source code.
  24. //
  25. // This is constructed by lexing the source code text into a series of tokens.
  26. // The buffer provides lightweight handles to tokens and other lexed entities,
  27. // as well as iterations to walk the sequence of tokens found in the buffer.
  28. //
  29. // Lexing errors result in a potentially incomplete sequence of tokens and
  30. // `HasError` returning true.
  31. class TokenizedBuffer {
  32. public:
  33. // A lightweight handle to a lexed token in a `TokenizedBuffer`.
  34. //
  35. // `Token` objects are designed to be passed by value, not reference or
  36. // pointer. They are also designed to be small and efficient to store in data
  37. // structures.
  38. //
  39. // `Token` objects from the same `TokenizedBuffer` can be compared with each
  40. // other, both for being the same token within the buffer, and to establish
  41. // relative position within the token stream that has been lexed out of the
  42. // buffer. `Token` objects from different `TokenizedBuffer`s cannot be
  43. // meaningfully compared.
  44. //
  45. // All other APIs to query a `Token` are on the `TokenizedBuffer`.
  46. struct Token : public ComparableIndexBase {
  47. using ComparableIndexBase::ComparableIndexBase;
  48. };
  49. // A lightweight handle to a lexed line in a `TokenizedBuffer`.
  50. //
  51. // `Line` objects are designed to be passed by value, not reference or
  52. // pointer. They are also designed to be small and efficient to store in data
  53. // structures.
  54. //
  55. // Each `Line` object refers to a specific line in the source code that was
  56. // lexed. They can be compared directly to establish that they refer to the
  57. // same line or the relative position of different lines within the source.
  58. //
  59. // All other APIs to query a `Line` are on the `TokenizedBuffer`.
  60. struct Line : public ComparableIndexBase {
  61. using ComparableIndexBase::ComparableIndexBase;
  62. };
  63. // A lightweight handle to a lexed identifier in a `TokenizedBuffer`.
  64. //
  65. // `Identifier` objects are designed to be passed by value, not reference or
  66. // pointer. They are also designed to be small and efficient to store in data
  67. // structures.
  68. //
  69. // Each identifier lexed is canonicalized to a single entry in the identifier
  70. // table. `Identifier` objects will compare equal if they refer to the same
  71. // identifier spelling. Where the identifier was written is not preserved.
  72. //
  73. // All other APIs to query a `Identifier` are on the `TokenizedBuffer`.
  74. struct Identifier : public IndexBase {
  75. using IndexBase::IndexBase;
  76. static const Identifier Invalid;
  77. };
  78. // Random-access iterator over tokens within the buffer.
  79. class TokenIterator
  80. : public llvm::iterator_facade_base<
  81. TokenIterator, std::random_access_iterator_tag, const Token, int> {
  82. public:
  83. TokenIterator() = delete;
  84. explicit TokenIterator(Token token) : token_(token) {}
  85. auto operator==(const TokenIterator& rhs) const -> bool {
  86. return token_ == rhs.token_;
  87. }
  88. auto operator<(const TokenIterator& rhs) const -> bool {
  89. return token_ < rhs.token_;
  90. }
  91. auto operator*() const -> const Token& { return token_; }
  92. using iterator_facade_base::operator-;
  93. auto operator-(const TokenIterator& rhs) const -> int {
  94. return token_.index - rhs.token_.index;
  95. }
  96. auto operator+=(int n) -> TokenIterator& {
  97. token_.index += n;
  98. return *this;
  99. }
  100. auto operator-=(int n) -> TokenIterator& {
  101. token_.index -= n;
  102. return *this;
  103. }
  104. // Prints the raw token index.
  105. auto Print(llvm::raw_ostream& output) const -> void;
  106. private:
  107. friend class TokenizedBuffer;
  108. Token token_;
  109. };
  110. // The value of a real literal.
  111. //
  112. // This is either a dyadic fraction (mantissa * 2^exponent) or a decadic
  113. // fraction (mantissa * 10^exponent).
  114. //
  115. // The `TokenizedBuffer` must outlive any `RealLiteralValue`s referring to
  116. // its tokens.
  117. class RealLiteralValue {
  118. public:
  119. // The mantissa, represented as an unsigned integer.
  120. [[nodiscard]] auto Mantissa() const -> const llvm::APInt& {
  121. return buffer_->literal_int_storage_[literal_index_];
  122. }
  123. // The exponent, represented as a signed integer.
  124. [[nodiscard]] auto Exponent() const -> const llvm::APInt& {
  125. return buffer_->literal_int_storage_[literal_index_ + 1];
  126. }
  127. // If false, the value is mantissa * 2^exponent.
  128. // If true, the value is mantissa * 10^exponent.
  129. [[nodiscard]] auto IsDecimal() const -> bool { return is_decimal_; }
  130. auto Print(llvm::raw_ostream& output_stream) const -> void {
  131. output_stream << Mantissa() << "*" << (is_decimal_ ? "10" : "2") << "^"
  132. << Exponent();
  133. }
  134. private:
  135. friend class TokenizedBuffer;
  136. RealLiteralValue(const TokenizedBuffer* buffer, int32_t literal_index,
  137. bool is_decimal)
  138. : buffer_(buffer),
  139. literal_index_(literal_index),
  140. is_decimal_(is_decimal) {}
  141. const TokenizedBuffer* buffer_;
  142. int32_t literal_index_;
  143. bool is_decimal_;
  144. };
  145. // A diagnostic location translator that maps token locations into source
  146. // buffer locations.
  147. class TokenLocationTranslator : public DiagnosticLocationTranslator<Token> {
  148. public:
  149. explicit TokenLocationTranslator(const TokenizedBuffer* buffer)
  150. : buffer_(buffer) {}
  151. // Map the given token into a diagnostic location.
  152. auto GetLocation(Token token) -> DiagnosticLocation override;
  153. private:
  154. const TokenizedBuffer* buffer_;
  155. };
  156. // Lexes a buffer of source code into a tokenized buffer.
  157. //
  158. // The provided source buffer must outlive any returned `TokenizedBuffer`
  159. // which will refer into the source.
  160. static auto Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
  161. -> TokenizedBuffer;
  162. [[nodiscard]] auto GetKind(Token token) const -> TokenKind;
  163. [[nodiscard]] auto GetLine(Token token) const -> Line;
  164. // Returns the 1-based line number.
  165. [[nodiscard]] auto GetLineNumber(Token token) const -> int;
  166. // Returns the 1-based column number.
  167. [[nodiscard]] auto GetColumnNumber(Token token) const -> int;
  168. // Returns the source text lexed into this token.
  169. [[nodiscard]] auto GetTokenText(Token token) const -> llvm::StringRef;
  170. // Returns the identifier associated with this token. The token kind must be
  171. // an `Identifier`.
  172. [[nodiscard]] auto GetIdentifier(Token token) const -> Identifier;
  173. // Returns the value of an `IntegerLiteral()` token.
  174. [[nodiscard]] auto GetIntegerLiteral(Token token) const -> const llvm::APInt&;
  175. // Returns the value of an `RealLiteral()` token.
  176. [[nodiscard]] auto GetRealLiteral(Token token) const -> RealLiteralValue;
  177. // Returns the value of a `StringLiteral()` token.
  178. [[nodiscard]] auto GetStringLiteral(Token token) const -> llvm::StringRef;
  179. // Returns the size specified in a `*TypeLiteral()` token.
  180. [[nodiscard]] auto GetTypeLiteralSize(Token token) const
  181. -> const llvm::APInt&;
  182. // Returns the closing token matched with the given opening token.
  183. //
  184. // The given token must be an opening token kind.
  185. [[nodiscard]] auto GetMatchedClosingToken(Token opening_token) const -> Token;
  186. // Returns the opening token matched with the given closing token.
  187. //
  188. // The given token must be a closing token kind.
  189. [[nodiscard]] auto GetMatchedOpeningToken(Token closing_token) const -> Token;
  190. // Returns whether the given token has leading whitespace.
  191. [[nodiscard]] auto HasLeadingWhitespace(Token token) const -> bool;
  192. // Returns whether the given token has trailing whitespace.
  193. [[nodiscard]] auto HasTrailingWhitespace(Token token) const -> bool;
  194. // Returns whether the token was created as part of an error recovery effort.
  195. //
  196. // For example, a closing paren inserted to match an unmatched paren.
  197. [[nodiscard]] auto IsRecoveryToken(Token token) const -> bool;
  198. // Returns the 1-based line number.
  199. [[nodiscard]] auto GetLineNumber(Line line) const -> int;
  200. // Returns the 1-based indentation column number.
  201. [[nodiscard]] auto GetIndentColumnNumber(Line line) const -> int;
  202. // Returns the text for an identifier.
  203. [[nodiscard]] auto GetIdentifierText(Identifier id) const -> llvm::StringRef;
  204. // Prints a description of the tokenized stream to the provided `raw_ostream`.
  205. //
  206. // It prints one line of information for each token in the buffer, including
  207. // the kind of token, where it occurs within the source file, indentation for
  208. // the associated line, the spelling of the token in source, and any
  209. // additional information tracked such as which unique identifier it is or any
  210. // matched grouping token.
  211. //
  212. // Each line is formatted as a YAML record:
  213. //
  214. // clang-format off
  215. // ```
  216. // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  217. // ```
  218. // clang-format on
  219. //
  220. // This can be parsed as YAML using tools like `python-yq` combined with `jq`
  221. // on the command line. The format is also reasonably amenable to other
  222. // line-oriented shell tools from `grep` to `awk`.
  223. auto Print(llvm::raw_ostream& output_stream) const -> void;
  224. // Prints a description of a single token. See `Print` for details on the
  225. // format.
  226. auto PrintToken(llvm::raw_ostream& output_stream, Token token) const -> void;
  227. // Returns true if the buffer has errors that are detectable at lexing time.
  228. [[nodiscard]] auto has_errors() const -> bool { return has_errors_; }
  229. [[nodiscard]] auto tokens() const -> llvm::iterator_range<TokenIterator> {
  230. return llvm::make_range(TokenIterator(Token(0)),
  231. TokenIterator(Token(token_infos_.size())));
  232. }
  233. [[nodiscard]] auto size() const -> int { return token_infos_.size(); }
  234. [[nodiscard]] auto expected_parse_tree_size() const -> int {
  235. return expected_parse_tree_size_;
  236. }
  237. private:
  238. // Implementation detail struct implementing the actual lexer logic.
  239. class Lexer;
  240. friend Lexer;
  241. // A diagnostic location translator that maps token locations into source
  242. // buffer locations.
  243. class SourceBufferLocationTranslator
  244. : public DiagnosticLocationTranslator<const char*> {
  245. public:
  246. explicit SourceBufferLocationTranslator(const TokenizedBuffer* buffer)
  247. : buffer_(buffer) {}
  248. // Map the given position within the source buffer into a diagnostic
  249. // location.
  250. auto GetLocation(const char* loc) -> DiagnosticLocation override;
  251. private:
  252. const TokenizedBuffer* buffer_;
  253. };
  254. // Specifies minimum widths to use when printing a token's fields via
  255. // `printToken`.
  256. struct PrintWidths {
  257. // Widens `this` to the maximum of `this` and `new_width` for each
  258. // dimension.
  259. auto Widen(const PrintWidths& widths) -> void;
  260. int index;
  261. int kind;
  262. int line;
  263. int column;
  264. int indent;
  265. };
  266. struct TokenInfo {
  267. TokenKind kind;
  268. // Whether the token has trailing whitespace.
  269. bool has_trailing_space = false;
  270. // Whether the token was injected artificially during error recovery.
  271. bool is_recovery = false;
  272. // Line on which the Token starts.
  273. Line token_line;
  274. // Zero-based byte offset of the token within its line.
  275. int32_t column;
  276. // We may have up to 32 bits of payload, based on the kind of token.
  277. union {
  278. static_assert(
  279. sizeof(Token) <= sizeof(int32_t),
  280. "Unable to pack token and identifier index into the same space!");
  281. Identifier id = Identifier::Invalid;
  282. int32_t literal_index;
  283. Token closing_token;
  284. Token opening_token;
  285. int32_t error_length;
  286. };
  287. };
  288. struct LineInfo {
  289. // The length will always be assigned later. Indent may be assigned if
  290. // non-zero.
  291. explicit LineInfo(int64_t start)
  292. : start(start),
  293. length(static_cast<int32_t>(llvm::StringRef::npos)),
  294. indent(0) {}
  295. // Zero-based byte offset of the start of the line within the source buffer
  296. // provided.
  297. int64_t start;
  298. // The byte length of the line. Does not include the newline character (or a
  299. // nul-terminator or EOF).
  300. int32_t length;
  301. // The byte offset from the start of the line of the first non-whitespace
  302. // character.
  303. int32_t indent;
  304. };
  305. struct IdentifierInfo {
  306. llvm::StringRef text;
  307. };
  308. // The constructor is merely responsible for trivial initialization of
  309. // members. A working object of this type is built with the `lex` function
  310. // above so that its return can indicate if an error was encountered while
  311. // lexing.
  312. explicit TokenizedBuffer(SourceBuffer& source) : source_(&source) {}
  313. auto GetLineInfo(Line line) -> LineInfo&;
  314. [[nodiscard]] auto GetLineInfo(Line line) const -> const LineInfo&;
  315. auto AddLine(LineInfo info) -> Line;
  316. auto GetTokenInfo(Token token) -> TokenInfo&;
  317. [[nodiscard]] auto GetTokenInfo(Token token) const -> const TokenInfo&;
  318. auto AddToken(TokenInfo info) -> Token;
  319. [[nodiscard]] auto GetTokenPrintWidths(Token token) const -> PrintWidths;
  320. auto PrintToken(llvm::raw_ostream& output_stream, Token token,
  321. PrintWidths widths) const -> void;
  322. SourceBuffer* source_;
  323. llvm::SmallVector<TokenInfo> token_infos_;
  324. llvm::SmallVector<LineInfo> line_infos_;
  325. llvm::SmallVector<IdentifierInfo> identifier_infos_;
  326. // Storage for integers that form part of the value of a numeric or type
  327. // literal.
  328. llvm::SmallVector<llvm::APInt> literal_int_storage_;
  329. llvm::SmallVector<std::string> literal_string_storage_;
  330. llvm::DenseMap<llvm::StringRef, Identifier> identifier_map_;
  331. // The number of parse tree nodes that we expect to be created for the tokens
  332. // in this buffer.
  333. int expected_parse_tree_size_ = 0;
  334. bool has_errors_ = false;
  335. };
  336. constexpr TokenizedBuffer::Identifier TokenizedBuffer::Identifier::Invalid =
  337. TokenizedBuffer::Identifier(TokenizedBuffer::Identifier::InvalidIndex);
  338. // A diagnostic emitter that uses positions within a source buffer's text as
  339. // its source of location information.
  340. using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
  341. // A diagnostic emitter that uses tokens as its source of location information.
  342. using TokenDiagnosticEmitter = DiagnosticEmitter<TokenizedBuffer::Token>;
  343. } // namespace Carbon
  344. #endif // CARBON_TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_