// tokenized_buffer.h
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_
  5. #define CARBON_TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_
  6. #include <cstdint>
  7. #include <iterator>
  8. #include "common/ostream.h"
  9. #include "llvm/ADT/APInt.h"
  10. #include "llvm/ADT/DenseMap.h"
  11. #include "llvm/ADT/Optional.h"
  12. #include "llvm/ADT/SmallVector.h"
  13. #include "llvm/ADT/StringRef.h"
  14. #include "llvm/ADT/iterator.h"
  15. #include "llvm/ADT/iterator_range.h"
  16. #include "llvm/Support/raw_ostream.h"
  17. #include "toolchain/diagnostics/diagnostic_emitter.h"
  18. #include "toolchain/lexer/token_kind.h"
  19. #include "toolchain/source/source_buffer.h"
  20. namespace Carbon {
  21. class TokenizedBuffer;
  22. namespace Internal {
  23. // A lightweight handle to a lexed token in a `TokenizedBuffer`.
  24. //
  25. // This type's preferred name is `TokenizedBuffer::Token` and is only defined
  26. // outside the class to break a dependency cycle.
  27. //
  28. // `Token` objects are designed to be passed by value, not reference or
  29. // pointer. They are also designed to be small and efficient to store in data
  30. // structures.
  31. //
  32. // `Token` objects from the same `TokenizedBuffer` can be compared with each
  33. // other, both for being the same token within the buffer, and to establish
  34. // relative position within the token stream that has been lexed out of the
  35. // buffer. `Token` objects from different `TokenizedBuffer`s cannot be
  36. // meaningfully compared.
  37. //
  38. // All other APIs to query a `Token` are on the `TokenizedBuffer`.
  39. class TokenizedBufferToken {
  40. public:
  41. using Token = TokenizedBufferToken;
  42. TokenizedBufferToken() = default;
  43. friend auto operator==(Token lhs, Token rhs) -> bool {
  44. return lhs.index_ == rhs.index_;
  45. }
  46. friend auto operator!=(Token lhs, Token rhs) -> bool {
  47. return lhs.index_ != rhs.index_;
  48. }
  49. friend auto operator<(Token lhs, Token rhs) -> bool {
  50. return lhs.index_ < rhs.index_;
  51. }
  52. friend auto operator<=(Token lhs, Token rhs) -> bool {
  53. return lhs.index_ <= rhs.index_;
  54. }
  55. friend auto operator>(Token lhs, Token rhs) -> bool {
  56. return lhs.index_ > rhs.index_;
  57. }
  58. friend auto operator>=(Token lhs, Token rhs) -> bool {
  59. return lhs.index_ >= rhs.index_;
  60. }
  61. private:
  62. friend TokenizedBuffer;
  63. explicit TokenizedBufferToken(int index) : index_(index) {}
  64. int32_t index_;
  65. };
  66. } // namespace Internal
  67. // A buffer of tokenized Carbon source code.
  68. //
  69. // This is constructed by lexing the source code text into a series of tokens.
  70. // The buffer provides lightweight handles to tokens and other lexed entities,
  71. // as well as iterations to walk the sequence of tokens found in the buffer.
  72. //
  73. // Lexing errors result in a potentially incomplete sequence of tokens and
  74. // `HasError` returning true.
  75. class TokenizedBuffer {
  76. public:
  77. // A lightweight handle to a lexed token in a `TokenizedBuffer`.
  78. using Token = Internal::TokenizedBufferToken;
  79. // A lightweight handle to a lexed line in a `TokenizedBuffer`.
  80. //
  81. // `Line` objects are designed to be passed by value, not reference or
  82. // pointer. They are also designed to be small and efficient to store in data
  83. // structures.
  84. //
  85. // Each `Line` object refers to a specific line in the source code that was
  86. // lexed. They can be compared directly to establish that they refer to the
  87. // same line or the relative position of different lines within the source.
  88. //
  89. // All other APIs to query a `Line` are on the `TokenizedBuffer`.
  90. class Line {
  91. public:
  92. Line() = default;
  93. friend auto operator==(Line lhs, Line rhs) -> bool {
  94. return lhs.index_ == rhs.index_;
  95. }
  96. friend auto operator!=(Line lhs, Line rhs) -> bool {
  97. return lhs.index_ != rhs.index_;
  98. }
  99. friend auto operator<(Line lhs, Line rhs) -> bool {
  100. return lhs.index_ < rhs.index_;
  101. }
  102. friend auto operator<=(Line lhs, Line rhs) -> bool {
  103. return lhs.index_ <= rhs.index_;
  104. }
  105. friend auto operator>(Line lhs, Line rhs) -> bool {
  106. return lhs.index_ > rhs.index_;
  107. }
  108. friend auto operator>=(Line lhs, Line rhs) -> bool {
  109. return lhs.index_ >= rhs.index_;
  110. }
  111. private:
  112. friend class TokenizedBuffer;
  113. explicit Line(int index) : index_(index) {}
  114. int32_t index_;
  115. };
  116. // A lightweight handle to a lexed identifier in a `TokenizedBuffer`.
  117. //
  118. // `Identifier` objects are designed to be passed by value, not reference or
  119. // pointer. They are also designed to be small and efficient to store in data
  120. // structures.
  121. //
  122. // Each identifier lexed is canonicalized to a single entry in the identifier
  123. // table. `Identifier` objects will compare equal if they refer to the same
  124. // identifier spelling. Where the identifier was written is not preserved.
  125. //
  126. // All other APIs to query a `Identifier` are on the `TokenizedBuffer`.
  127. class Identifier {
  128. public:
  129. Identifier() = default;
  130. // Most normal APIs are provided by the `TokenizedBuffer`, we just support
  131. // basic comparison operations.
  132. friend auto operator==(Identifier lhs, Identifier rhs) -> bool {
  133. return lhs.index_ == rhs.index_;
  134. }
  135. friend auto operator!=(Identifier lhs, Identifier rhs) -> bool {
  136. return lhs.index_ != rhs.index_;
  137. }
  138. private:
  139. friend class TokenizedBuffer;
  140. explicit Identifier(int index) : index_(index) {}
  141. int32_t index_;
  142. };
  143. // Random-access iterator over tokens within the buffer.
  144. class TokenIterator
  145. : public llvm::iterator_facade_base<
  146. TokenIterator, std::random_access_iterator_tag, const Token, int> {
  147. public:
  148. TokenIterator() = default;
  149. explicit TokenIterator(Token token) : token_(token) {}
  150. auto operator==(const TokenIterator& rhs) const -> bool {
  151. return token_ == rhs.token_;
  152. }
  153. auto operator<(const TokenIterator& rhs) const -> bool {
  154. return token_ < rhs.token_;
  155. }
  156. auto operator*() const -> const Token& { return token_; }
  157. using iterator_facade_base::operator-;
  158. auto operator-(const TokenIterator& rhs) const -> int {
  159. return token_.index_ - rhs.token_.index_;
  160. }
  161. auto operator+=(int n) -> TokenIterator& {
  162. token_.index_ += n;
  163. return *this;
  164. }
  165. auto operator-=(int n) -> TokenIterator& {
  166. token_.index_ -= n;
  167. return *this;
  168. }
  169. // Prints the raw token index.
  170. auto Print(llvm::raw_ostream& output) const -> void;
  171. private:
  172. friend class TokenizedBuffer;
  173. Token token_;
  174. };
  175. // The value of a real literal.
  176. //
  177. // This is either a dyadic fraction (mantissa * 2^exponent) or a decadic
  178. // fraction (mantissa * 10^exponent).
  179. //
  180. // The `TokenizedBuffer` must outlive any `RealLiteralValue`s referring to
  181. // its tokens.
  182. class RealLiteralValue {
  183. public:
  184. // The mantissa, represented as an unsigned integer.
  185. [[nodiscard]] auto Mantissa() const -> const llvm::APInt& {
  186. return buffer_->literal_int_storage_[literal_index_];
  187. }
  188. // The exponent, represented as a signed integer.
  189. [[nodiscard]] auto Exponent() const -> const llvm::APInt& {
  190. return buffer_->literal_int_storage_[literal_index_ + 1];
  191. }
  192. // If false, the value is mantissa * 2^exponent.
  193. // If true, the value is mantissa * 10^exponent.
  194. [[nodiscard]] auto IsDecimal() const -> bool { return is_decimal_; }
  195. void Print(llvm::raw_ostream& output_stream) const {
  196. output_stream << Mantissa() << "*" << (is_decimal_ ? "10" : "2") << "^"
  197. << Exponent();
  198. }
  199. private:
  200. friend class TokenizedBuffer;
  201. RealLiteralValue(const TokenizedBuffer* buffer, int32_t literal_index,
  202. bool is_decimal)
  203. : buffer_(buffer),
  204. literal_index_(literal_index),
  205. is_decimal_(is_decimal) {}
  206. const TokenizedBuffer* buffer_;
  207. int32_t literal_index_;
  208. bool is_decimal_;
  209. };
  210. // A diagnostic location translator that maps token locations into source
  211. // buffer locations.
  212. class TokenLocationTranslator
  213. : public DiagnosticLocationTranslator<Internal::TokenizedBufferToken> {
  214. public:
  215. explicit TokenLocationTranslator(TokenizedBuffer& buffer,
  216. int* last_line_lexed_to_column)
  217. : buffer_(&buffer),
  218. last_line_lexed_to_column_(last_line_lexed_to_column) {}
  219. // Map the given token into a diagnostic location.
  220. auto GetLocation(Token token) -> DiagnosticLocation override;
  221. private:
  222. TokenizedBuffer* buffer_;
  223. // Passed to SourceBufferLocationTranslator.
  224. int* last_line_lexed_to_column_;
  225. };
  226. // Lexes a buffer of source code into a tokenized buffer.
  227. //
  228. // The provided source buffer must outlive any returned `TokenizedBuffer`
  229. // which will refer into the source.
  230. static auto Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
  231. -> TokenizedBuffer;
  232. [[nodiscard]] auto GetKind(Token token) const -> TokenKind;
  233. [[nodiscard]] auto GetLine(Token token) const -> Line;
  234. // Returns the 1-based line number.
  235. [[nodiscard]] auto GetLineNumber(Token token) const -> int;
  236. // Returns the 1-based column number.
  237. [[nodiscard]] auto GetColumnNumber(Token token) const -> int;
  238. // Returns the source text lexed into this token.
  239. [[nodiscard]] auto GetTokenText(Token token) const -> llvm::StringRef;
  240. // Returns the identifier associated with this token. The token kind must be
  241. // an `Identifier`.
  242. [[nodiscard]] auto GetIdentifier(Token token) const -> Identifier;
  243. // Returns the value of an `IntegerLiteral()` token.
  244. [[nodiscard]] auto GetIntegerLiteral(Token token) const -> const llvm::APInt&;
  245. // Returns the value of an `RealLiteral()` token.
  246. [[nodiscard]] auto GetRealLiteral(Token token) const -> RealLiteralValue;
  247. // Returns the value of a `StringLiteral()` token.
  248. [[nodiscard]] auto GetStringLiteral(Token token) const -> llvm::StringRef;
  249. // Returns the size specified in a `*TypeLiteral()` token.
  250. [[nodiscard]] auto GetTypeLiteralSize(Token token) const
  251. -> const llvm::APInt&;
  252. // Returns the closing token matched with the given opening token.
  253. //
  254. // The given token must be an opening token kind.
  255. [[nodiscard]] auto GetMatchedClosingToken(Token opening_token) const -> Token;
  256. // Returns the opening token matched with the given closing token.
  257. //
  258. // The given token must be a closing token kind.
  259. [[nodiscard]] auto GetMatchedOpeningToken(Token closing_token) const -> Token;
  260. // Returns whether the given token has leading whitespace.
  261. [[nodiscard]] auto HasLeadingWhitespace(Token token) const -> bool;
  262. // Returns whether the given token has trailing whitespace.
  263. [[nodiscard]] auto HasTrailingWhitespace(Token token) const -> bool;
  264. // Returns whether the token was created as part of an error recovery effort.
  265. //
  266. // For example, a closing paren inserted to match an unmatched paren.
  267. [[nodiscard]] auto IsRecoveryToken(Token token) const -> bool;
  268. // Returns the 1-based line number.
  269. [[nodiscard]] auto GetLineNumber(Line line) const -> int;
  270. // Returns the 1-based indentation column number.
  271. [[nodiscard]] auto GetIndentColumnNumber(Line line) const -> int;
  272. // Returns the text for an identifier.
  273. [[nodiscard]] auto GetIdentifierText(Identifier id) const -> llvm::StringRef;
  274. // Prints a description of the tokenized stream to the provided `raw_ostream`.
  275. //
  276. // It prints one line of information for each token in the buffer, including
  277. // the kind of token, where it occurs within the source file, indentation for
  278. // the associated line, the spelling of the token in source, and any
  279. // additional information tracked such as which unique identifier it is or any
  280. // matched grouping token.
  281. //
  282. // Each line is formatted as a YAML record:
  283. //
  284. // clang-format off
  285. // ```
  286. // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  287. // ```
  288. // clang-format on
  289. //
  290. // This can be parsed as YAML using tools like `python-yq` combined with `jq`
  291. // on the command line. The format is also reasonably amenable to other
  292. // line-oriented shell tools from `grep` to `awk`.
  293. auto Print(llvm::raw_ostream& output_stream) const -> void;
  294. // Prints a description of a single token. See `print` for details on the
  295. // format.
  296. auto PrintToken(llvm::raw_ostream& output_stream, Token token) const -> void;
  297. // Returns true if the buffer has errors that are detectable at lexing time.
  298. [[nodiscard]] auto has_errors() const -> bool { return has_errors_; }
  299. [[nodiscard]] auto tokens() const -> llvm::iterator_range<TokenIterator> {
  300. return llvm::make_range(TokenIterator(Token(0)),
  301. TokenIterator(Token(token_infos_.size())));
  302. }
  303. [[nodiscard]] auto size() const -> int { return token_infos_.size(); }
  304. private:
  305. // Implementation detail struct implementing the actual lexer logic.
  306. class Lexer;
  307. friend Lexer;
  308. // A diagnostic location translator that maps token locations into source
  309. // buffer locations.
  310. class SourceBufferLocationTranslator
  311. : public DiagnosticLocationTranslator<const char*> {
  312. public:
  313. explicit SourceBufferLocationTranslator(TokenizedBuffer& buffer,
  314. int* last_line_lexed_to_column)
  315. : buffer_(&buffer),
  316. last_line_lexed_to_column_(last_line_lexed_to_column) {}
  317. // Map the given position within the source buffer into a diagnostic
  318. // location.
  319. auto GetLocation(const char* loc) -> DiagnosticLocation override;
  320. private:
  321. TokenizedBuffer* buffer_;
  322. // The last lexed column, for determining whether the last line should be
  323. // checked for unlexed newlines. May be null after lexing is complete.
  324. int* last_line_lexed_to_column_;
  325. };
  326. // Specifies minimum widths to use when printing a token's fields via
  327. // `printToken`.
  328. struct PrintWidths {
  329. // Widens `this` to the maximum of `this` and `new_width` for each
  330. // dimension.
  331. auto Widen(const PrintWidths& widths) -> void;
  332. int index;
  333. int kind;
  334. int column;
  335. int line;
  336. int indent;
  337. };
  338. struct TokenInfo {
  339. TokenKind kind;
  340. // Whether the token has trailing whitespace.
  341. bool has_trailing_space = false;
  342. // Whether the token was injected artificially during error recovery.
  343. bool is_recovery = false;
  344. // Line on which the Token starts.
  345. Line token_line;
  346. // Zero-based byte offset of the token within its line.
  347. int32_t column;
  348. // We may have up to 32 bits of payload, based on the kind of token.
  349. union {
  350. static_assert(
  351. sizeof(Token) <= sizeof(int32_t),
  352. "Unable to pack token and identifier index into the same space!");
  353. Identifier id;
  354. int32_t literal_index;
  355. Token closing_token;
  356. Token opening_token;
  357. int32_t error_length;
  358. };
  359. };
  360. struct LineInfo {
  361. // Zero-based byte offset of the start of the line within the source buffer
  362. // provided.
  363. int64_t start;
  364. // The byte length of the line. Does not include the newline character (or a
  365. // null terminator or EOF).
  366. int32_t length;
  367. // The byte offset from the start of the line of the first non-whitespace
  368. // character.
  369. int32_t indent;
  370. };
  371. struct IdentifierInfo {
  372. llvm::StringRef text;
  373. };
  374. // The constructor is merely responsible for trivial initialization of
  375. // members. A working object of this type is built with the `lex` function
  376. // above so that its return can indicate if an error was encountered while
  377. // lexing.
  378. explicit TokenizedBuffer(SourceBuffer& source) : source_(&source) {}
  379. auto GetLineInfo(Line line) -> LineInfo&;
  380. [[nodiscard]] auto GetLineInfo(Line line) const -> const LineInfo&;
  381. auto AddLine(LineInfo info) -> Line;
  382. auto GetTokenInfo(Token token) -> TokenInfo&;
  383. [[nodiscard]] auto GetTokenInfo(Token token) const -> const TokenInfo&;
  384. auto AddToken(TokenInfo info) -> Token;
  385. [[nodiscard]] auto GetTokenPrintWidths(Token token) const -> PrintWidths;
  386. auto PrintToken(llvm::raw_ostream& output_stream, Token token,
  387. PrintWidths widths) const -> void;
  388. SourceBuffer* source_;
  389. llvm::SmallVector<TokenInfo, 16> token_infos_;
  390. llvm::SmallVector<LineInfo, 16> line_infos_;
  391. llvm::SmallVector<IdentifierInfo, 16> identifier_infos_;
  392. // Storage for integers that form part of the value of a numeric or type
  393. // literal.
  394. llvm::SmallVector<llvm::APInt, 16> literal_int_storage_;
  395. llvm::SmallVector<std::string, 16> literal_string_storage_;
  396. llvm::DenseMap<llvm::StringRef, Identifier> identifier_map_;
  397. bool has_errors_ = false;
  398. };
// A diagnostic emitter that uses positions within a source buffer's text as
// its source of location information.
using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;

// A diagnostic emitter that uses tokens as its source of location information.
using TokenDiagnosticEmitter = DiagnosticEmitter<TokenizedBuffer::Token>;
  404. } // namespace Carbon
  405. #endif // CARBON_TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_