// tokenized_buffer.h
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_
  5. #define TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_
  6. #include <cstdint>
  7. #include <iterator>
  8. #include "llvm/ADT/APInt.h"
  9. #include "llvm/ADT/DenseMap.h"
  10. #include "llvm/ADT/Optional.h"
  11. #include "llvm/ADT/SmallVector.h"
  12. #include "llvm/ADT/StringRef.h"
  13. #include "llvm/ADT/iterator.h"
  14. #include "llvm/ADT/iterator_range.h"
  15. #include "toolchain/diagnostics/diagnostic_emitter.h"
  16. #include "toolchain/lexer/token_kind.h"
  17. #include "toolchain/source/source_buffer.h"
  18. namespace Carbon {
  19. class TokenizedBuffer;
  20. namespace Internal {
  21. // A lightweight handle to a lexed token in a `TokenizedBuffer`.
  22. //
  23. // This type's preferred name is `TokenizedBuffer::Token` and is only defined
  24. // outside the class to break a dependency cycle.
  25. //
  26. // `Token` objects are designed to be passed by value, not reference or
  27. // pointer. They are also designed to be small and efficient to store in data
  28. // structures.
  29. //
  30. // `Token` objects from the same `TokenizedBuffer` can be compared with each
  31. // other, both for being the same token within the buffer, and to establish
  32. // relative position within the token stream that has been lexed out of the
  33. // buffer. `Token` objects from different `TokenizedBuffer`s cannot be
  34. // meaningfully compared.
  35. //
  36. // All other APIs to query a `Token` are on the `TokenizedBuffer`.
  37. class TokenizedBufferToken {
  38. public:
  39. using Token = TokenizedBufferToken;
  40. TokenizedBufferToken() = default;
  41. friend auto operator==(Token lhs, Token rhs) -> bool {
  42. return lhs.index == rhs.index;
  43. }
  44. friend auto operator!=(Token lhs, Token rhs) -> bool {
  45. return lhs.index != rhs.index;
  46. }
  47. friend auto operator<(Token lhs, Token rhs) -> bool {
  48. return lhs.index < rhs.index;
  49. }
  50. friend auto operator<=(Token lhs, Token rhs) -> bool {
  51. return lhs.index <= rhs.index;
  52. }
  53. friend auto operator>(Token lhs, Token rhs) -> bool {
  54. return lhs.index > rhs.index;
  55. }
  56. friend auto operator>=(Token lhs, Token rhs) -> bool {
  57. return lhs.index >= rhs.index;
  58. }
  59. private:
  60. friend TokenizedBuffer;
  61. explicit TokenizedBufferToken(int index) : index(index) {}
  62. int32_t index;
  63. };
  64. } // namespace Internal
  65. // A buffer of tokenized Carbon source code.
  66. //
  67. // This is constructed by lexing the source code text into a series of tokens.
  68. // The buffer provides lightweight handles to tokens and other lexed entities,
  69. // as well as iterations to walk the sequence of tokens found in the buffer.
  70. //
  71. // Lexing errors result in a potentially incomplete sequence of tokens and
// `HasErrors` returning true.
class TokenizedBuffer {
 public:
  // A lightweight handle to a lexed token in a `TokenizedBuffer`.
  using Token = Internal::TokenizedBufferToken;

  // A lightweight handle to a lexed line in a `TokenizedBuffer`.
  //
  // `Line` objects are designed to be passed by value, not reference or
  // pointer. They are also designed to be small and efficient to store in data
  // structures.
  //
  // Each `Line` object refers to a specific line in the source code that was
  // lexed. They can be compared directly to establish that they refer to the
  // same line or the relative position of different lines within the source.
  //
  // All other APIs to query a `Line` are on the `TokenizedBuffer`.
  class Line {
   public:
    Line() = default;

    // Lines compare by their position within the source buffer.
    friend auto operator==(Line lhs, Line rhs) -> bool {
      return lhs.index == rhs.index;
    }
    friend auto operator!=(Line lhs, Line rhs) -> bool {
      return lhs.index != rhs.index;
    }
    friend auto operator<(Line lhs, Line rhs) -> bool {
      return lhs.index < rhs.index;
    }
    friend auto operator<=(Line lhs, Line rhs) -> bool {
      return lhs.index <= rhs.index;
    }
    friend auto operator>(Line lhs, Line rhs) -> bool {
      return lhs.index > rhs.index;
    }
    friend auto operator>=(Line lhs, Line rhs) -> bool {
      return lhs.index >= rhs.index;
    }

   private:
    friend class TokenizedBuffer;

    explicit Line(int index) : index(index) {}

    // Index of the line within the buffer's line list.
    int32_t index;
  };

  // A lightweight handle to a lexed identifier in a `TokenizedBuffer`.
  //
  // `Identifier` objects are designed to be passed by value, not reference or
  // pointer. They are also designed to be small and efficient to store in data
  // structures.
  //
  // Each identifier lexed is canonicalized to a single entry in the identifier
  // table. `Identifier` objects will compare equal if they refer to the same
  // identifier spelling. Where the identifier was written is not preserved.
  //
  // All other APIs to query an `Identifier` are on the `TokenizedBuffer`.
  class Identifier {
   public:
    Identifier() = default;

    // Most normal APIs are provided by the `TokenizedBuffer`, we just support
    // basic comparison operations.
    friend auto operator==(Identifier lhs, Identifier rhs) -> bool {
      return lhs.index == rhs.index;
    }
    friend auto operator!=(Identifier lhs, Identifier rhs) -> bool {
      return lhs.index != rhs.index;
    }

   private:
    friend class TokenizedBuffer;

    explicit Identifier(int index) : index(index) {}

    // Index into the buffer's canonical identifier table.
    int32_t index;
  };

  // Random-access iterator over tokens within the buffer.
  class TokenIterator
      : public llvm::iterator_facade_base<
            TokenIterator, std::random_access_iterator_tag, const Token, int> {
   public:
    TokenIterator() = default;

    explicit TokenIterator(Token token) : token(token) {}

    auto operator==(const TokenIterator& rhs) const -> bool {
      return token == rhs.token;
    }
    auto operator<(const TokenIterator& rhs) const -> bool {
      return token < rhs.token;
    }

    auto operator*() const -> const Token& { return token; }

    // Re-expose the facade's advance `operator-`; the overload below would
    // otherwise hide it and leave only iterator difference available.
    using iterator_facade_base::operator-;
    auto operator-(const TokenIterator& rhs) const -> int {
      return token.index - rhs.token.index;
    }

    auto operator+=(int n) -> TokenIterator& {
      token.index += n;
      return *this;
    }
    auto operator-=(int n) -> TokenIterator& {
      token.index -= n;
      return *this;
    }

   private:
    friend class TokenizedBuffer;

    Token token;
  };

  // The value of a real literal.
  //
  // This is either a dyadic fraction (mantissa * 2^exponent) or a decadic
  // fraction (mantissa * 10^exponent).
  //
  // The `TokenizedBuffer` must outlive any `RealLiteralValue`s referring to
  // its tokens.
  class RealLiteralValue {
    // Non-owning pointer back into the buffer's literal storage.
    const TokenizedBuffer* buffer;
    // Index of the mantissa in `literal_int_storage`; the exponent is stored
    // in the immediately following slot.
    int32_t literal_index;
    bool is_decimal;

   public:
    // The mantissa, represented as an unsigned integer.
    [[nodiscard]] auto Mantissa() const -> const llvm::APInt& {
      return buffer->literal_int_storage[literal_index];
    }

    // The exponent, represented as a signed integer.
    [[nodiscard]] auto Exponent() const -> const llvm::APInt& {
      return buffer->literal_int_storage[literal_index + 1];
    }

    // If false, the value is mantissa * 2^exponent.
    // If true, the value is mantissa * 10^exponent.
    [[nodiscard]] auto IsDecimal() const -> bool { return is_decimal; }

   private:
    friend class TokenizedBuffer;

    RealLiteralValue(const TokenizedBuffer* buffer, int32_t literal_index,
                     bool is_decimal)
        : buffer(buffer),
          literal_index(literal_index),
          is_decimal(is_decimal) {}
  };

  // A diagnostic location translator that maps token locations into source
  // buffer locations.
  class TokenLocationTranslator
      : public DiagnosticLocationTranslator<Internal::TokenizedBufferToken> {
   public:
    explicit TokenLocationTranslator(TokenizedBuffer& buffer)
        : buffer_(&buffer) {}

    // Map the given token into a diagnostic location.
    auto GetLocation(Token token) -> Diagnostic::Location override;

   private:
    TokenizedBuffer* buffer_;
  };

  // Lexes a buffer of source code into a tokenized buffer.
  //
  // The provided source buffer must outlive any returned `TokenizedBuffer`
  // which will refer into the source.
  static auto Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
      -> TokenizedBuffer;

  // Returns true if the buffer has errors that are detectable at lexing time.
  [[nodiscard]] auto HasErrors() const -> bool { return has_errors; }

  // Returns an iterator range over all tokens in the buffer, in lexed order.
  [[nodiscard]] auto Tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(Token(0)),
                            TokenIterator(Token(token_infos.size())));
  }

  // Returns the number of tokens in the buffer.
  [[nodiscard]] auto Size() const -> int { return token_infos.size(); }

  // Returns the kind of the given token.
  [[nodiscard]] auto GetKind(Token token) const -> TokenKind;
  // Returns the line on which the given token starts.
  [[nodiscard]] auto GetLine(Token token) const -> Line;

  // Returns the 1-based line number.
  [[nodiscard]] auto GetLineNumber(Token token) const -> int;

  // Returns the 1-based column number.
  [[nodiscard]] auto GetColumnNumber(Token token) const -> int;

  // Returns the source text lexed into this token.
  [[nodiscard]] auto GetTokenText(Token token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must be
  // an `Identifier`.
  [[nodiscard]] auto GetIdentifier(Token token) const -> Identifier;

  // Returns the value of an `IntegerLiteral()` token.
  [[nodiscard]] auto GetIntegerLiteral(Token token) const -> const llvm::APInt&;

  // Returns the value of a `RealLiteral()` token.
  [[nodiscard]] auto GetRealLiteral(Token token) const -> RealLiteralValue;

  // Returns the value of a `StringLiteral()` token.
  [[nodiscard]] auto GetStringLiteral(Token token) const -> llvm::StringRef;

  // Returns the size specified in a `*TypeLiteral()` token.
  [[nodiscard]] auto GetTypeLiteralSize(Token token) const
      -> const llvm::APInt&;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  [[nodiscard]] auto GetMatchedClosingToken(Token opening_token) const -> Token;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  [[nodiscard]] auto GetMatchedOpeningToken(Token closing_token) const -> Token;

  // Returns whether the given token has leading whitespace.
  [[nodiscard]] auto HasLeadingWhitespace(Token token) const -> bool;
  // Returns whether the given token has trailing whitespace.
  [[nodiscard]] auto HasTrailingWhitespace(Token token) const -> bool;

  // Returns whether the token was created as part of an error recovery effort.
  //
  // For example, a closing paren inserted to match an unmatched paren.
  [[nodiscard]] auto IsRecoveryToken(Token token) const -> bool;

  // Returns the 1-based line number.
  [[nodiscard]] auto GetLineNumber(Line line) const -> int;

  // Returns the 1-based indentation column number.
  [[nodiscard]] auto GetIndentColumnNumber(Line line) const -> int;

  // Returns the text for an identifier.
  [[nodiscard]] auto GetIdentifierText(Identifier id) const -> llvm::StringRef;

  // Prints a description of the tokenized stream to the provided `raw_ostream`.
  //
  // It prints one line of information for each token in the buffer, including
  // the kind of token, where it occurs within the source file, indentation for
  // the associated line, the spelling of the token in source, and any
  // additional information tracked such as which unique identifier it is or any
  // matched grouping token.
  //
  // Each line is formatted as a YAML record:
  //
  // clang-format off
  // ```
  // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  // ```
  // clang-format on
  //
  // This can be parsed as YAML using tools like `python-yq` combined with `jq`
  // on the command line. The format is also reasonably amenable to other
  // line-oriented shell tools from `grep` to `awk`.
  auto Print(llvm::raw_ostream& output_stream) const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, Token token) const -> void;

 private:
  // Implementation detail struct implementing the actual lexer logic.
  class Lexer;
  friend Lexer;

  // A diagnostic location translator that maps raw character positions within
  // the source buffer into source buffer locations.
  class SourceBufferLocationTranslator
      : public DiagnosticLocationTranslator<const char*> {
   public:
    explicit SourceBufferLocationTranslator(TokenizedBuffer& buffer)
        : buffer_(&buffer) {}

    // Map the given position within the source buffer into a diagnostic
    // location.
    auto GetLocation(const char* pos) -> Diagnostic::Location override;

   private:
    TokenizedBuffer* buffer_;
  };

  // Specifies minimum widths to use when printing a token's fields via
  // `PrintToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `new_width` for each
    // dimension.
    auto Widen(const PrintWidths& new_width) -> void;

    int index;
    int kind;
    int column;
    int line;
    int indent;
  };

  // Per-token storage. Kept small; the payload union below shares 32 bits
  // across the kind-specific data.
  struct TokenInfo {
    TokenKind kind;

    // Whether the token has trailing whitespace.
    bool has_trailing_space = false;

    // Whether the token was injected artificially during error recovery.
    bool is_recovery = false;

    // Line on which the Token starts.
    Line token_line;

    // Zero-based byte offset of the token within its line.
    int32_t column;

    // We may have up to 32 bits of payload, based on the kind of token.
    union {
      static_assert(
          sizeof(Token) <= sizeof(int32_t),
          "Unable to pack token and identifier index into the same space!");

      Identifier id;
      int32_t literal_index;
      Token closing_token;
      Token opening_token;
      int32_t error_length;
    };
  };

  struct LineInfo {
    // Zero-based byte offset of the start of the line within the source buffer
    // provided.
    int64_t start;

    // The byte length of the line. Does not include the newline character (or a
    // null terminator or EOF).
    int32_t length;

    // The byte offset from the start of the line of the first non-whitespace
    // character.
    int32_t indent;
  };

  struct IdentifierInfo {
    llvm::StringRef text;
  };

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with the `Lex` function
  // above so that its return can indicate if an error was encountered while
  // lexing.
  explicit TokenizedBuffer(SourceBuffer& source) : source(&source) {}

  // Accessors for the stored per-line and per-token info records.
  auto GetLineInfo(Line line) -> LineInfo&;
  [[nodiscard]] auto GetLineInfo(Line line) const -> const LineInfo&;
  // Appends a line record and returns a handle to it.
  auto AddLine(LineInfo info) -> Line;
  auto GetTokenInfo(Token token) -> TokenInfo&;
  [[nodiscard]] auto GetTokenInfo(Token token) const -> const TokenInfo&;
  // Appends a token record and returns a handle to it.
  auto AddToken(TokenInfo info) -> Token;
  // Computes the field widths needed to print the given token.
  [[nodiscard]] auto GetTokenPrintWidths(Token token) const -> PrintWidths;
  auto PrintToken(llvm::raw_ostream& output_stream, Token token,
                  PrintWidths widths) const -> void;

  // Non-owning pointer to the source this buffer was lexed from; the source
  // must outlive this buffer.
  SourceBuffer* source;

  llvm::SmallVector<TokenInfo, 16> token_infos;

  llvm::SmallVector<LineInfo, 16> line_infos;

  llvm::SmallVector<IdentifierInfo, 16> identifier_infos;

  // Storage for integers that form part of the value of a numeric or type
  // literal.
  llvm::SmallVector<llvm::APInt, 16> literal_int_storage;

  llvm::SmallVector<std::string, 16> literal_string_storage;

  // Maps identifier spellings to their canonical `Identifier` handles.
  llvm::DenseMap<llvm::StringRef, Identifier> identifier_map;

  bool has_errors = false;
};
// A diagnostic emitter that uses positions within a source buffer's text as
// its source of location information.
using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;

// A diagnostic emitter that uses tokens as its source of location information.
using TokenDiagnosticEmitter = DiagnosticEmitter<TokenizedBuffer::Token>;
  387. } // namespace Carbon
  388. #endif // TOOLCHAIN_LEXER_TOKENIZED_BUFFER_H_