// tokenized_buffer.h
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
  5. #define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
#include <cstdint>
#include <iterator>
#include <memory>
#include <string>

#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/value_store.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/source/source_buffer.h"
  21. namespace Carbon::Lex {
  22. class TokenizedBuffer;
// A lightweight handle to a lexed token in a `TokenizedBuffer`.
//
// `Token` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// `Token` objects from the same `TokenizedBuffer` can be compared with each
// other, both for being the same token within the buffer, and to establish
// relative position within the token stream that has been lexed out of the
// buffer. `Token` objects from different `TokenizedBuffer`s cannot be
// meaningfully compared.
//
// All other APIs to query a `Token` are on the `TokenizedBuffer`.
struct Token : public ComparableIndexBase {
  // A sentinel handle that doesn't refer to any token; constructed from
  // `ComparableIndexBase::InvalidIndex` below.
  static const Token Invalid;

  // Comments aren't tokenized, so this is the first token after StartOfFile.
  static const Token FirstNonCommentToken;

  // Inherit the index-based constructors from the base.
  using ComparableIndexBase::ComparableIndexBase;
};

// Out-of-line definitions of the static token handles declared above.
constexpr Token Token::Invalid(Token::InvalidIndex);
constexpr Token Token::FirstNonCommentToken(1);
// A lightweight handle to a lexed line in a `TokenizedBuffer`.
//
// `Line` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// Each `Line` object refers to a specific line in the source code that was
// lexed. They can be compared directly to establish that they refer to the
// same line or the relative position of different lines within the source.
//
// All other APIs to query a `Line` are on the `TokenizedBuffer`.
struct Line : public ComparableIndexBase {
  // A sentinel handle that doesn't refer to any line; constructed from
  // `ComparableIndexBase::InvalidIndex` below.
  static const Line Invalid;

  // Inherit the index-based constructors from the base.
  using ComparableIndexBase::ComparableIndexBase;
};

// Out-of-line definition of the static line handle declared above.
constexpr Line Line::Invalid(Line::InvalidIndex);
  60. // Random-access iterator over tokens within the buffer.
  61. class TokenIterator
  62. : public llvm::iterator_facade_base<
  63. TokenIterator, std::random_access_iterator_tag, const Token, int>,
  64. public Printable<TokenIterator> {
  65. public:
  66. TokenIterator() = delete;
  67. explicit TokenIterator(Token token) : token_(token) {}
  68. auto operator==(const TokenIterator& rhs) const -> bool {
  69. return token_ == rhs.token_;
  70. }
  71. auto operator<(const TokenIterator& rhs) const -> bool {
  72. return token_ < rhs.token_;
  73. }
  74. auto operator*() const -> const Token& { return token_; }
  75. using iterator_facade_base::operator-;
  76. auto operator-(const TokenIterator& rhs) const -> int {
  77. return token_.index - rhs.token_.index;
  78. }
  79. auto operator+=(int n) -> TokenIterator& {
  80. token_.index += n;
  81. return *this;
  82. }
  83. auto operator-=(int n) -> TokenIterator& {
  84. token_.index -= n;
  85. return *this;
  86. }
  87. // Prints the raw token index.
  88. auto Print(llvm::raw_ostream& output) const -> void;
  89. private:
  90. friend class TokenizedBuffer;
  91. Token token_;
  92. };
// A diagnostic location translator that maps token locations into source
// buffer locations.
class TokenLocationTranslator : public DiagnosticLocationTranslator<Token> {
 public:
  explicit TokenLocationTranslator(const TokenizedBuffer* buffer)
      : buffer_(buffer) {}

  // Map the given token into a diagnostic location.
  auto GetLocation(Token token) -> DiagnosticLocation override;

 private:
  // The buffer used to resolve token locations (raw, non-owning pointer).
  const TokenizedBuffer* buffer_;
};
// A buffer of tokenized Carbon source code.
//
// This is constructed by lexing the source code text into a series of tokens.
// The buffer provides lightweight handles to tokens and other lexed entities,
// as well as iterations to walk the sequence of tokens found in the buffer.
//
// Lexing errors result in a potentially incomplete sequence of tokens and
// `HasError` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // Returns the kind of the given token.
  [[nodiscard]] auto GetKind(Token token) const -> TokenKind;

  // Returns the line handle for the line on which the token appears.
  [[nodiscard]] auto GetLine(Token token) const -> Line;

  // Returns the 1-based line number.
  [[nodiscard]] auto GetLineNumber(Token token) const -> int;

  // Returns the 1-based column number.
  [[nodiscard]] auto GetColumnNumber(Token token) const -> int;

  // Returns the source text lexed into this token.
  [[nodiscard]] auto GetTokenText(Token token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must be
  // an `Identifier`.
  [[nodiscard]] auto GetIdentifier(Token token) const -> IdentifierId;

  // Returns the value of an `IntegerLiteral()` token.
  [[nodiscard]] auto GetIntegerLiteral(Token token) const -> IntegerId;

  // Returns the value of an `RealLiteral()` token.
  [[nodiscard]] auto GetRealLiteral(Token token) const -> RealId;

  // Returns the value of a `StringLiteral()` token.
  [[nodiscard]] auto GetStringLiteral(Token token) const -> StringLiteralId;

  // Returns the size specified in a `*TypeLiteral()` token.
  [[nodiscard]] auto GetTypeLiteralSize(Token token) const
      -> const llvm::APInt&;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  [[nodiscard]] auto GetMatchedClosingToken(Token opening_token) const -> Token;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  [[nodiscard]] auto GetMatchedOpeningToken(Token closing_token) const -> Token;

  // Returns whether the given token has leading whitespace.
  [[nodiscard]] auto HasLeadingWhitespace(Token token) const -> bool;
  // Returns whether the given token has trailing whitespace.
  [[nodiscard]] auto HasTrailingWhitespace(Token token) const -> bool;

  // Returns whether the token was created as part of an error recovery effort.
  //
  // For example, a closing paren inserted to match an unmatched paren.
  [[nodiscard]] auto IsRecoveryToken(Token token) const -> bool;

  // Returns the 1-based line number.
  [[nodiscard]] auto GetLineNumber(Line line) const -> int;

  // Returns the 1-based indentation column number.
  [[nodiscard]] auto GetIndentColumnNumber(Line line) const -> int;

  // Returns the next line handle.
  [[nodiscard]] auto GetNextLine(Line line) const -> Line;

  // Returns the previous line handle.
  [[nodiscard]] auto GetPrevLine(Line line) const -> Line;

  // Prints a description of the tokenized stream to the provided `raw_ostream`.
  //
  // It prints one line of information for each token in the buffer, including
  // the kind of token, where it occurs within the source file, indentation for
  // the associated line, the spelling of the token in source, and any
  // additional information tracked such as which unique identifier it is or any
  // matched grouping token.
  //
  // Each line is formatted as a YAML record:
  //
  // clang-format off
  // ```
  // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  // ```
  // clang-format on
  //
  // This can be parsed as YAML using tools like `python-yq` combined with `jq`
  // on the command line. The format is also reasonably amenable to other
  // line-oriented shell tools from `grep` to `awk`.
  auto Print(llvm::raw_ostream& output_stream) const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, Token token) const -> void;

  // Returns true if the buffer has errors that were detected at lexing time.
  [[nodiscard]] auto has_errors() const -> bool { return has_errors_; }

  // Returns an iterator range covering every token in the buffer, in lex
  // order.
  [[nodiscard]] auto tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(Token(0)),
                            TokenIterator(Token(token_infos_.size())));
  }

  // Returns the number of tokens in the buffer.
  [[nodiscard]] auto size() const -> int { return token_infos_.size(); }

  // Returns the number of parse tree nodes expected to be created for the
  // tokens in this buffer.
  [[nodiscard]] auto expected_parse_tree_size() const -> int {
    return expected_parse_tree_size_;
  }

  // Returns the source buffer this was lexed from.
  auto source() const -> const SourceBuffer& { return *source_; }

 private:
  friend class Lexer;
  friend class TokenLocationTranslator;

  // A diagnostic location translator that maps positions within the source
  // buffer's text into diagnostic locations.
  class SourceBufferLocationTranslator
      : public DiagnosticLocationTranslator<const char*> {
   public:
    explicit SourceBufferLocationTranslator(const TokenizedBuffer* buffer)
        : buffer_(buffer) {}

    // Map the given position within the source buffer into a diagnostic
    // location.
    auto GetLocation(const char* loc) -> DiagnosticLocation override;

   private:
    // The buffer used to resolve positions (raw, non-owning pointer).
    const TokenizedBuffer* buffer_;
  };

  // Specifies minimum widths to use when printing a token's fields via
  // `printToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `widths` for each
    // dimension.
    auto Widen(const PrintWidths& widths) -> void;

    int index;
    int kind;
    int line;
    int column;
    int indent;
  };

  // The information tracked for each lexed token.
  struct TokenInfo {
    TokenKind kind;

    // Whether the token has trailing whitespace.
    bool has_trailing_space = false;

    // Whether the token was injected artificially during error recovery.
    bool is_recovery = false;

    // Line on which the Token starts.
    Line token_line;

    // Zero-based byte offset of the token within its line.
    int32_t column;

    // We may have up to 32 bits of payload, based on the kind of token.
    union {
      static_assert(
          sizeof(Token) <= sizeof(int32_t),
          "Unable to pack token and identifier index into the same space!");

      IdentifierId ident_id = IdentifierId::Invalid;
      StringLiteralId string_literal_id;
      IntegerId integer_id;
      RealId real_id;
      Token closing_token;
      Token opening_token;
      int32_t error_length;
    };
  };

  // The information tracked for each line of the source text.
  struct LineInfo {
    // The length will always be assigned later. Indent may be assigned if
    // non-zero.
    explicit LineInfo(int64_t start)
        : start(start),
          length(static_cast<int32_t>(llvm::StringRef::npos)),
          indent(0) {}
    explicit LineInfo(int64_t start, int32_t length)
        : start(start), length(length), indent(0) {}

    // Zero-based byte offset of the start of the line within the source buffer
    // provided.
    int64_t start;

    // The byte length of the line. Does not include the newline character (or a
    // nul-terminator or EOF).
    int32_t length;

    // The byte offset from the start of the line of the first non-whitespace
    // character.
    int32_t indent;
  };

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with `Lex::Lex` so that its
  // return can indicate if an error was encountered while lexing.
  explicit TokenizedBuffer(SharedValueStores& value_stores,
                           SourceBuffer& source)
      : value_stores_(&value_stores), source_(&source) {}

  // Looks up the mutable info for the given line.
  auto GetLineInfo(Line line) -> LineInfo&;
  // Looks up the info for the given line.
  [[nodiscard]] auto GetLineInfo(Line line) const -> const LineInfo&;
  // Adds info for a new line, returning its handle.
  auto AddLine(LineInfo info) -> Line;
  // Looks up the mutable info for the given token.
  auto GetTokenInfo(Token token) -> TokenInfo&;
  // Looks up the info for the given token.
  [[nodiscard]] auto GetTokenInfo(Token token) const -> const TokenInfo&;
  // Adds info for a new token, returning its handle.
  auto AddToken(TokenInfo info) -> Token;

  // Computes the print widths for the fields of the given token.
  [[nodiscard]] auto GetTokenPrintWidths(Token token) const -> PrintWidths;
  // Prints a single token using the provided field widths.
  auto PrintToken(llvm::raw_ostream& output_stream, Token token,
                  PrintWidths widths) const -> void;

  // Used to allocate computed string literals.
  llvm::BumpPtrAllocator allocator_;

  // Shared value stores for canonicalized values (raw, non-owning pointers).
  SharedValueStores* value_stores_;
  SourceBuffer* source_;

  // Per-token and per-line information, indexed by the `Token` and `Line`
  // handles respectively.
  llvm::SmallVector<TokenInfo> token_infos_;
  llvm::SmallVector<LineInfo> line_infos_;

  // Stores the computed value of string literals so that StringRefs are
  // durable.
  llvm::SmallVector<std::unique_ptr<std::string>> computed_strings_;

  // The number of parse tree nodes that we expect to be created for the tokens
  // in this buffer.
  int expected_parse_tree_size_ = 0;

  bool has_errors_ = false;
};
// A diagnostic emitter that uses positions within a source buffer's text as
// its source of location information.
using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;

// A diagnostic emitter that uses tokens as its source of location information.
using TokenDiagnosticEmitter = DiagnosticEmitter<Token>;
  297. } // namespace Carbon::Lex
  298. #endif // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_