tokenized_buffer.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
  5. #define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
  6. #include <compare>
  7. #include <cstdint>
  8. #include <iterator>
  9. #include "common/ostream.h"
  10. #include "llvm/ADT/APInt.h"
  11. #include "llvm/ADT/SmallVector.h"
  12. #include "llvm/ADT/StringRef.h"
  13. #include "llvm/ADT/iterator.h"
  14. #include "llvm/ADT/iterator_range.h"
  15. #include "llvm/Support/Allocator.h"
  16. #include "llvm/Support/raw_ostream.h"
  17. #include "toolchain/base/index_base.h"
  18. #include "toolchain/base/value_store.h"
  19. #include "toolchain/diagnostics/diagnostic_emitter.h"
  20. #include "toolchain/lex/token_kind.h"
  21. #include "toolchain/source/source_buffer.h"
  22. namespace Carbon::Lex {
  23. class TokenizedBuffer;
// A lightweight handle to a lexed token in a `TokenizedBuffer`.
//
// `TokenIndex` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// `TokenIndex` objects from the same `TokenizedBuffer` can be compared with
// each other, both for being the same token within the buffer, and to establish
// relative position within the token stream that has been lexed out of the
// buffer. `TokenIndex` objects from different `TokenizedBuffer`s cannot be
// meaningfully compared.
//
// All other APIs to query a `TokenIndex` are on the `TokenizedBuffer`.
struct TokenIndex : public IndexBase {
  // A sentinel handle that does not refer to any token in the buffer.
  static const TokenIndex Invalid;
  // Comments aren't tokenized, so this is the first token after FileStart.
  static const TokenIndex FirstNonCommentToken;

  using IndexBase::IndexBase;
};

constexpr TokenIndex TokenIndex::Invalid(TokenIndex::InvalidIndex);
constexpr TokenIndex TokenIndex::FirstNonCommentToken(1);
// A lightweight handle to a lexed line in a `TokenizedBuffer`.
//
// `LineIndex` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// Each `LineIndex` object refers to a specific line in the source code that was
// lexed. They can be compared directly to establish that they refer to the
// same line or the relative position of different lines within the source.
//
// All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
struct LineIndex : public IndexBase {
  // A sentinel handle that does not refer to any line in the buffer.
  static const LineIndex Invalid;

  using IndexBase::IndexBase;
};

constexpr LineIndex LineIndex::Invalid(LineIndex::InvalidIndex);
// Random-access iterator over tokens within the buffer.
//
// The facade base synthesizes the remaining random-access operations
// (`++`, `--`, `+`, `[]`, etc.) from the primitives defined here. The
// difference type is `int`, matching the underlying `TokenIndex::index`.
class TokenIterator
    : public llvm::iterator_facade_base<TokenIterator,
                                        std::random_access_iterator_tag,
                                        const TokenIndex, int>,
      public Printable<TokenIterator> {
 public:
  TokenIterator() = delete;

  explicit TokenIterator(TokenIndex token) : token_(token) {}

  auto operator==(const TokenIterator& rhs) const -> bool {
    return token_ == rhs.token_;
  }
  auto operator<=>(const TokenIterator& rhs) const -> std::strong_ordering {
    return token_ <=> rhs.token_;
  }

  auto operator*() const -> const TokenIndex& { return token_; }

  // The member `operator-` below would otherwise hide the facade's
  // iterator-minus-integer overloads; re-expose them here.
  using iterator_facade_base::operator-;
  auto operator-(const TokenIterator& rhs) const -> int {
    return token_.index - rhs.token_.index;
  }

  auto operator+=(int n) -> TokenIterator& {
    token_.index += n;
    return *this;
  }
  auto operator-=(int n) -> TokenIterator& {
    token_.index -= n;
    return *this;
  }

  // Prints the raw token index.
  auto Print(llvm::raw_ostream& output) const -> void;

 private:
  friend class TokenizedBuffer;

  TokenIndex token_;
};
// A diagnostic location converter that maps token locations into source
// buffer locations.
class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
 public:
  explicit TokenDiagnosticConverter(const TokenizedBuffer* buffer)
      : buffer_(buffer) {}

  // Map the given token into a diagnostic location.
  auto ConvertLocation(TokenIndex token, ContextFnT context_fn) const
      -> DiagnosticLocation override;

 private:
  // Non-owning pointer to the buffer whose tokens are being converted.
  const TokenizedBuffer* buffer_;
};
// A buffer of tokenized Carbon source code.
//
// This is constructed by lexing the source code text into a series of tokens.
// The buffer provides lightweight handles to tokens and other lexed entities,
// as well as iterations to walk the sequence of tokens found in the buffer.
//
// Lexing errors result in a potentially incomplete sequence of tokens and
// `HasError` returning true.
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // Returns the kind of the given token.
  auto GetKind(TokenIndex token) const -> TokenKind;

  // Returns the line on which the given token starts.
  auto GetLine(TokenIndex token) const -> LineIndex;

  // Returns the 1-based line number.
  auto GetLineNumber(TokenIndex token) const -> int;
  // Returns the 1-based column number.
  auto GetColumnNumber(TokenIndex token) const -> int;
  // Returns the line and 1-based column number of the first character after
  // this token.
  auto GetEndLocation(TokenIndex token) const -> std::pair<LineIndex, int>;

  // Returns the source text lexed into this token.
  auto GetTokenText(TokenIndex token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must be
  // an `Identifier`.
  auto GetIdentifier(TokenIndex token) const -> IdentifierId;

  // Returns the value of an `IntLiteral()` token.
  auto GetIntLiteral(TokenIndex token) const -> IntId;

  // Returns the value of an `RealLiteral()` token.
  auto GetRealLiteral(TokenIndex token) const -> RealId;

  // Returns the value of a `StringLiteral()` token.
  auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;

  // Returns the size specified in a `*TypeLiteral()` token.
  auto GetTypeLiteralSize(TokenIndex token) const -> const llvm::APInt&;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  auto GetMatchedClosingToken(TokenIndex opening_token) const -> TokenIndex;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  auto GetMatchedOpeningToken(TokenIndex closing_token) const -> TokenIndex;

  // Returns whether the given token has leading whitespace.
  auto HasLeadingWhitespace(TokenIndex token) const -> bool;
  // Returns whether the given token has trailing whitespace.
  auto HasTrailingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the token was created as part of an error recovery effort.
  //
  // For example, a closing paren inserted to match an unmatched paren.
  auto IsRecoveryToken(TokenIndex token) const -> bool;

  // Returns the 1-based line number.
  auto GetLineNumber(LineIndex line) const -> int;

  // Returns the 1-based indentation column number.
  auto GetIndentColumnNumber(LineIndex line) const -> int;

  // Returns the next line handle.
  auto GetNextLine(LineIndex line) const -> LineIndex;

  // Returns the previous line handle.
  auto GetPrevLine(LineIndex line) const -> LineIndex;

  // Prints a description of the tokenized stream to the provided `raw_ostream`.
  //
  // It prints one line of information for each token in the buffer, including
  // the kind of token, where it occurs within the source file, indentation for
  // the associated line, the spelling of the token in source, and any
  // additional information tracked such as which unique identifier it is or any
  // matched grouping token.
  //
  // Each line is formatted as a YAML record:
  //
  // clang-format off
  // ```
  // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  // ```
  // clang-format on
  //
  // This can be parsed as YAML using tools like `python-yq` combined with `jq`
  // on the command line. The format is also reasonably amenable to other
  // line-oriented shell tools from `grep` to `awk`.
  auto Print(llvm::raw_ostream& output_stream) const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
      -> void;

  // Returns true if the buffer has errors that were detected at lexing time.
  auto has_errors() const -> bool { return has_errors_; }

  // Returns an iterator range over every token lexed into the buffer.
  auto tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(TokenIndex(0)),
                            TokenIterator(TokenIndex(token_infos_.size())));
  }

  // Returns the number of tokens in the buffer.
  auto size() const -> int { return token_infos_.size(); }

  auto expected_parse_tree_size() const -> int {
    return expected_parse_tree_size_;
  }

  auto source() const -> const SourceBuffer& { return *source_; }

 private:
  friend class Lexer;
  friend class TokenDiagnosticConverter;

  // A diagnostic location converter that maps token locations into source
  // buffer locations.
  class SourceBufferDiagnosticConverter
      : public DiagnosticConverter<const char*> {
   public:
    explicit SourceBufferDiagnosticConverter(const TokenizedBuffer* buffer)
        : buffer_(buffer) {}

    // Map the given position within the source buffer into a diagnostic
    // location.
    auto ConvertLocation(const char* loc, ContextFnT context_fn) const
        -> DiagnosticLocation override;

   private:
    const TokenizedBuffer* buffer_;
  };

  // Specifies minimum widths to use when printing a token's fields via
  // `printToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `new_width` for each
    // dimension.
    auto Widen(const PrintWidths& widths) -> void;

    int index;
    int kind;
    int line;
    int column;
    int indent;
  };

  // Per-token lexed state, kept compact for cache efficiency.
  struct TokenInfo {
    TokenKind kind;

    // Whether the token has trailing whitespace.
    bool has_trailing_space = false;

    // Whether the token was injected artificially during error recovery.
    bool is_recovery = false;

    // LineIndex on which the TokenIndex starts.
    LineIndex token_line;

    // Zero-based byte offset of the token within its line.
    int32_t column;

    // We may have up to 32 bits of payload, based on the kind of token.
    union {
      static_assert(
          sizeof(TokenIndex) <= sizeof(int32_t),
          "Unable to pack token and identifier index into the same space!");

      IdentifierId ident_id = IdentifierId::Invalid;
      StringLiteralValueId string_literal_id;
      IntId int_id;
      RealId real_id;
      TokenIndex closing_token;
      TokenIndex opening_token;
      int32_t error_length;
    };
  };

  // Per-line lexed state: where the line lives in the source buffer and how
  // far it is indented.
  struct LineInfo {
    // The length will always be assigned later. Indent may be assigned if
    // non-zero.
    explicit LineInfo(int64_t start)
        : start(start),
          length(static_cast<int32_t>(llvm::StringRef::npos)),
          indent(0) {}
    explicit LineInfo(int64_t start, int32_t length)
        : start(start), length(length), indent(0) {}

    // Zero-based byte offset of the start of the line within the source buffer
    // provided.
    int64_t start;

    // The byte length of the line. Does not include the newline character (or a
    // nul-terminator or EOF).
    int32_t length;

    // The byte offset from the start of the line of the first non-whitespace
    // character.
    int32_t indent;
  };

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with `Lex::Lex` so that its
  // return can indicate if an error was encountered while lexing.
  explicit TokenizedBuffer(SharedValueStores& value_stores,
                           SourceBuffer& source)
      : value_stores_(&value_stores), source_(&source) {}

  auto GetLineInfo(LineIndex line) -> LineInfo&;
  auto GetLineInfo(LineIndex line) const -> const LineInfo&;
  auto AddLine(LineInfo info) -> LineIndex;

  auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
  auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
  auto AddToken(TokenInfo info) -> TokenIndex;

  auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                  PrintWidths widths) const -> void;

  // Used to allocate computed string literals.
  llvm::BumpPtrAllocator allocator_;

  SharedValueStores* value_stores_;
  SourceBuffer* source_;

  llvm::SmallVector<TokenInfo> token_infos_;
  llvm::SmallVector<LineInfo> line_infos_;

  // Stores the computed value of string literals so that StringRefs are
  // durable.
  llvm::SmallVector<std::unique_ptr<std::string>> computed_strings_;

  // The number of parse tree nodes that we expect to be created for the tokens
  // in this buffer.
  int expected_parse_tree_size_ = 0;

  bool has_errors_ = false;
};
// A diagnostic emitter that uses positions within a source buffer's text as
// its source of location information.
using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;

// A diagnostic emitter that uses tokens as its source of location information.
using TokenDiagnosticEmitter = DiagnosticEmitter<TokenIndex>;
  304. } // namespace Carbon::Lex
  305. #endif // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_