// tokenized_buffer.h
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
#define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_

#include <cstdint>
#include <utility>

#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/shared_value_stores.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_info.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/source/source_buffer.h"
namespace Carbon::Lex {

class TokenizedBuffer;
  24. struct LineInfo {
  25. explicit LineInfo(int32_t start) : start(start), indent(0) {}
  26. // Zero-based byte offset of the start of the line within the source buffer
  27. // provided.
  28. int32_t start;
  29. // The byte offset from the start of the line of the first non-whitespace
  30. // character.
  31. int32_t indent;
  32. };
  33. // A lightweight handle to a lexed `LineInfo` in a `TokenizedBuffer`.
  34. //
  35. // `LineIndex` objects are designed to be passed by value, not reference or
  36. // pointer. They are also designed to be small and efficient to store in data
  37. // structures.
  38. //
  39. // Each `LineIndex` object refers to a specific line in the source code that was
  40. // lexed. They can be compared directly to establish that they refer to the
  41. // same line or the relative position of different lines within the source.
  42. //
  43. // All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
  44. struct LineIndex : public IndexBase<LineIndex> {
  45. static constexpr llvm::StringLiteral Label = "line";
  46. static const LineIndex None;
  47. using IndexBase::IndexBase;
  48. };
  49. inline constexpr LineIndex LineIndex::None(NoneIndex);
  50. // A comment, which can be a block of lines. These are tracked separately from
  51. // tokens because they don't affect parse; if they were part of tokens, we'd
  52. // need more general special-casing within token logic.
  53. //
  54. // Note that `CommentInfo` is used for an API to expose the comment.
  55. struct CommentData {
  56. // Zero-based byte offset of the start of the comment within the source
  57. // buffer provided.
  58. int32_t start;
  59. // The comment's length.
  60. int32_t length;
  61. };
  62. // Indices for `CommentData` within the buffer.
  63. struct CommentIndex : public IndexBase<CommentIndex> {
  64. static constexpr llvm::StringLiteral Label = "comment";
  65. static const CommentIndex None;
  66. using IndexBase::IndexBase;
  67. };
  68. inline constexpr CommentIndex CommentIndex::None(NoneIndex);
  69. // Random-access iterator over comments within the buffer.
  70. using CommentIterator = IndexIterator<CommentIndex>;
  71. // Random-access iterator over tokens within the buffer.
  72. using TokenIterator = IndexIterator<TokenIndex>;
  73. // A token range which is inclusive of the begin and end.
  74. struct InclusiveTokenRange {
  75. TokenIndex begin;
  76. TokenIndex end;
  77. };
  78. // A buffer of tokenized Carbon source code.
  79. //
  80. // This is constructed by lexing the source code text into a series of tokens.
  81. // The buffer provides lightweight handles to tokens and other lexed entities,
  82. // as well as iterations to walk the sequence of tokens found in the buffer.
  83. //
  84. // Lexing errors result in a potentially incomplete sequence of tokens and
  85. // `HasError` returning true.
  86. class TokenizedBuffer : public Printable<TokenizedBuffer> {
  87. public:
  88. // A comment, which can be a block of lines.
  89. //
  90. // This is the API version of `CommentData`.
  91. struct CommentInfo {
  92. // The comment's full text, including `//` symbols. This may have several
  93. // lines for block comments.
  94. llvm::StringRef text;
  95. // The comment's indent.
  96. int32_t indent;
  97. // The first line of the comment.
  98. LineIndex start_line;
  99. };
  100. auto GetKind(TokenIndex token) const -> TokenKind;
  101. auto GetLine(TokenIndex token) const -> LineIndex;
  102. // Returns the 1-based line number.
  103. auto GetLineNumber(TokenIndex token) const -> int;
  104. // Returns the 1-based column number.
  105. auto GetColumnNumber(TokenIndex token) const -> int;
  106. // Returns the line and 1-based column number of the first character after
  107. // this token.
  108. auto GetEndLoc(TokenIndex token) const -> std::pair<LineIndex, int>;
  109. // Returns the source text lexed into this token.
  110. auto GetTokenText(TokenIndex token) const -> llvm::StringRef;
  111. // Returns the identifier associated with this token. The token kind must be
  112. // an `Identifier`.
  113. auto GetIdentifier(TokenIndex token) const -> IdentifierId;
  114. // Returns the value of an `IntLiteral` token.
  115. auto GetIntLiteral(TokenIndex token) const -> IntId;
  116. // Returns the value of an `RealLiteral` token.
  117. auto GetRealLiteral(TokenIndex token) const -> RealId;
  118. // Returns the value of a `StringLiteral` token.
  119. auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;
  120. // Returns the value of a `CharLiteral` token.
  121. auto GetCharLiteralValue(TokenIndex token) const -> CharLiteralValue;
  122. // Returns the size specified in a `*TypeLiteral` token.
  123. auto GetTypeLiteralSize(TokenIndex token) const -> IntId;
  124. // Returns the closing token matched with the given opening token.
  125. //
  126. // The given token must be an opening token kind.
  127. auto GetMatchedClosingToken(TokenIndex opening_token) const -> TokenIndex;
  128. // Returns the opening token matched with the given closing token.
  129. //
  130. // The given token must be a closing token kind.
  131. auto GetMatchedOpeningToken(TokenIndex closing_token) const -> TokenIndex;
  132. // Returns whether the given token has leading whitespace.
  133. auto HasLeadingWhitespace(TokenIndex token) const -> bool;
  134. // Returns whether the given token has trailing whitespace.
  135. auto HasTrailingWhitespace(TokenIndex token) const -> bool;
  136. // Returns whether the token was created as part of an error recovery effort.
  137. //
  138. // For example, a closing paren inserted to match an unmatched paren.
  139. auto IsRecoveryToken(TokenIndex token) const -> bool;
  140. // Returns the 1-based indentation column number.
  141. auto GetIndentColumnNumber(LineIndex line) const -> int;
  142. auto GetByteOffset(TokenIndex token) const -> int32_t {
  143. return token_infos_.Get(token).byte_offset();
  144. }
  145. // Returns true if the token comes after the comment.
  146. auto IsAfterComment(TokenIndex token, CommentIndex comment_index) const
  147. -> bool;
  148. // Returns the comment's full text range.
  149. auto GetCommentText(CommentIndex comment_index) const -> llvm::StringRef;
  150. // Returns tokens as YAML. This prints the tracked token information on a
  151. // single line for each token. We use the single-line format so that output is
  152. // compact, and so that tools like `grep` are compatible.
  153. //
  154. // An example token looks like:
  155. //
  156. // - { index: 1, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  157. auto Print(llvm::raw_ostream& out,
  158. bool omit_file_boundary_tokens = false) const -> void;
  159. // Prints a description of a single token. See `Print` for details on the
  160. // format.
  161. auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
  162. -> void;
  163. // Collects memory usage of members.
  164. auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
  165. -> void;
  166. // Converts a token to a diagnostic location.
  167. auto TokenToDiagnosticLoc(TokenIndex token) const
  168. -> Diagnostics::ConvertedLoc;
  169. // Returns true if the given range overlaps with an entry in
  170. // `dump_sem_ir_ranges_`. Must not be called when there are no ranges; query
  171. // `has_dump_sem_ir_ranges` first.
  172. auto OverlapsWithDumpSemIRRange(Lex::InclusiveTokenRange range) const -> bool;
  173. // Returns true if the buffer has errors that were detected at lexing time.
  174. auto has_errors() const -> bool { return has_errors_; }
  175. auto tokens() const -> llvm::iterator_range<TokenIterator> {
  176. return llvm::make_range(TokenIterator(TokenIndex(0)),
  177. TokenIterator(TokenIndex(token_infos_.size())));
  178. }
  179. auto size() const -> int { return token_infos_.size(); }
  180. auto comments() const -> llvm::iterator_range<CommentIterator> {
  181. return llvm::make_range(CommentIterator(CommentIndex(0)),
  182. CommentIterator(CommentIndex(comments_.size())));
  183. }
  184. auto comments_size() const -> size_t { return comments_.size(); }
  185. auto has_include_in_dumps() const -> bool { return has_include_in_dumps_; }
  186. // Returns true if any `DumpSemIRRange`s were provided.
  187. auto has_dump_sem_ir_ranges() const -> bool {
  188. return !dump_sem_ir_ranges_.empty();
  189. }
  190. // This is an upper bound on the number of output parse nodes in the absence
  191. // of errors.
  192. auto expected_max_parse_tree_size() const -> int {
  193. return expected_max_parse_tree_size_;
  194. }
  195. auto source() const -> const SourceBuffer& { return *source_; }
  196. private:
  197. friend class Lexer;
  198. class SourcePointerDiagnosticEmitter
  199. : public Diagnostics::Emitter<const char*> {
  200. public:
  201. explicit SourcePointerDiagnosticEmitter(Diagnostics::Consumer* consumer,
  202. const TokenizedBuffer* tokens)
  203. : Emitter(consumer), tokens_(tokens) {}
  204. protected:
  205. auto ConvertLoc(const char* loc, ContextFnT /*context_fn*/) const
  206. -> Diagnostics::ConvertedLoc override {
  207. return tokens_->SourcePointerToDiagnosticLoc(loc);
  208. }
  209. private:
  210. const TokenizedBuffer* tokens_;
  211. };
  212. class TokenDiagnosticEmitter : public Diagnostics::Emitter<TokenIndex> {
  213. public:
  214. explicit TokenDiagnosticEmitter(Diagnostics::Consumer* consumer,
  215. const TokenizedBuffer* tokens)
  216. : Emitter(consumer), tokens_(tokens) {}
  217. protected:
  218. auto ConvertLoc(TokenIndex token, ContextFnT /*context_fn*/) const
  219. -> Diagnostics::ConvertedLoc override {
  220. return tokens_->TokenToDiagnosticLoc(token);
  221. }
  222. private:
  223. const TokenizedBuffer* tokens_;
  224. };
  225. // Converts a pointer into the source to a diagnostic location.
  226. auto SourcePointerToDiagnosticLoc(const char* loc) const
  227. -> Diagnostics::ConvertedLoc;
  228. // Specifies minimum widths to use when printing a token's fields via
  229. // `printToken`.
  230. struct PrintWidths {
  231. // Widens `this` to the maximum of `this` and `new_width` for each
  232. // dimension.
  233. auto Widen(const PrintWidths& widths) -> void;
  234. int index;
  235. int kind;
  236. int line;
  237. int column;
  238. int indent;
  239. };
  240. // The constructor is merely responsible for trivial initialization of
  241. // members. A working object of this type is built with `Lex::Lex` so that its
  242. // return can indicate if an error was encountered while lexing.
  243. explicit TokenizedBuffer(SharedValueStores& value_stores
  244. [[clang::lifetimebound]],
  245. SourceBuffer& source [[clang::lifetimebound]])
  246. : value_stores_(&value_stores), source_(&source) {}
  247. auto FindLineIndex(int32_t byte_offset) const -> LineIndex;
  248. // Adds the token and adjusts the expected tree size.
  249. auto AddToken(TokenInfo info) -> TokenIndex;
  250. auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;
  251. auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
  252. PrintWidths widths) const -> void;
  253. // Adds a comment. This uses the indent to potentially stitch together two
  254. // adjacent comments.
  255. auto AddComment(int32_t indent, int32_t start, int32_t end) -> void;
  256. // Used to allocate computed string literals.
  257. llvm::BumpPtrAllocator allocator_;
  258. SharedValueStores* value_stores_;
  259. SourceBuffer* source_;
  260. ValueStore<TokenIndex, TokenInfo> token_infos_;
  261. ValueStore<LineIndex, LineInfo> line_infos_;
  262. // Comments in the file.
  263. ValueStore<CommentIndex, CommentData> comments_;
  264. // Whether SemIR dumping is explicitly enabled for this file. This is marked
  265. // by `//@include-in-dumps`, and overrides other file-inclusion selection
  266. // choices. It can be combined with ranges.
  267. bool has_include_in_dumps_ = false;
  268. // A range of tokens marked by `//@dump-sem-ir-[begin|end]`.
  269. //
  270. // The particular syntax was chosen because it can be lexed efficiently. It
  271. // only occurs in invalid comment strings, so shouldn't slow down lexing of
  272. // correct code. It's also comment-like because its presence won't affect
  273. // parse/check.
  274. llvm::SmallVector<InclusiveTokenRange> dump_sem_ir_ranges_;
  275. // An upper bound on the number of parse tree nodes that we expect to be
  276. // created for the tokens in this buffer.
  277. int expected_max_parse_tree_size_ = 0;
  278. bool has_errors_ = false;
  279. // A vector of flags for recovery tokens. If empty, there are none. When doing
  280. // token recovery, this will be extended to be indexable by token indices and
  281. // contain true for the tokens that were synthesized for recovery.
  282. llvm::BitVector recovery_tokens_;
  283. };
  284. inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
  285. return token_infos_.Get(token).kind();
  286. }
  287. inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
  288. -> bool {
  289. return token_infos_.Get(token).has_leading_space();
  290. }
  291. inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
  292. -> bool {
  293. TokenIterator it(token);
  294. ++it;
  295. return it != tokens().end() && token_infos_.Get(*it).has_leading_space();
  296. }
  297. inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
  298. expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
  299. return token_infos_.Add(info);
  300. }
} // namespace Carbon::Lex

#endif // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_