// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
#define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_

#include <cstdint>
#include <utility>

#include "common/ostream.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include "toolchain/base/index_base.h"
#include "toolchain/base/mem_usage.h"
#include "toolchain/base/shared_value_stores.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_info.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/source/source_buffer.h"

namespace Carbon::Lex {

class TokenizedBuffer;

struct LineInfo {
  explicit LineInfo(int32_t start) : start(start), indent(0) {}

  // Zero-based byte offset of the start of the line within the source buffer
  // provided.
  int32_t start;

  // The byte offset from the start of the line of the first non-whitespace
  // character.
  int32_t indent;
};

// A lightweight handle to a lexed `LineInfo` in a `TokenizedBuffer`.
//
// `LineIndex` objects are designed to be passed by value, not reference or
// pointer. They are also designed to be small and efficient to store in data
// structures.
//
// Each `LineIndex` object refers to a specific line in the source code that
// was lexed. They can be compared directly to establish that they refer to the
// same line or the relative position of different lines within the source.
//
// All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
struct LineIndex : public IndexBase<LineIndex> {
  static constexpr llvm::StringLiteral Label = "line";
  static const LineIndex None;

  using IndexBase::IndexBase;
};

constexpr LineIndex LineIndex::None(NoneIndex);

// A comment, which can be a block of lines. These are tracked separately from
// tokens because they don't affect parse; if they were part of tokens, we'd
// need more general special-casing within token logic.
//
// Note that `CommentInfo` is used for an API to expose the comment.
struct CommentData {
  // Zero-based byte offset of the start of the comment within the source
  // buffer provided.
  int32_t start;

  // The comment's length.
  int32_t length;
};

// Indices for `CommentData` within the buffer.
struct CommentIndex : public IndexBase<CommentIndex> {
  static constexpr llvm::StringLiteral Label = "comment";
  static const CommentIndex None;

  using IndexBase::IndexBase;
};

constexpr CommentIndex CommentIndex::None(NoneIndex);

// Random-access iterator over comments within the buffer.
using CommentIterator = IndexIterator<CommentIndex>;

// Random-access iterator over tokens within the buffer.
using TokenIterator = IndexIterator<TokenIndex>;

// A token range which is inclusive of the begin and end.
struct InclusiveTokenRange {
  TokenIndex begin;
  TokenIndex end;
};
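
// For illustration, a hypothetical sketch of forming a range covering a single
// token (assuming `token` is a valid `TokenIndex`); because both ends are
// inclusive, such a range covers exactly that one token:
//
//   InclusiveTokenRange range = {token, token};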

// A buffer of tokenized Carbon source code.
//
// This is constructed by lexing the source code text into a series of tokens.
// The buffer provides lightweight handles to tokens and other lexed entities,
// as well as iterations to walk the sequence of tokens found in the buffer.
//
// Lexing errors result in a potentially incomplete sequence of tokens and
// `has_errors()` returning true.
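//
// A hypothetical usage sketch, assuming `value_stores`, `source`, and
// `consumer` were set up elsewhere; the exact `Lex::Lex` signature is an
// assumption here and only illustrative:
//
//   TokenizedBuffer buffer = Lex::Lex(value_stores, source, consumer);
//   if (buffer.has_errors()) {
//     // The token sequence may be incomplete; report or bail out.
//   }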
class TokenizedBuffer : public Printable<TokenizedBuffer> {
 public:
  // A comment, which can be a block of lines.
  //
  // This is the API version of `CommentData`.
  struct CommentInfo {
    // The comment's full text, including `//` symbols. This may have several
    // lines for block comments.
    llvm::StringRef text;

    // The comment's indent.
    int32_t indent;

    // The first line of the comment.
    LineIndex start_line;
  };

  auto GetKind(TokenIndex token) const -> TokenKind;
  auto GetLine(TokenIndex token) const -> LineIndex;

  // Returns the 1-based line number.
  auto GetLineNumber(TokenIndex token) const -> int;

  // Returns the 1-based column number.
  auto GetColumnNumber(TokenIndex token) const -> int;

  // Returns the line and 1-based column number of the first character after
  // this token.
  auto GetEndLoc(TokenIndex token) const -> std::pair<LineIndex, int>;

  // Returns the source text lexed into this token.
  auto GetTokenText(TokenIndex token) const -> llvm::StringRef;

  // Returns the identifier associated with this token. The token kind must be
  // an `Identifier`.
  auto GetIdentifier(TokenIndex token) const -> IdentifierId;

  // Returns the value of an `IntLiteral()` token.
  auto GetIntLiteral(TokenIndex token) const -> IntId;

  // Returns the value of a `RealLiteral()` token.
  auto GetRealLiteral(TokenIndex token) const -> RealId;

  // Returns the value of a `StringLiteral()` token.
  auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;

  // Returns the size specified in a `*TypeLiteral()` token.
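  // For example, for an `i32` type literal this would be the value 32.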
  auto GetTypeLiteralSize(TokenIndex token) const -> IntId;

  // Returns the closing token matched with the given opening token.
  //
  // The given token must be an opening token kind.
  auto GetMatchedClosingToken(TokenIndex opening_token) const -> TokenIndex;

  // Returns the opening token matched with the given closing token.
  //
  // The given token must be a closing token kind.
  auto GetMatchedOpeningToken(TokenIndex closing_token) const -> TokenIndex;
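
  // For illustration, a hypothetical sketch: if `open` is the `(` token lexed
  // from `F(x)`, then `GetMatchedClosingToken(open)` is the matching `)`
  // token, and `GetMatchedOpeningToken` maps that `)` token back to `open`.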

  // Returns whether the given token has leading whitespace.
  auto HasLeadingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the given token has trailing whitespace.
  auto HasTrailingWhitespace(TokenIndex token) const -> bool;

  // Returns whether the token was created as part of an error recovery effort.
  //
  // For example, a closing paren inserted to match an unmatched paren.
  auto IsRecoveryToken(TokenIndex token) const -> bool;

  // Returns the 1-based indentation column number.
  auto GetIndentColumnNumber(LineIndex line) const -> int;

  auto GetByteOffset(TokenIndex token) const -> int32_t {
    return token_infos_.Get(token).byte_offset();
  }

  // Returns true if the token comes after the comment.
  auto IsAfterComment(TokenIndex token, CommentIndex comment_index) const
      -> bool;

  // Returns the comment's full text range.
  auto GetCommentText(CommentIndex comment_index) const -> llvm::StringRef;

  // Returns tokens as YAML. This prints the tracked token information on a
  // single line for each token. We use the single-line format so that the
  // output is compact and works well with tools like `grep`.
  //
  // An example token looks like:
  //
  // - { index: 1, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  auto Print(llvm::raw_ostream& out,
             bool omit_file_boundary_tokens = false) const -> void;

  // Prints a description of a single token. See `Print` for details on the
  // format.
  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token) const
      -> void;

  // Collects memory usage of members.
  auto CollectMemUsage(MemUsage& mem_usage, llvm::StringRef label) const
      -> void;

  // Converts a token to a diagnostic location.
  auto TokenToDiagnosticLoc(TokenIndex token) const
      -> Diagnostics::ConvertedLoc;

  // Returns true if the given range overlaps with an entry in
  // `dump_sem_ir_ranges_`. Must not be called when there are no ranges; query
  // `has_dump_sem_ir_ranges` first.
  auto OverlapsWithDumpSemIRRange(Lex::InclusiveTokenRange range) const -> bool;

  // Returns true if the buffer has errors that were detected at lexing time.
  auto has_errors() const -> bool { return has_errors_; }
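
  // Returns an iterable range over the tokens in the buffer, in lexed order.
  //
  // A hypothetical usage sketch, assuming `buffer` is a lexed
  // `TokenizedBuffer`:
  //
  //   for (TokenIndex token : buffer.tokens()) {
  //     llvm::outs() << buffer.GetTokenText(token) << "\n";
  //   }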
  auto tokens() const -> llvm::iterator_range<TokenIterator> {
    return llvm::make_range(TokenIterator(TokenIndex(0)),
                            TokenIterator(TokenIndex(token_infos_.size())));
  }

  auto size() const -> int { return token_infos_.size(); }

  auto comments() const -> llvm::iterator_range<CommentIterator> {
    return llvm::make_range(CommentIterator(CommentIndex(0)),
                            CommentIterator(CommentIndex(comments_.size())));
  }

  auto comments_size() const -> size_t { return comments_.size(); }

  // Returns true if any `DumpSemIRRange`s were provided.
  auto has_dump_sem_ir_ranges() const -> bool {
    return !dump_sem_ir_ranges_.empty();
  }

  // This is an upper bound on the number of output parse nodes in the absence
  // of errors.
  auto expected_max_parse_tree_size() const -> int {
    return expected_max_parse_tree_size_;
  }

  auto source() const -> const SourceBuffer& { return *source_; }

 private:
  friend class Lexer;

  class SourcePointerDiagnosticEmitter
      : public Diagnostics::Emitter<const char*> {
   public:
    explicit SourcePointerDiagnosticEmitter(Diagnostics::Consumer* consumer,
                                            const TokenizedBuffer* tokens)
        : Emitter(consumer), tokens_(tokens) {}

   protected:
    auto ConvertLoc(const char* loc, ContextFnT /*context_fn*/) const
        -> Diagnostics::ConvertedLoc override {
      return tokens_->SourcePointerToDiagnosticLoc(loc);
    }

   private:
    const TokenizedBuffer* tokens_;
  };

  class TokenDiagnosticEmitter : public Diagnostics::Emitter<TokenIndex> {
   public:
    explicit TokenDiagnosticEmitter(Diagnostics::Consumer* consumer,
                                    const TokenizedBuffer* tokens)
        : Emitter(consumer), tokens_(tokens) {}

   protected:
    auto ConvertLoc(TokenIndex token, ContextFnT /*context_fn*/) const
        -> Diagnostics::ConvertedLoc override {
      return tokens_->TokenToDiagnosticLoc(token);
    }

   private:
    const TokenizedBuffer* tokens_;
  };

  // Converts a pointer into the source to a diagnostic location.
  auto SourcePointerToDiagnosticLoc(const char* loc) const
      -> Diagnostics::ConvertedLoc;

  // Specifies minimum widths to use when printing a token's fields via
  // `PrintToken`.
  struct PrintWidths {
    // Widens `this` to the maximum of `this` and `widths` for each dimension.
    auto Widen(const PrintWidths& widths) -> void;

    int index;
    int kind;
    int line;
    int column;
    int indent;
  };

  // The constructor is merely responsible for trivial initialization of
  // members. A working object of this type is built with `Lex::Lex` so that
  // its return can indicate if an error was encountered while lexing.
  explicit TokenizedBuffer(SharedValueStores& value_stores
                           [[clang::lifetimebound]],
                           SourceBuffer& source [[clang::lifetimebound]])
      : value_stores_(&value_stores), source_(&source) {}

  auto FindLineIndex(int32_t byte_offset) const -> LineIndex;

  // Adds the token and adjusts the expected tree size.
  auto AddToken(TokenInfo info) -> TokenIndex;

  auto GetTokenPrintWidths(TokenIndex token) const -> PrintWidths;

  auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                  PrintWidths widths) const -> void;

  // Adds a comment. This uses the indent to potentially stitch together two
  // adjacent comments.
  auto AddComment(int32_t indent, int32_t start, int32_t end) -> void;

  // Used to allocate computed string literals.
  llvm::BumpPtrAllocator allocator_;

  SharedValueStores* value_stores_;
  SourceBuffer* source_;

  ValueStore<TokenIndex, TokenInfo> token_infos_;
  ValueStore<LineIndex, LineInfo> line_infos_;

  // Comments in the file.
  ValueStore<CommentIndex, CommentData> comments_;

  // A range of tokens marked by `//@dump-semir-[begin|end]`.
  //
  // The particular syntax was chosen because it can be lexed efficiently. It
  // only occurs in invalid comment strings, so shouldn't slow down lexing of
  // correct code. It's also comment-like because its presence won't affect
  // parse/check.
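  //
  // An illustrative, hypothetical sketch of how the markers described above
  // might appear in Carbon source:
  //
  //   //@dump-semir-begin
  //   fn Example() {}
  //   //@dump-semir-end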
  llvm::SmallVector<InclusiveTokenRange> dump_sem_ir_ranges_;

  // An upper bound on the number of parse tree nodes that we expect to be
  // created for the tokens in this buffer.
  int expected_max_parse_tree_size_ = 0;

  bool has_errors_ = false;

  // A vector of flags for recovery tokens. If empty, there are none. When
  // doing token recovery, this will be extended to be indexable by token
  // indices and contain true for the tokens that were synthesized for
  // recovery.
  llvm::BitVector recovery_tokens_;
};

inline auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
  return token_infos_.Get(token).kind();
}

inline auto TokenizedBuffer::HasLeadingWhitespace(TokenIndex token) const
    -> bool {
  return token_infos_.Get(token).has_leading_space();
}

inline auto TokenizedBuffer::HasTrailingWhitespace(TokenIndex token) const
    -> bool {
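  // Trailing whitespace isn't stored separately: a token has trailing
  // whitespace exactly when a next token exists and has leading whitespace.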
  TokenIterator it(token);
  ++it;
  return it != tokens().end() && token_infos_.Get(*it).has_leading_space();
}

inline auto TokenizedBuffer::AddToken(TokenInfo info) -> TokenIndex {
  expected_max_parse_tree_size_ += info.kind().expected_max_parse_tree_size();
  return token_infos_.Add(info);
}

}  // namespace Carbon::Lex

#endif  // CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_