// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef LEXER_TOKENIZED_BUFFER_H_
#define LEXER_TOKENIZED_BUFFER_H_

#include <stdint.h>

#include <iterator>

#include "diagnostics/diagnostic_emitter.h"
#include "lexer/token_kind.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "source/source_buffer.h"

namespace Carbon {
  19. // A buffer of tokenized Carbon source code.
  20. //
  21. // This is constructed by lexing the source code text into a series of tokens.
  22. // The buffer provides lightweight handles to tokens and other lexed entities,
  23. // as well as iterations to walk the sequence of tokens found in the buffer.
  24. //
  25. // Lexing errors result in a potentially incomplete sequence of tokens and
  26. // `HasError` returning true.
  27. class TokenizedBuffer {
  28. public:
  29. // A lightweight handle to a lexed token in a `TokenizedBuffer`.
  30. //
  31. // `Token` objects are designed to be passed by value, not reference or
  32. // pointer. They are also designed to be small and efficient to store in data
  33. // structures.
  34. //
  35. // `Token` objects from the same `TokenizedBuffer` can be compared with each
  36. // other, both for being the same token within the buffer, and to establish
  37. // relative position within the token stream that has been lexed out of the
  38. // buffer.
  39. //
  40. // All other APIs to query a `Token` are on the `TokenizedBuffer`.
  41. class Token {
  42. public:
  43. Token() = default;
  44. bool operator==(const Token& rhs) const { return index == rhs.index; }
  45. bool operator!=(const Token& rhs) const { return index != rhs.index; }
  46. bool operator<(const Token& rhs) const { return index < rhs.index; }
  47. bool operator<=(const Token& rhs) const { return index <= rhs.index; }
  48. bool operator>(const Token& rhs) const { return index > rhs.index; }
  49. bool operator>=(const Token& rhs) const { return index >= rhs.index; }
  50. private:
  51. friend class TokenizedBuffer;
  52. explicit Token(int index) : index(index) {}
  53. int32_t index;
  54. };
  55. // A lightweight handle to a lexed line in a `TokenizedBuffer`.
  56. //
  57. // `Line` objects are designed to be passed by value, not reference or
  58. // pointer. They are also designed to be small and efficient to store in data
  59. // structures.
  60. //
  61. // Each `Line` object refers to a specific line in the source code that was
  62. // lexed. They can be compared directly to establish that they refer to the
  63. // same line or the relative position of different lines within the source.
  64. //
  65. // All other APIs to query a `Line` are on the `TokenizedBuffer`.
  66. class Line {
  67. public:
  68. Line() = default;
  69. bool operator==(const Line& rhs) const { return index == rhs.index; }
  70. bool operator!=(const Line& rhs) const { return index != rhs.index; }
  71. bool operator<(const Line& rhs) const { return index < rhs.index; }
  72. bool operator<=(const Line& rhs) const { return index <= rhs.index; }
  73. bool operator>(const Line& rhs) const { return index > rhs.index; }
  74. bool operator>=(const Line& rhs) const { return index >= rhs.index; }
  75. private:
  76. friend class TokenizedBuffer;
  77. explicit Line(int index) : index(index) {}
  78. int32_t index;
  79. };
  80. // A lightweight handle to a lexed identifier in a `TokenizedBuffer`.
  81. //
  82. // `Identifier` objects are designed to be passed by value, not reference or
  83. // pointer. They are also designed to be small and efficient to store in data
  84. // structures.
  85. //
  86. // Each identifier lexed is canonicalized to a single entry in the identifier
  87. // table. `Identifier` objects will compare equal if they refer to the same
  88. // identifier spelling. Where the identifier was written is not preserved.
  89. //
  90. // All other APIs to query a `Identifier` are on the `TokenizedBuffer`.
  91. class Identifier {
  92. public:
  93. Identifier() = default;
  94. // Most normal APIs are provided by the `TokenizedBuffer`, we just support
  95. // basic comparison operations.
  96. bool operator==(const Identifier& rhs) const { return index == rhs.index; }
  97. bool operator!=(const Identifier& rhs) const { return index != rhs.index; }
  98. private:
  99. friend class TokenizedBuffer;
  100. explicit Identifier(int index) : index(index) {}
  101. int32_t index;
  102. };
  103. // Random-access iterator over tokens within the buffer.
  104. class TokenIterator
  105. : public llvm::iterator_facade_base<
  106. TokenIterator, std::random_access_iterator_tag, Token, int> {
  107. public:
  108. TokenIterator() = default;
  109. explicit TokenIterator(Token token) : token(token) {}
  110. bool operator==(const TokenIterator& rhs) const {
  111. return token == rhs.token;
  112. }
  113. bool operator<(const TokenIterator& rhs) const { return token < rhs.token; }
  114. const Token& operator*() const { return token; }
  115. Token& operator*() { return token; }
  116. int operator-(const TokenIterator& rhs) const {
  117. return token.index - rhs.token.index;
  118. }
  119. TokenIterator& operator+=(int n) {
  120. token.index += n;
  121. return *this;
  122. }
  123. TokenIterator& operator-=(int n) {
  124. token.index -= n;
  125. return *this;
  126. }
  127. private:
  128. friend class TokenizedBuffer;
  129. Token token;
  130. };
  131. // Lexes a buffer of source code into a tokenized buffer.
  132. //
  133. // The provided source buffer must outlive any returned `TokenizedBuffer`
  134. // which will refer into the source.
  135. //
  136. // FIXME: Need to pass in some diagnostic machinery to report the details of
  137. // the error! Right now it prints to stderr.
  138. static TokenizedBuffer Lex(SourceBuffer& source, DiagnosticEmitter& emitter);
  139. // Returns true if the buffer has errors that are detectable at lexing time.
  140. auto HasErrors() const -> bool { return has_errors; }
  141. llvm::iterator_range<TokenIterator> Tokens() const {
  142. return llvm::make_range(TokenIterator(Token(0)),
  143. TokenIterator(Token(token_infos.size())));
  144. }
  145. auto Size() const -> int { return token_infos.size(); }
  146. auto GetKind(Token token) const -> TokenKind;
  147. auto GetLine(Token token) const -> Line;
  148. // Returns the 1-based line number.
  149. auto GetLineNumber(Token token) const -> int;
  150. // Returns the 1-based column number.
  151. auto GetColumnNumber(Token token) const -> int;
  152. // Returns the source text lexed into this token.
  153. auto GetTokenText(Token token) const -> llvm::StringRef;
  154. // Returns the identifier associated with this token. The token kind must be
  155. // an `Identifier`.
  156. auto GetIdentifier(Token token) const -> Identifier;
  157. // Returns the value of an `IntegerLiteral()` token.
  158. auto GetIntegerLiteral(Token token) const -> llvm::APInt;
  159. // Returns the closing token matched with the given opening token.
  160. //
  161. // The given token must be an opening token kind.
  162. auto GetMatchedClosingToken(Token opening_token) const -> Token;
  163. // Returns the opening token matched with the given closing token.
  164. //
  165. // The given token must be a closing token kind.
  166. auto GetMatchedOpeningToken(Token closing_token) const -> Token;
  167. // Returns whether the token was created as part of an error recovery effort.
  168. //
  169. // For example, a closing paren inserted to match an unmatched paren.
  170. auto IsRecoveryToken(Token token) const -> bool;
  171. // Returns the 1-based line number.
  172. auto GetLineNumber(Line line) const -> int;
  173. // Returns the 1-based indentation column number.
  174. auto GetIndentColumnNumber(Line line) const -> int;
  175. // Returns the text for an identifier.
  176. auto GetIdentifierText(Identifier id) const -> llvm::StringRef;
  177. // Prints a description of the tokenized stream to the provided `raw_ostream`.
  178. //
  179. // It prints one line of information for each token in the buffer, including
  180. // the kind of token, where it occurs within the source file, indentation for
  181. // the associated line, the spelling of the token in source, and any
  182. // additional information tracked such as which unique identifier it is or any
  183. // matched grouping token.
  184. //
  185. // Each line is formatted as a YAML record:
  186. //
  187. // clang-format off
  188. // ```
  189. // token: { index: 0, kind: 'Semi', line: 1, column: 1, indent: 1, spelling: ';' }
  190. // ```
  191. // clang-format on
  192. //
  193. // This can be parsed as YAML using tools like `python-yq` combined with `jq`
  194. // on the command line. The format is also reasonably amenable to other
  195. // line-oriented shell tools from `grep` to `awk`.
  196. auto Print(llvm::raw_ostream& output_stream) const -> void;
  197. // Prints a description of a single token. See `print` for details on the
  198. // format.
  199. auto PrintToken(llvm::raw_ostream& output_stream, Token token) const -> void;
  200. private:
  201. // Implementation detail struct implementing the actual lexer logic.
  202. class Lexer;
  203. friend Lexer;
  204. // Specifies minimum widths to use when printing a token's fields via
  205. // `printToken`.
  206. struct PrintWidths {
  207. int index;
  208. int kind;
  209. int column;
  210. int line;
  211. int indent;
  212. // Widens `this` to the maximum of `this` and `new_width` for each
  213. // dimension.
  214. void Widen(const PrintWidths& new_width);
  215. };
  216. struct TokenInfo {
  217. TokenKind kind;
  218. // Whether the token was injected artificially during error recovery.
  219. bool is_recovery = false;
  220. // Line on which the Token starts.
  221. Line token_line;
  222. // Zero-based byte offset of the token within its line.
  223. int32_t column;
  224. // We may have up to 32 bits of payload, based on the kind of token.
  225. union {
  226. static_assert(
  227. sizeof(Token) <= sizeof(int32_t),
  228. "Unable to pack token and identifier index into the same space!");
  229. Identifier id;
  230. int32_t literal_index;
  231. Token closing_token;
  232. Token opening_token;
  233. int32_t error_length;
  234. };
  235. };
  236. struct LineInfo {
  237. // Zero-based byte offset of the start of the line within the source buffer
  238. // provided.
  239. int64_t start;
  240. // The byte length of the line. Does not include the newline character (or a
  241. // null terminator or EOF).
  242. int32_t length;
  243. // The byte offset from the start of the line of the first non-whitespace
  244. // character.
  245. int32_t indent;
  246. };
  247. struct IdentifierInfo {
  248. llvm::StringRef text;
  249. };
  250. // The constructor is merely responsible for trivial initialization of
  251. // members. A working object of this type is built with the `lex` function
  252. // above so that its return can indicate if an error was encountered while
  253. // lexing.
  254. explicit TokenizedBuffer(SourceBuffer& source) : source(&source) {}
  255. auto GetLineInfo(Line line) -> LineInfo&;
  256. auto GetLineInfo(Line line) const -> const LineInfo&;
  257. auto AddLine(LineInfo info) -> Line;
  258. auto GetTokenInfo(Token token) -> TokenInfo&;
  259. auto GetTokenInfo(Token token) const -> const TokenInfo&;
  260. auto AddToken(TokenInfo info) -> Token;
  261. auto GetTokenPrintWidths(Token token) const -> PrintWidths;
  262. auto PrintToken(llvm::raw_ostream& output_stream, Token token,
  263. PrintWidths widths) const -> void;
  264. SourceBuffer* source;
  265. llvm::SmallVector<TokenInfo, 16> token_infos;
  266. llvm::SmallVector<LineInfo, 16> line_infos;
  267. llvm::SmallVector<IdentifierInfo, 16> identifier_infos;
  268. llvm::SmallVector<llvm::APInt, 16> int_literals;
  269. llvm::DenseMap<llvm::StringRef, Identifier> identifier_map;
  270. bool has_errors = false;
  271. };
}  // namespace Carbon

#endif  // LEXER_TOKENIZED_BUFFER_H_