parser_impl.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "parser/parser_impl.h"
  5. #include <cstdlib>
  6. #include "lexer/token_kind.h"
  7. #include "lexer/tokenized_buffer.h"
  8. #include "llvm/ADT/Optional.h"
  9. #include "llvm/Support/raw_ostream.h"
  10. #include "parser/parse_node_kind.h"
  11. #include "parser/parse_tree.h"
  12. namespace Carbon {
  13. auto ParseTree::Parser::Parse(TokenizedBuffer& tokens,
  14. DiagnosticEmitter& /*unused*/) -> ParseTree {
  15. ParseTree tree(tokens);
  16. // We expect to have a 1:1 correspondence between tokens and tree nodes, so
  17. // reserve the space we expect to need here to avoid allocation and copying
  18. // overhead.
  19. tree.node_impls.reserve(tokens.Size());
  20. Parser parser(tree, tokens);
  21. while (parser.position != parser.end)
  22. parser.ParseDeclaration();
  23. assert(tree.Verify() && "Parse tree built but does not verify!");
  24. return tree;
  25. }
  26. auto ParseTree::Parser::Consume(TokenKind kind) -> TokenizedBuffer::Token {
  27. TokenizedBuffer::Token t = *position;
  28. assert(tokens.GetKind(t) == kind && "The current token is the wrong kind!");
  29. ++position;
  30. return t;
  31. }
  32. auto ParseTree::Parser::ConsumeIf(TokenKind kind)
  33. -> llvm::Optional<TokenizedBuffer::Token> {
  34. if (tokens.GetKind(*position) != kind)
  35. return {};
  36. return *position++;
  37. }
  38. auto ParseTree::Parser::AddLeafNode(ParseNodeKind kind,
  39. TokenizedBuffer::Token token) -> Node {
  40. Node n(tree.node_impls.size());
  41. tree.node_impls.push_back(NodeImpl(kind, token, /*SubtreeSize=*/1));
  42. return n;
  43. }
  44. auto ParseTree::Parser::ConsumeAndAddLeafNodeIf(TokenKind t_kind,
  45. ParseNodeKind n_kind)
  46. -> llvm::Optional<Node> {
  47. auto t = ConsumeIf(t_kind);
  48. if (!t)
  49. return {};
  50. return AddLeafNode(n_kind, *t);
  51. }
  52. auto ParseTree::Parser::MarkNodeError(Node n) -> void {
  53. tree.node_impls[n.index].has_error = true;
  54. tree.has_errors = true;
  55. }
  56. // A marker for the start of a node's subtree.
  57. //
  58. // This is used to track the size of the node's subtree and ensure at least one
  59. // parse node is added. It can be used repeatedly if multiple subtrees start at
  60. // the same position.
  61. struct ParseTree::Parser::SubtreeStart {
  62. int tree_size;
  63. bool node_added = false;
  64. ~SubtreeStart() {
  65. assert(node_added && "Never added a node for a subtree region!");
  66. }
  67. };
  68. auto ParseTree::Parser::StartSubtree() -> SubtreeStart {
  69. return {static_cast<int>(tree.node_impls.size())};
  70. }
  71. auto ParseTree::Parser::AddNode(ParseNodeKind n_kind, TokenizedBuffer::Token t,
  72. SubtreeStart& start, bool has_error) -> Node {
  73. // The size of the subtree is the change in size from when we started this
  74. // subtree to now, but including the node we're about to add.
  75. int tree_stop_size = tree.node_impls.size() + 1;
  76. int subtree_size = tree_stop_size - start.tree_size;
  77. Node n(tree.node_impls.size());
  78. tree.node_impls.push_back(NodeImpl(n_kind, t, subtree_size));
  79. if (has_error)
  80. MarkNodeError(n);
  81. start.node_added = true;
  82. return n;
  83. }
  84. auto ParseTree::Parser::SkipMatchingGroup() -> bool {
  85. assert(position != end && "Cannot skip at the end!");
  86. TokenizedBuffer::Token t = *position;
  87. TokenKind t_kind = tokens.GetKind(t);
  88. if (!t_kind.IsOpeningSymbol())
  89. return false;
  90. position = std::next(
  91. TokenizedBuffer::TokenIterator(tokens.GetMatchedClosingToken(t)));
  92. return true;
  93. }
  94. auto ParseTree::Parser::SkipPastLikelyDeclarationEnd(
  95. TokenizedBuffer::Token skip_root, bool is_inside_declaration)
  96. -> llvm::Optional<Node> {
  97. if (position == end)
  98. return {};
  99. TokenizedBuffer::Line root_line = tokens.GetLine(skip_root);
  100. int root_line_indent = tokens.GetIndentColumnNumber(root_line);
  101. // We will keep scanning through tokens on the same line as the root or
  102. // lines with greater indentation than root's line.
  103. auto is_same_line_or_indent_greater_than_root =
  104. [&](TokenizedBuffer::Token t) {
  105. TokenizedBuffer::Line l = tokens.GetLine(t);
  106. if (l == root_line)
  107. return true;
  108. return tokens.GetIndentColumnNumber(l) > root_line_indent;
  109. };
  110. do {
  111. TokenKind current_kind = tokens.GetKind(*position);
  112. if (current_kind == TokenKind::CloseCurlyBrace())
  113. // Immediately bail out if we hit an unmatched close curly, this will
  114. // pop us up a level of the syntax grouping.
  115. return {};
  116. // If we find a semicolon, we want to parse it to end the declaration.
  117. if (current_kind == TokenKind::Semi()) {
  118. TokenizedBuffer::Token semi = *position++;
  119. // Add a node for the semicolon. If we're inside of a declaration, this
  120. // is a declaration ending semicolon, otherwise it simply forms an empty
  121. // declaration.
  122. return AddLeafNode(is_inside_declaration
  123. ? ParseNodeKind::DeclarationEnd()
  124. : ParseNodeKind::EmptyDeclaration(),
  125. semi);
  126. }
  127. // Skip over any matching group of tokens.
  128. if (SkipMatchingGroup())
  129. continue;
  130. // Otherwise just step forward one token.
  131. ++position;
  132. } while (position != end &&
  133. is_same_line_or_indent_greater_than_root(*position));
  134. return {};
  135. }
  136. auto ParseTree::Parser::ParseFunctionSignature() -> Node {
  137. assert(position != end && "Cannot parse past the end!");
  138. TokenizedBuffer::Token open_paren = Consume(TokenKind::OpenParen());
  139. assert(position != end &&
  140. "The lexer ensures we always have a closing paren!");
  141. auto start = StartSubtree();
  142. // FIXME: Add support for parsing parameters.
  143. bool has_errors = false;
  144. auto close_paren = ConsumeIf(TokenKind::CloseParen());
  145. if (!close_paren) {
  146. llvm::errs() << "ERROR: unexpected token before the close of the "
  147. "parameters on line "
  148. << tokens.GetLineNumber(*position) << "!\n";
  149. has_errors = true;
  150. // We can trivially skip to the actual close parenthesis from here.
  151. close_paren = tokens.GetMatchedClosingToken(open_paren);
  152. position = std::next(TokenizedBuffer::TokenIterator(*close_paren));
  153. }
  154. AddLeafNode(ParseNodeKind::ParameterListEnd(), *close_paren);
  155. // FIXME: Implement parsing of a return type.
  156. return AddNode(ParseNodeKind::ParameterList(), open_paren, start, has_errors);
  157. }
  158. auto ParseTree::Parser::ParseCodeBlock() -> Node {
  159. assert(position != end && "Cannot parse past the end!");
  160. TokenizedBuffer::Token open_curly = Consume(TokenKind::OpenCurlyBrace());
  161. assert(position != end &&
  162. "The lexer ensures we always have a closing curly!");
  163. auto start = StartSubtree();
  164. bool has_errors = false;
  165. // Loop over all the different possibly nested elements in the code block.
  166. for (;;) {
  167. switch (tokens.GetKind(*position)) {
  168. default:
  169. // FIXME: Add support for parsing more expressions & statements.
  170. llvm::errs() << "ERROR: unexpected token before the close of the "
  171. "function definition on line "
  172. << tokens.GetLineNumber(*position) << "!\n";
  173. has_errors = true;
  174. // We can trivially skip to the actual close curly brace from here.
  175. position = TokenizedBuffer::TokenIterator(
  176. tokens.GetMatchedClosingToken(open_curly));
  177. // Now fall through to the close curly brace handling code.
  178. LLVM_FALLTHROUGH;
  179. case TokenKind::CloseCurlyBrace():
  180. break;
  181. case TokenKind::OpenCurlyBrace():
  182. // FIXME: We should consider avoiding recursion here with some side
  183. // stack.
  184. ParseCodeBlock();
  185. continue;
  186. }
  187. // We only continue looping with `continue` above.
  188. break;
  189. }
  190. // We always reach here having set our position in the token stream to the
  191. // close curly brace.
  192. AddLeafNode(ParseNodeKind::CodeBlockEnd(),
  193. Consume(TokenKind::CloseCurlyBrace()));
  194. return AddNode(ParseNodeKind::CodeBlock(), open_curly, start, has_errors);
  195. }
  196. auto ParseTree::Parser::ParseFunctionDeclaration() -> Node {
  197. assert(position != end && "Cannot parse past the end!");
  198. TokenizedBuffer::Token function_intro_token = Consume(TokenKind::FnKeyword());
  199. auto start = StartSubtree();
  200. auto add_error_function_node = [&] {
  201. return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
  202. start, /*has_error=*/true);
  203. };
  204. if (position == end) {
  205. llvm::errs() << "ERROR: File ended with a function introducer on line "
  206. << tokens.GetLineNumber(function_intro_token) << "!\n";
  207. return add_error_function_node();
  208. }
  209. auto name_n = ConsumeAndAddLeafNodeIf(TokenKind::Identifier(),
  210. ParseNodeKind::Identifier());
  211. if (!name_n) {
  212. llvm::errs() << "ERROR: Function declaration with no name on line "
  213. << tokens.GetLineNumber(function_intro_token) << "!\n";
  214. // FIXME: We could change the lexer to allow us to synthesize certain
  215. // kinds of tokens and try to "recover" here, but unclear that this is
  216. // really useful.
  217. SkipPastLikelyDeclarationEnd(function_intro_token);
  218. return add_error_function_node();
  219. }
  220. if (position == end) {
  221. llvm::errs() << "ERROR: File ended after a function introducer and "
  222. "identifier on line "
  223. << tokens.GetLineNumber(function_intro_token) << "!\n";
  224. return add_error_function_node();
  225. }
  226. TokenizedBuffer::Token open_paren = *position;
  227. if (tokens.GetKind(open_paren) != TokenKind::OpenParen()) {
  228. llvm::errs()
  229. << "ERROR: Missing open parentheses in declaration of function '"
  230. << tokens.GetTokenText(tree.GetNodeToken(*name_n)) << "' on line "
  231. << tokens.GetLineNumber(function_intro_token) << "!\n";
  232. SkipPastLikelyDeclarationEnd(function_intro_token);
  233. return add_error_function_node();
  234. }
  235. assert(std::next(position) != end &&
  236. "Unbalanced parentheses should be rejected by the lexer.");
  237. TokenizedBuffer::Token close_paren =
  238. tokens.GetMatchedClosingToken(open_paren);
  239. Node signature_n = ParseFunctionSignature();
  240. assert(*std::prev(position) == close_paren &&
  241. "Should have parsed through the close paren, whether successfully "
  242. "or with errors.");
  243. if (tree.node_impls[signature_n.index].has_error) {
  244. // Don't try to parse more of the function declaration, but consume a
  245. // declaration ending semicolon if found (without going to a new line).
  246. SkipPastLikelyDeclarationEnd(function_intro_token);
  247. return add_error_function_node();
  248. }
  249. // See if we should parse a definition which is represented as a code block.
  250. if (tokens.GetKind(*position) == TokenKind::OpenCurlyBrace()) {
  251. ParseCodeBlock();
  252. } else if (!ConsumeAndAddLeafNodeIf(TokenKind::Semi(),
  253. ParseNodeKind::DeclarationEnd())) {
  254. llvm::errs() << "ERROR: Function declaration not terminated by a "
  255. "semicolon on line "
  256. << tokens.GetLineNumber(close_paren) << "!\n";
  257. if (tokens.GetLine(*position) == tokens.GetLine(close_paren))
  258. // Only need to skip if we've not already found a new line.
  259. SkipPastLikelyDeclarationEnd(function_intro_token);
  260. return add_error_function_node();
  261. }
  262. // Successfully parsed the function, add that node.
  263. return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
  264. start);
  265. }
  266. auto ParseTree::Parser::ParseEmptyDeclaration() -> Node {
  267. assert(position != end && "Cannot parse past the end!");
  268. return AddLeafNode(ParseNodeKind::EmptyDeclaration(),
  269. Consume(TokenKind::Semi()));
  270. }
  271. auto ParseTree::Parser::ParseDeclaration() -> llvm::Optional<Node> {
  272. assert(position != end && "Cannot parse past the end!");
  273. TokenizedBuffer::Token t = *position;
  274. switch (tokens.GetKind(t)) {
  275. case TokenKind::FnKeyword():
  276. return ParseFunctionDeclaration();
  277. case TokenKind::Semi():
  278. return ParseEmptyDeclaration();
  279. }
  280. // We didn't recognize an introducer for a valid declaration.
  281. llvm::errs() << "ERROR: Unrecognized declaration introducer '"
  282. << tokens.GetTokenText(t) << "' on line "
  283. << tokens.GetLineNumber(t) << "!\n";
  284. // Skip forward past any end of a declaration we simply didn't understand so
  285. // that we can find the start of the next declaration or the end of a scope.
  286. if (auto found_semi_n =
  287. SkipPastLikelyDeclarationEnd(t, /*is_inside_declaration=*/false)) {
  288. MarkNodeError(*found_semi_n);
  289. return *found_semi_n;
  290. }
  291. // Nothing, not even a semicolon found. We still need to mark that an error
  292. // occurred though.
  293. tree.has_errors = true;
  294. return {};
  295. }
  296. } // namespace Carbon