parser_impl.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "parser/parser_impl.h"
  5. #include <cstdlib>
  6. #include "lexer/token_kind.h"
  7. #include "lexer/tokenized_buffer.h"
  8. #include "llvm/ADT/Optional.h"
  9. #include "llvm/Support/raw_ostream.h"
  10. #include "parser/parse_node_kind.h"
  11. #include "parser/parse_tree.h"
  12. namespace Carbon {
  13. auto ParseTree::Parser::Parse(TokenizedBuffer& tokens,
  14. DiagnosticEmitter& /*unused*/) -> ParseTree {
  15. ParseTree tree(tokens);
  16. // We expect to have a 1:1 correspondence between tokens and tree nodes, so
  17. // reserve the space we expect to need here to avoid allocation and copying
  18. // overhead.
  19. tree.node_impls.reserve(tokens.Size());
  20. Parser parser(tree, tokens);
  21. while (parser.position != parser.end) {
  22. parser.ParseDeclaration();
  23. }
  24. assert(tree.Verify() && "Parse tree built but does not verify!");
  25. return tree;
  26. }
  27. auto ParseTree::Parser::Consume(TokenKind kind) -> TokenizedBuffer::Token {
  28. TokenizedBuffer::Token t = *position;
  29. assert(tokens.GetKind(t) == kind && "The current token is the wrong kind!");
  30. ++position;
  31. return t;
  32. }
  33. auto ParseTree::Parser::ConsumeIf(TokenKind kind)
  34. -> llvm::Optional<TokenizedBuffer::Token> {
  35. if (tokens.GetKind(*position) != kind) {
  36. return {};
  37. }
  38. return *position++;
  39. }
  40. auto ParseTree::Parser::AddLeafNode(ParseNodeKind kind,
  41. TokenizedBuffer::Token token) -> Node {
  42. Node n(tree.node_impls.size());
  43. tree.node_impls.push_back(NodeImpl(kind, token, /*subtree_size_arg=*/1));
  44. return n;
  45. }
  46. auto ParseTree::Parser::ConsumeAndAddLeafNodeIf(TokenKind t_kind,
  47. ParseNodeKind n_kind)
  48. -> llvm::Optional<Node> {
  49. auto t = ConsumeIf(t_kind);
  50. if (!t) {
  51. return {};
  52. }
  53. return AddLeafNode(n_kind, *t);
  54. }
  55. auto ParseTree::Parser::MarkNodeError(Node n) -> void {
  56. tree.node_impls[n.index].has_error = true;
  57. tree.has_errors = true;
  58. }
  59. // A marker for the start of a node's subtree.
  60. //
  61. // This is used to track the size of the node's subtree and ensure at least one
  62. // parse node is added. It can be used repeatedly if multiple subtrees start at
  63. // the same position.
  64. struct ParseTree::Parser::SubtreeStart {
  65. int tree_size;
  66. bool node_added = false;
  67. ~SubtreeStart() {
  68. assert(node_added && "Never added a node for a subtree region!");
  69. }
  70. };
  71. auto ParseTree::Parser::StartSubtree() -> SubtreeStart {
  72. return {static_cast<int>(tree.node_impls.size())};
  73. }
  74. auto ParseTree::Parser::AddNode(ParseNodeKind n_kind, TokenizedBuffer::Token t,
  75. SubtreeStart& start, bool has_error) -> Node {
  76. // The size of the subtree is the change in size from when we started this
  77. // subtree to now, but including the node we're about to add.
  78. int tree_stop_size = static_cast<int>(tree.node_impls.size()) + 1;
  79. int subtree_size = tree_stop_size - start.tree_size;
  80. Node n(tree.node_impls.size());
  81. tree.node_impls.push_back(NodeImpl(n_kind, t, subtree_size));
  82. if (has_error) {
  83. MarkNodeError(n);
  84. }
  85. start.node_added = true;
  86. return n;
  87. }
  88. auto ParseTree::Parser::SkipMatchingGroup() -> bool {
  89. assert(position != end && "Cannot skip at the end!");
  90. TokenizedBuffer::Token t = *position;
  91. TokenKind t_kind = tokens.GetKind(t);
  92. if (!t_kind.IsOpeningSymbol()) {
  93. return false;
  94. }
  95. position = std::next(
  96. TokenizedBuffer::TokenIterator(tokens.GetMatchedClosingToken(t)));
  97. return true;
  98. }
  99. auto ParseTree::Parser::SkipPastLikelyDeclarationEnd(
  100. TokenizedBuffer::Token skip_root, bool is_inside_declaration)
  101. -> llvm::Optional<Node> {
  102. if (position == end) {
  103. return {};
  104. }
  105. TokenizedBuffer::Line root_line = tokens.GetLine(skip_root);
  106. int root_line_indent = tokens.GetIndentColumnNumber(root_line);
  107. // We will keep scanning through tokens on the same line as the root or
  108. // lines with greater indentation than root's line.
  109. auto is_same_line_or_indent_greater_than_root =
  110. [&](TokenizedBuffer::Token t) {
  111. TokenizedBuffer::Line l = tokens.GetLine(t);
  112. if (l == root_line) {
  113. return true;
  114. }
  115. return tokens.GetIndentColumnNumber(l) > root_line_indent;
  116. };
  117. do {
  118. TokenKind current_kind = tokens.GetKind(*position);
  119. if (current_kind == TokenKind::CloseCurlyBrace()) {
  120. // Immediately bail out if we hit an unmatched close curly, this will
  121. // pop us up a level of the syntax grouping.
  122. return {};
  123. }
  124. // If we find a semicolon, we want to parse it to end the declaration.
  125. if (current_kind == TokenKind::Semi()) {
  126. TokenizedBuffer::Token semi = *position++;
  127. // Add a node for the semicolon. If we're inside of a declaration, this
  128. // is a declaration ending semicolon, otherwise it simply forms an empty
  129. // declaration.
  130. return AddLeafNode(is_inside_declaration
  131. ? ParseNodeKind::DeclarationEnd()
  132. : ParseNodeKind::EmptyDeclaration(),
  133. semi);
  134. }
  135. // Skip over any matching group of tokens.
  136. if (SkipMatchingGroup()) {
  137. continue;
  138. }
  139. // Otherwise just step forward one token.
  140. ++position;
  141. } while (position != end &&
  142. is_same_line_or_indent_greater_than_root(*position));
  143. return {};
  144. }
  145. auto ParseTree::Parser::ParseFunctionSignature() -> Node {
  146. assert(position != end && "Cannot parse past the end!");
  147. TokenizedBuffer::Token open_paren = Consume(TokenKind::OpenParen());
  148. assert(position != end &&
  149. "The lexer ensures we always have a closing paren!");
  150. auto start = StartSubtree();
  151. // FIXME: Add support for parsing parameters.
  152. bool has_errors = false;
  153. auto close_paren = ConsumeIf(TokenKind::CloseParen());
  154. if (!close_paren) {
  155. llvm::errs() << "ERROR: unexpected token before the close of the "
  156. "parameters on line "
  157. << tokens.GetLineNumber(*position) << "!\n";
  158. has_errors = true;
  159. // We can trivially skip to the actual close parenthesis from here.
  160. close_paren = tokens.GetMatchedClosingToken(open_paren);
  161. position = std::next(TokenizedBuffer::TokenIterator(*close_paren));
  162. }
  163. AddLeafNode(ParseNodeKind::ParameterListEnd(), *close_paren);
  164. // FIXME: Implement parsing of a return type.
  165. return AddNode(ParseNodeKind::ParameterList(), open_paren, start, has_errors);
  166. }
  167. auto ParseTree::Parser::ParseCodeBlock() -> Node {
  168. assert(position != end && "Cannot parse past the end!");
  169. TokenizedBuffer::Token open_curly = Consume(TokenKind::OpenCurlyBrace());
  170. assert(position != end &&
  171. "The lexer ensures we always have a closing curly!");
  172. auto start = StartSubtree();
  173. bool has_errors = false;
  174. // Loop over all the different possibly nested elements in the code block.
  175. for (;;) {
  176. switch (tokens.GetKind(*position)) {
  177. default:
  178. // FIXME: Add support for parsing more expressions & statements.
  179. llvm::errs() << "ERROR: unexpected token before the close of the "
  180. "function definition on line "
  181. << tokens.GetLineNumber(*position) << "!\n";
  182. has_errors = true;
  183. // We can trivially skip to the actual close curly brace from here.
  184. position = TokenizedBuffer::TokenIterator(
  185. tokens.GetMatchedClosingToken(open_curly));
  186. // Now fall through to the close curly brace handling code.
  187. LLVM_FALLTHROUGH;
  188. case TokenKind::CloseCurlyBrace():
  189. break;
  190. case TokenKind::OpenCurlyBrace():
  191. // FIXME: We should consider avoiding recursion here with some side
  192. // stack.
  193. ParseCodeBlock();
  194. continue;
  195. }
  196. // We only continue looping with `continue` above.
  197. break;
  198. }
  199. // We always reach here having set our position in the token stream to the
  200. // close curly brace.
  201. AddLeafNode(ParseNodeKind::CodeBlockEnd(),
  202. Consume(TokenKind::CloseCurlyBrace()));
  203. return AddNode(ParseNodeKind::CodeBlock(), open_curly, start, has_errors);
  204. }
  205. auto ParseTree::Parser::ParseFunctionDeclaration() -> Node {
  206. assert(position != end && "Cannot parse past the end!");
  207. TokenizedBuffer::Token function_intro_token = Consume(TokenKind::FnKeyword());
  208. auto start = StartSubtree();
  209. auto add_error_function_node = [&] {
  210. return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
  211. start, /*has_error=*/true);
  212. };
  213. if (position == end) {
  214. llvm::errs() << "ERROR: File ended with a function introducer on line "
  215. << tokens.GetLineNumber(function_intro_token) << "!\n";
  216. return add_error_function_node();
  217. }
  218. auto name_n = ConsumeAndAddLeafNodeIf(TokenKind::Identifier(),
  219. ParseNodeKind::Identifier());
  220. if (!name_n) {
  221. llvm::errs() << "ERROR: Function declaration with no name on line "
  222. << tokens.GetLineNumber(function_intro_token) << "!\n";
  223. // FIXME: We could change the lexer to allow us to synthesize certain
  224. // kinds of tokens and try to "recover" here, but unclear that this is
  225. // really useful.
  226. SkipPastLikelyDeclarationEnd(function_intro_token);
  227. return add_error_function_node();
  228. }
  229. if (position == end) {
  230. llvm::errs() << "ERROR: File ended after a function introducer and "
  231. "identifier on line "
  232. << tokens.GetLineNumber(function_intro_token) << "!\n";
  233. return add_error_function_node();
  234. }
  235. TokenizedBuffer::Token open_paren = *position;
  236. if (tokens.GetKind(open_paren) != TokenKind::OpenParen()) {
  237. llvm::errs()
  238. << "ERROR: Missing open parentheses in declaration of function '"
  239. << tokens.GetTokenText(tree.GetNodeToken(*name_n)) << "' on line "
  240. << tokens.GetLineNumber(function_intro_token) << "!\n";
  241. SkipPastLikelyDeclarationEnd(function_intro_token);
  242. return add_error_function_node();
  243. }
  244. assert(std::next(position) != end &&
  245. "Unbalanced parentheses should be rejected by the lexer.");
  246. TokenizedBuffer::Token close_paren =
  247. tokens.GetMatchedClosingToken(open_paren);
  248. Node signature_n = ParseFunctionSignature();
  249. assert(*std::prev(position) == close_paren &&
  250. "Should have parsed through the close paren, whether successfully "
  251. "or with errors.");
  252. if (tree.node_impls[signature_n.index].has_error) {
  253. // Don't try to parse more of the function declaration, but consume a
  254. // declaration ending semicolon if found (without going to a new line).
  255. SkipPastLikelyDeclarationEnd(function_intro_token);
  256. return add_error_function_node();
  257. }
  258. // See if we should parse a definition which is represented as a code block.
  259. if (tokens.GetKind(*position) == TokenKind::OpenCurlyBrace()) {
  260. ParseCodeBlock();
  261. } else if (!ConsumeAndAddLeafNodeIf(TokenKind::Semi(),
  262. ParseNodeKind::DeclarationEnd())) {
  263. llvm::errs() << "ERROR: Function declaration not terminated by a "
  264. "semicolon on line "
  265. << tokens.GetLineNumber(close_paren) << "!\n";
  266. if (tokens.GetLine(*position) == tokens.GetLine(close_paren)) {
  267. // Only need to skip if we've not already found a new line.
  268. SkipPastLikelyDeclarationEnd(function_intro_token);
  269. }
  270. return add_error_function_node();
  271. }
  272. // Successfully parsed the function, add that node.
  273. return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
  274. start);
  275. }
  276. auto ParseTree::Parser::ParseEmptyDeclaration() -> Node {
  277. assert(position != end && "Cannot parse past the end!");
  278. return AddLeafNode(ParseNodeKind::EmptyDeclaration(),
  279. Consume(TokenKind::Semi()));
  280. }
  281. auto ParseTree::Parser::ParseDeclaration() -> llvm::Optional<Node> {
  282. assert(position != end && "Cannot parse past the end!");
  283. TokenizedBuffer::Token t = *position;
  284. switch (tokens.GetKind(t)) {
  285. case TokenKind::FnKeyword():
  286. return ParseFunctionDeclaration();
  287. case TokenKind::Semi():
  288. return ParseEmptyDeclaration();
  289. default:
  290. // Errors are handled outside the switch.
  291. break;
  292. }
  293. // We didn't recognize an introducer for a valid declaration.
  294. llvm::errs() << "ERROR: Unrecognized declaration introducer '"
  295. << tokens.GetTokenText(t) << "' on line "
  296. << tokens.GetLineNumber(t) << "!\n";
  297. // Skip forward past any end of a declaration we simply didn't understand so
  298. // that we can find the start of the next declaration or the end of a scope.
  299. if (auto found_semi_n =
  300. SkipPastLikelyDeclarationEnd(t, /*is_inside_declaration=*/false)) {
  301. MarkNodeError(*found_semi_n);
  302. return *found_semi_n;
  303. }
  304. // Nothing, not even a semicolon found. We still need to mark that an error
  305. // occurred though.
  306. tree.has_errors = true;
  307. return {};
  308. }
  309. } // namespace Carbon