parser_impl.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "parser/parser_impl.h"
  5. #include <cstdlib>
  6. #include "lexer/token_kind.h"
  7. #include "lexer/tokenized_buffer.h"
  8. #include "llvm/ADT/Optional.h"
  9. #include "llvm/Support/raw_ostream.h"
  10. #include "parser/parse_node_kind.h"
  11. #include "parser/parse_tree.h"
  12. namespace Carbon {
  13. struct UnexpectedTokenInFunctionParams
  14. : SimpleDiagnostic<UnexpectedTokenInFunctionParams> {
  15. static constexpr llvm::StringLiteral ShortName = "syntax-error";
  16. static constexpr llvm::StringLiteral Message =
  17. "Unexpected token in function parameter list.";
  18. };
  19. struct UnexpectedTokenInCodeBlock
  20. : SimpleDiagnostic<UnexpectedTokenInCodeBlock> {
  21. static constexpr llvm::StringLiteral ShortName = "syntax-error";
  22. static constexpr llvm::StringLiteral Message =
  23. "Unexpected token in code block.";
  24. };
  25. struct ExpectedFunctionName : SimpleDiagnostic<ExpectedFunctionName> {
  26. static constexpr llvm::StringLiteral ShortName = "syntax-error";
  27. static constexpr llvm::StringLiteral Message =
  28. "Expected function name after `fn` keyword.";
  29. };
  30. struct ExpectedFunctionParams : SimpleDiagnostic<ExpectedFunctionParams> {
  31. static constexpr llvm::StringLiteral ShortName = "syntax-error";
  32. static constexpr llvm::StringLiteral Message =
  33. "Expected `(` after function name.";
  34. };
  35. struct ExpectedFunctionBodyOrSemi
  36. : SimpleDiagnostic<ExpectedFunctionBodyOrSemi> {
  37. static constexpr llvm::StringLiteral ShortName = "syntax-error";
  38. static constexpr llvm::StringLiteral Message =
  39. "Expected function definition or `;` after function declaration.";
  40. };
  41. struct UnrecognizedDeclaration : SimpleDiagnostic<UnrecognizedDeclaration> {
  42. static constexpr llvm::StringLiteral ShortName = "syntax-error";
  43. static constexpr llvm::StringLiteral Message =
  44. "Unrecognized declaration introducer.";
  45. };
  46. ParseTree::Parser::Parser(ParseTree& tree_arg, TokenizedBuffer& tokens_arg,
  47. TokenDiagnosticEmitter& emitter)
  48. : tree(tree_arg),
  49. tokens(tokens_arg),
  50. emitter(emitter),
  51. position(tokens.Tokens().begin()),
  52. end(tokens.Tokens().end()) {
  53. assert(std::find_if(position, end,
  54. [&](TokenizedBuffer::Token t) {
  55. return tokens.GetKind(t) == TokenKind::EndOfFile();
  56. }) != end &&
  57. "No EndOfFileToken in token buffer.");
  58. }
  59. auto ParseTree::Parser::Parse(TokenizedBuffer& tokens,
  60. TokenDiagnosticEmitter& emitter) -> ParseTree {
  61. ParseTree tree(tokens);
  62. // We expect to have a 1:1 correspondence between tokens and tree nodes, so
  63. // reserve the space we expect to need here to avoid allocation and copying
  64. // overhead.
  65. tree.node_impls.reserve(tokens.Size());
  66. Parser parser(tree, tokens, emitter);
  67. while (!parser.AtEndOfFile()) {
  68. parser.ParseDeclaration();
  69. }
  70. parser.AddLeafNode(ParseNodeKind::FileEnd(), *parser.position);
  71. assert(tree.Verify() && "Parse tree built but does not verify!");
  72. return tree;
  73. }
  74. auto ParseTree::Parser::Consume(TokenKind kind) -> TokenizedBuffer::Token {
  75. TokenizedBuffer::Token t = *position;
  76. assert(kind != TokenKind::EndOfFile() && "Cannot consume the EOF token!");
  77. assert(tokens.GetKind(t) == kind && "The current token is the wrong kind!");
  78. ++position;
  79. assert(position != end && "Reached end of tokens without finding EOF token.");
  80. return t;
  81. }
  82. auto ParseTree::Parser::ConsumeIf(TokenKind kind)
  83. -> llvm::Optional<TokenizedBuffer::Token> {
  84. if (tokens.GetKind(*position) != kind) {
  85. return {};
  86. }
  87. return Consume(kind);
  88. }
  89. auto ParseTree::Parser::AddLeafNode(ParseNodeKind kind,
  90. TokenizedBuffer::Token token) -> Node {
  91. Node n(tree.node_impls.size());
  92. tree.node_impls.push_back(NodeImpl(kind, token, /*subtree_size_arg=*/1));
  93. return n;
  94. }
  95. auto ParseTree::Parser::ConsumeAndAddLeafNodeIf(TokenKind t_kind,
  96. ParseNodeKind n_kind)
  97. -> llvm::Optional<Node> {
  98. auto t = ConsumeIf(t_kind);
  99. if (!t) {
  100. return {};
  101. }
  102. return AddLeafNode(n_kind, *t);
  103. }
  104. auto ParseTree::Parser::MarkNodeError(Node n) -> void {
  105. tree.node_impls[n.index].has_error = true;
  106. tree.has_errors = true;
  107. }
  108. // A marker for the start of a node's subtree.
  109. //
  110. // This is used to track the size of the node's subtree and ensure at least one
  111. // parse node is added. It can be used repeatedly if multiple subtrees start at
  112. // the same position.
  113. struct ParseTree::Parser::SubtreeStart {
  114. int tree_size;
  115. bool node_added = false;
  116. ~SubtreeStart() {
  117. assert(node_added && "Never added a node for a subtree region!");
  118. }
  119. };
  120. auto ParseTree::Parser::StartSubtree() -> SubtreeStart {
  121. return {static_cast<int>(tree.node_impls.size())};
  122. }
  123. auto ParseTree::Parser::AddNode(ParseNodeKind n_kind, TokenizedBuffer::Token t,
  124. SubtreeStart& start, bool has_error) -> Node {
  125. // The size of the subtree is the change in size from when we started this
  126. // subtree to now, but including the node we're about to add.
  127. int tree_stop_size = static_cast<int>(tree.node_impls.size()) + 1;
  128. int subtree_size = tree_stop_size - start.tree_size;
  129. Node n(tree.node_impls.size());
  130. tree.node_impls.push_back(NodeImpl(n_kind, t, subtree_size));
  131. if (has_error) {
  132. MarkNodeError(n);
  133. }
  134. start.node_added = true;
  135. return n;
  136. }
  137. auto ParseTree::Parser::SkipMatchingGroup() -> bool {
  138. TokenizedBuffer::Token t = *position;
  139. TokenKind t_kind = tokens.GetKind(t);
  140. if (!t_kind.IsOpeningSymbol()) {
  141. return false;
  142. }
  143. SkipTo(tokens.GetMatchedClosingToken(t));
  144. Consume(t_kind.GetClosingSymbol());
  145. return true;
  146. }
  147. auto ParseTree::Parser::SkipTo(TokenizedBuffer::Token t) -> void {
  148. assert(t >= *position && "Tried to skip backwards.");
  149. position = TokenizedBuffer::TokenIterator(t);
  150. assert(position != end && "Skipped past EOF.");
  151. }
  152. auto ParseTree::Parser::SkipPastLikelyDeclarationEnd(
  153. TokenizedBuffer::Token skip_root, bool is_inside_declaration)
  154. -> llvm::Optional<Node> {
  155. if (AtEndOfFile()) {
  156. return {};
  157. }
  158. TokenizedBuffer::Line root_line = tokens.GetLine(skip_root);
  159. int root_line_indent = tokens.GetIndentColumnNumber(root_line);
  160. // We will keep scanning through tokens on the same line as the root or
  161. // lines with greater indentation than root's line.
  162. auto is_same_line_or_indent_greater_than_root =
  163. [&](TokenizedBuffer::Token t) {
  164. TokenizedBuffer::Line l = tokens.GetLine(t);
  165. if (l == root_line) {
  166. return true;
  167. }
  168. return tokens.GetIndentColumnNumber(l) > root_line_indent;
  169. };
  170. do {
  171. TokenKind current_kind = tokens.GetKind(*position);
  172. if (current_kind == TokenKind::CloseCurlyBrace()) {
  173. // Immediately bail out if we hit an unmatched close curly, this will
  174. // pop us up a level of the syntax grouping.
  175. return {};
  176. }
  177. // If we find a semicolon, parse it and add a corresponding node. If we're
  178. // inside of a declaration, this is a declaration ending semicolon,
  179. // otherwise it simply forms an empty declaration.
  180. if (auto end_node = ConsumeAndAddLeafNodeIf(
  181. TokenKind::Semi(), is_inside_declaration
  182. ? ParseNodeKind::DeclarationEnd()
  183. : ParseNodeKind::EmptyDeclaration())) {
  184. return end_node;
  185. }
  186. // Skip over any matching group of tokens.
  187. if (SkipMatchingGroup()) {
  188. continue;
  189. }
  190. // Otherwise just step forward one token.
  191. Consume(current_kind);
  192. } while (!AtEndOfFile() &&
  193. is_same_line_or_indent_greater_than_root(*position));
  194. return {};
  195. }
  196. auto ParseTree::Parser::ParseFunctionSignature() -> Node {
  197. TokenizedBuffer::Token open_paren = Consume(TokenKind::OpenParen());
  198. auto start = StartSubtree();
  199. // FIXME: Add support for parsing parameters.
  200. bool has_errors = false;
  201. if (tokens.GetKind(*position) != TokenKind::CloseParen()) {
  202. emitter.EmitError<UnexpectedTokenInFunctionParams>(*position);
  203. has_errors = true;
  204. // We can trivially skip to the actual close parenthesis from here.
  205. SkipTo(tokens.GetMatchedClosingToken(open_paren));
  206. }
  207. AddLeafNode(ParseNodeKind::ParameterListEnd(),
  208. Consume(TokenKind::CloseParen()));
  209. // FIXME: Implement parsing of a return type.
  210. return AddNode(ParseNodeKind::ParameterList(), open_paren, start, has_errors);
  211. }
  212. auto ParseTree::Parser::ParseCodeBlock() -> Node {
  213. TokenizedBuffer::Token open_curly = Consume(TokenKind::OpenCurlyBrace());
  214. auto start = StartSubtree();
  215. bool has_errors = false;
  216. // Loop over all the different possibly nested elements in the code block.
  217. for (;;) {
  218. switch (tokens.GetKind(*position)) {
  219. default:
  220. // FIXME: Add support for parsing more expressions & statements.
  221. emitter.EmitError<UnexpectedTokenInCodeBlock>(*position);
  222. has_errors = true;
  223. // We can trivially skip to the actual close curly brace from here.
  224. SkipTo(tokens.GetMatchedClosingToken(open_curly));
  225. // Now fall through to the close curly brace handling code.
  226. LLVM_FALLTHROUGH;
  227. case TokenKind::CloseCurlyBrace():
  228. break;
  229. case TokenKind::OpenCurlyBrace():
  230. // FIXME: We should consider avoiding recursion here with some side
  231. // stack.
  232. ParseCodeBlock();
  233. continue;
  234. }
  235. // We only continue looping with `continue` above.
  236. break;
  237. }
  238. // We always reach here having set our position in the token stream to the
  239. // close curly brace.
  240. AddLeafNode(ParseNodeKind::CodeBlockEnd(),
  241. Consume(TokenKind::CloseCurlyBrace()));
  242. return AddNode(ParseNodeKind::CodeBlock(), open_curly, start, has_errors);
  243. }
  244. auto ParseTree::Parser::ParseFunctionDeclaration() -> Node {
  245. TokenizedBuffer::Token function_intro_token = Consume(TokenKind::FnKeyword());
  246. auto start = StartSubtree();
  247. auto add_error_function_node = [&] {
  248. return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
  249. start, /*has_error=*/true);
  250. };
  251. auto name_n = ConsumeAndAddLeafNodeIf(TokenKind::Identifier(),
  252. ParseNodeKind::Identifier());
  253. if (!name_n) {
  254. emitter.EmitError<ExpectedFunctionName>(*position);
  255. // FIXME: We could change the lexer to allow us to synthesize certain
  256. // kinds of tokens and try to "recover" here, but unclear that this is
  257. // really useful.
  258. SkipPastLikelyDeclarationEnd(function_intro_token);
  259. return add_error_function_node();
  260. }
  261. TokenizedBuffer::Token open_paren = *position;
  262. if (tokens.GetKind(open_paren) != TokenKind::OpenParen()) {
  263. emitter.EmitError<ExpectedFunctionParams>(open_paren);
  264. SkipPastLikelyDeclarationEnd(function_intro_token);
  265. return add_error_function_node();
  266. }
  267. TokenizedBuffer::Token close_paren =
  268. tokens.GetMatchedClosingToken(open_paren);
  269. Node signature_n = ParseFunctionSignature();
  270. assert(*std::prev(position) == close_paren &&
  271. "Should have parsed through the close paren, whether successfully "
  272. "or with errors.");
  273. if (tree.node_impls[signature_n.index].has_error) {
  274. // Don't try to parse more of the function declaration, but consume a
  275. // declaration ending semicolon if found (without going to a new line).
  276. SkipPastLikelyDeclarationEnd(function_intro_token);
  277. return add_error_function_node();
  278. }
  279. // See if we should parse a definition which is represented as a code block.
  280. if (tokens.GetKind(*position) == TokenKind::OpenCurlyBrace()) {
  281. ParseCodeBlock();
  282. } else if (!ConsumeAndAddLeafNodeIf(TokenKind::Semi(),
  283. ParseNodeKind::DeclarationEnd())) {
  284. emitter.EmitError<ExpectedFunctionBodyOrSemi>(*position);
  285. if (tokens.GetLine(*position) == tokens.GetLine(close_paren)) {
  286. // Only need to skip if we've not already found a new line.
  287. SkipPastLikelyDeclarationEnd(function_intro_token);
  288. }
  289. return add_error_function_node();
  290. }
  291. // Successfully parsed the function, add that node.
  292. return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
  293. start);
  294. }
  295. auto ParseTree::Parser::ParseEmptyDeclaration() -> Node {
  296. return AddLeafNode(ParseNodeKind::EmptyDeclaration(),
  297. Consume(TokenKind::Semi()));
  298. }
  299. auto ParseTree::Parser::ParseDeclaration() -> llvm::Optional<Node> {
  300. TokenizedBuffer::Token t = *position;
  301. switch (tokens.GetKind(t)) {
  302. case TokenKind::FnKeyword():
  303. return ParseFunctionDeclaration();
  304. case TokenKind::Semi():
  305. return ParseEmptyDeclaration();
  306. case TokenKind::EndOfFile():
  307. return llvm::None;
  308. default:
  309. // Errors are handled outside the switch.
  310. break;
  311. }
  312. // We didn't recognize an introducer for a valid declaration.
  313. emitter.EmitError<UnrecognizedDeclaration>(t);
  314. // Skip forward past any end of a declaration we simply didn't understand so
  315. // that we can find the start of the next declaration or the end of a scope.
  316. if (auto found_semi_n =
  317. SkipPastLikelyDeclarationEnd(t, /*is_inside_declaration=*/false)) {
  318. MarkNodeError(*found_semi_n);
  319. return *found_semi_n;
  320. }
  321. // Nothing, not even a semicolon found. We still need to mark that an error
  322. // occurred though.
  323. tree.has_errors = true;
  324. return {};
  325. }
  326. } // namespace Carbon