Преглед на файлове

Checkpoint for a parser rewrite (#2364)

The intent of this approach is to eliminate recursion limits as a barrier for the parser. While it may not be urgent to address, I want to avoid pouring effort into a parser approach that we don't think will be usable long-term.

Right now this is passing a minor set of tests. It's intended to be enough to show how I'm thinking about flow control for the parser. I'm manually switching back and forth because it seemed like the easiest approach that avoids duplicating tests.
Jon Ross-Perkins преди 3 години
родител
ревизия
9107916b11

+ 14 - 1
bazel/testing/lit.bzl

@@ -22,14 +22,27 @@ def glob_lit_tests(driver, data, test_file_exts, **kwargs):
         exclude_directories = 1,
     )
     data.append("@llvm-project//llvm:lit")
+    suites = dict()
     for f in test_files:
         if f.split(".")[-1] not in test_file_exts:
             continue
+        test = "%s.test" % f
         native.py_test(
-            name = "%s.test" % f,
+            name = test,
             srcs = ["//bazel/testing:lit_test.py"],
             main = "//bazel/testing:lit_test.py",
             data = data + [driver, f],
             args = ["--package_name=%s" % native.package_name(), "--"],
             **kwargs
         )
+
+        # Cluster tests by directory in order to produce suites. For example,
+        # foo/bar/baz.carbon.test is added to suites :foo and :foo/bar.
+        dirs = f.split("/")[:-1]
+        for num_parts in range(1, 1 + len(dirs)):
+            dir = "/".join(dirs[:num_parts])
+            if dir not in suites:
+                suites[dir] = []
+            suites[dir].append(test)
+    for suite, tests in suites.items():
+        native.test_suite(name = suite, tests = tests)

+ 14 - 0
toolchain/parser/BUILD

@@ -17,6 +17,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "parser_state",
+    srcs = ["parser_state.cpp"],
+    hdrs = ["parser_state.h"],
+    textual_hdrs = ["parser_state.def"],  # X-macro list; #include'd after CARBON_PARSER_STATE is defined.
+    deps = [
+        "//common:ostream",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
 cc_test(
     name = "parse_node_kind_test",
     size = "small",
@@ -35,10 +46,13 @@ cc_library(
         "parse_tree.cpp",
         "parser_impl.cpp",
         "parser_impl.h",
+        "parser2.cpp",
+        "parser2.h",
     ],
     hdrs = ["parse_tree.h"],
     deps = [
         ":parse_node_kind",
+        ":parser_state",
         ":precedence",
         "//common:check",
         "//common:ostream",

+ 3 - 0
toolchain/parser/parse_tree.cpp

@@ -16,6 +16,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "toolchain/lexer/token_kind.h"
 #include "toolchain/parser/parse_node_kind.h"
+#include "toolchain/parser/parser2.h"
 #include "toolchain/parser/parser_impl.h"
 
 namespace Carbon {
@@ -27,6 +28,8 @@ auto ParseTree::Parse(TokenizedBuffer& tokens, DiagnosticConsumer& consumer)
   TokenDiagnosticEmitter emitter(translator, consumer);
 
   // Delegate to the parser.
+  // TODO: Edit this to swap between Parser and Parser2. This is manual in order
+  // to avoid test duplication.
   return Parser::Parse(tokens, emitter);
 }
 

+ 10 - 0
toolchain/parser/parse_tree.h

@@ -145,6 +145,7 @@ class ParseTree {
  private:
   class Parser;
   friend Parser;
+  friend class Parser2;
 
   // The in-memory representation of data used for a particular node in the
   // tree.
@@ -153,6 +154,15 @@ class ParseTree {
                       int subtree_size_arg)
         : kind(k), token(t), subtree_size(subtree_size_arg) {}
 
+    // TODO: Parser2 only uses this constructor. The other one can be removed
+    // if we switch.
+    NodeImpl(ParseNodeKind kind, bool has_error, TokenizedBuffer::Token token,
+             int subtree_size)
+        : kind(kind),
+          has_error(has_error),
+          token(token),
+          subtree_size(subtree_size) {}
+
     // The kind of this node. Note that this is only a single byte.
     ParseNodeKind kind;
 

+ 276 - 0
toolchain/parser/parser2.cpp

@@ -0,0 +1,276 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "toolchain/parser/parser2.h"
+
+#include <cstdlib>
+#include <memory>
+
+#include "common/check.h"
+#include "llvm/ADT/Optional.h"
+#include "toolchain/lexer/token_kind.h"
+#include "toolchain/lexer/tokenized_buffer.h"
+#include "toolchain/parser/parse_node_kind.h"
+#include "toolchain/parser/parse_tree.h"
+
+namespace Carbon {
+
+// Sets up a parser that reads `tokens_arg` and appends nodes to `tree_arg`.
+//
+// The buffer must be non-empty and must finish with an EndOfFile token; both
+// conditions are checked. Afterwards `end_` refers to that EndOfFile token
+// rather than to the one-past-the-end position.
+Parser2::Parser2(ParseTree& tree_arg, TokenizedBuffer& tokens_arg,
+                 TokenDiagnosticEmitter& emitter)
+    : tree_(tree_arg),
+      tokens_(tokens_arg),
+      emitter_(emitter),
+      position_(tokens_.tokens().begin()),
+      end_(tokens_.tokens().end()) {
+  CARBON_CHECK(position_ != end_) << "Empty TokenizedBuffer";
+
+  // Step back from one-past-the-end so that `end_` names the last token.
+  --end_;
+  const TokenKind last_kind = tokens_.GetKind(*end_);
+  CARBON_CHECK(last_kind == TokenKind::EndOfFile())
+      << "TokenizedBuffer should end with EndOfFile, ended with "
+      << last_kind.Name();
+}
+
+// Appends a childless node for `token`, so its subtree is just the node
+// itself. A node flagged with `has_error` also marks the whole tree as
+// containing errors.
+auto Parser2::AddLeafNode(ParseNodeKind kind, TokenizedBuffer::Token token,
+                          bool has_error) -> void {
+  ParseTree::NodeImpl leaf(kind, has_error, token, /*subtree_size=*/1);
+  tree_.node_impls_.push_back(leaf);
+  if (!has_error) {
+    return;
+  }
+  tree_.has_errors_ = true;
+}
+
+// Appends a parent node covering everything added to the tree since
+// `subtree_start`. A node flagged with `has_error` also marks the whole tree
+// as containing errors.
+auto Parser2::AddNode(ParseNodeKind kind, TokenizedBuffer::Token token,
+                      int subtree_start, bool has_error) -> void {
+  // Every node appended since `subtree_start`, plus the new parent itself.
+  const int nodes_in_subtree = tree_.size() - subtree_start + 1;
+  ParseTree::NodeImpl parent(kind, has_error, token, nodes_in_subtree);
+  tree_.node_impls_.push_back(parent);
+  if (!has_error) {
+    return;
+  }
+  tree_.has_errors_ = true;
+}
+
+// Consumes the current token when it is a `token_kind` and records it as a
+// `node_kind` leaf. Returns whether a token was consumed.
+auto Parser2::ConsumeAndAddLeafNodeIf(TokenKind token_kind,
+                                      ParseNodeKind node_kind) -> bool {
+  if (auto consumed = ConsumeIf(token_kind)) {
+    AddLeafNode(node_kind, *consumed);
+    return true;
+  }
+  return false;
+}
+
+// Returns the current token and advances past it when it has the requested
+// kind; otherwise leaves the position alone and returns an empty optional.
+auto Parser2::ConsumeIf(TokenKind kind)
+    -> llvm::Optional<TokenizedBuffer::Token> {
+  llvm::Optional<TokenizedBuffer::Token> consumed;
+  if (PositionIs(kind)) {
+    consumed = *position_;
+    ++position_;
+  }
+  return consumed;
+}
+
+auto Parser2::Parse() -> void {
+  PushState(ParserState::Declaration());  // Start in the Declaration state.
+  while (!state_stack_.empty()) {  // Run until every pushed state has popped.
+    switch (state_stack_.back().state) {  // Dispatch on the innermost state.
+#define CARBON_PARSER_STATE(Name) \
+  case ParserState::Name():       \
+    Handle##Name##State();        \
+    break;
+#include "toolchain/parser/parser_state.def"
+    }
+  }
+
+  AddLeafNode(ParseNodeKind::FileEnd(), *position_);  // Last node appended.
+}
+
+// When positioned on an opening symbol, moves just past its matched closing
+// symbol and returns true. Otherwise does nothing and returns false.
+auto Parser2::SkipMatchingGroup() -> bool {
+  const bool at_group_start = PositionKind().IsOpeningSymbol();
+  if (at_group_start) {
+    const TokenizedBuffer::Token closer =
+        tokens_.GetMatchedClosingToken(*position_);
+    SkipTo(closer);
+    // Step over the closing symbol itself.
+    ++position_;
+  }
+  return at_group_start;
+}
+
+auto Parser2::SkipPastLikelyEnd(TokenizedBuffer::Token skip_root)
+    -> llvm::Optional<TokenizedBuffer::Token> {
+  CARBON_CHECK(position_ < end_);  // Callers must not start at EndOfFile.
+
+  TokenizedBuffer::Line root_line = tokens_.GetLine(skip_root);
+  int root_line_indent = tokens_.GetIndentColumnNumber(root_line);
+
+  // We will keep scanning through tokens on the same line as the root or
+  // lines with greater indentation than root's line.
+  auto is_same_line_or_indent_greater_than_root =
+      [&](TokenizedBuffer::Token t) {
+        TokenizedBuffer::Line l = tokens_.GetLine(t);
+        if (l == root_line) {
+          return true;
+        }
+
+        return tokens_.GetIndentColumnNumber(l) > root_line_indent;
+      };
+
+  do {
+    if (PositionIs(TokenKind::CloseCurlyBrace())) {
+      // Immediately bail out if we hit an unmatched close curly; this will
+      // pop us up a level of the syntax grouping.
+      return llvm::None;
+    }
+
+    // We assume that a semicolon is always intended to be the end of the
+    // current construct.
+    if (auto semi = ConsumeIf(TokenKind::Semi())) {
+      return semi;
+    }
+
+    // Skip over any matching group of tokens.
+    if (SkipMatchingGroup()) {
+      continue;
+    }
+
+    // Otherwise just step forward one token.
+    ++position_;
+  } while (position_ != end_ &&
+           is_same_line_or_indent_greater_than_root(*position_));
+
+  return llvm::None;  // Reached EndOfFile or a shallower line; no semicolon.
+}
+
+// Moves the current position to token `t`. The target must lie strictly
+// ahead of the current token and must not be the EndOfFile token.
+auto Parser2::SkipTo(TokenizedBuffer::Token t) -> void {
+  const bool moves_forward = t > *position_;
+  CARBON_CHECK(moves_forward) << "Tried to skip backwards.";
+
+  const TokenizedBuffer::TokenIterator target(t);
+  position_ = target;
+  CARBON_CHECK(position_ != end_) << "Skipped past EOF.";
+}
+
+// Handles the sequence of declarations in a file.
+//
+// Empty declarations and unrecognized tokens are dealt with in place. A `fn`
+// token pushes the FunctionIntroducer state and returns so that the main loop
+// can dispatch to it; reaching EndOfFile pops this state.
+auto Parser2::HandleDeclarationState() -> void {
+  do {
+    if (PositionIs(TokenKind::EndOfFile())) {
+      state_stack_.pop_back();
+      return;
+    }
+
+    if (PositionIs(TokenKind::Fn())) {
+      // Push first so that the new state's start token is the `fn` token.
+      PushState(ParserState::FunctionIntroducer());
+      AddLeafNode(ParseNodeKind::FunctionIntroducer(), *position_);
+      ++position_;
+      return;
+    }
+
+    if (PositionIs(TokenKind::Semi())) {
+      AddLeafNode(ParseNodeKind::EmptyDeclaration(), *position_);
+      ++position_;
+      continue;
+    }
+
+    // Anything else cannot start a declaration: diagnose it and try to
+    // resynchronize at the likely end of the broken construct.
+    CARBON_DIAGNOSTIC(UnrecognizedDeclaration, Error,
+                      "Unrecognized declaration introducer.");
+    emitter_.Emit(*position_, UnrecognizedDeclaration);
+    tree_.has_errors_ = true;
+    const auto recovered_semi = SkipPastLikelyEnd(*position_);
+    if (recovered_semi) {
+      AddLeafNode(ParseNodeKind::EmptyDeclaration(), *recovered_semi,
+                  /*has_error=*/true);
+    }
+  } while (position_ < end_);
+}
+
+// Finishes a malformed function as an erroneous FunctionDeclaration node and
+// pops the function's state.
+//
+// The node is attached to the semicolon that ends the broken declaration when
+// `skip_past_likely_end` is set and recovery finds one; otherwise it is
+// attached to the state's start token.
+auto Parser2::HandleFunctionError(bool skip_past_likely_end) -> void {
+  auto token = state_stack_.back().start_token;
+  // SkipPastLikelyEnd checks that it is not called at EndOfFile, so there is
+  // nothing to skip in that case and the start token is kept.
+  if (skip_past_likely_end && position_ < end_) {
+    // Use the returned semicolon: by the time SkipPastLikelyEnd returns,
+    // `position_` has already moved on to the token after it.
+    if (auto semi = SkipPastLikelyEnd(token)) {
+      token = *semi;
+    }
+  }
+  AddNode(ParseNodeKind::FunctionDeclaration(), token,
+          state_stack_.back().subtree_start,
+          /*has_error=*/true);
+  state_stack_.pop_back();
+}
+
+// Handles the part of a function that follows the `fn` keyword: the name and
+// the opening parenthesis of the parameter list.
+auto Parser2::HandleFunctionIntroducerState() -> void {
+  const bool has_name = ConsumeAndAddLeafNodeIf(TokenKind::Identifier(),
+                                                ParseNodeKind::DeclaredName());
+  if (!has_name) {
+    CARBON_DIAGNOSTIC(ExpectedFunctionName, Error,
+                      "Expected function name after `fn` keyword.");
+    emitter_.Emit(*position_, ExpectedFunctionName);
+    // TODO: We could change the lexer to allow us to synthesize certain
+    // kinds of tokens and try to "recover" here, but unclear that this is
+    // really useful.
+    HandleFunctionError(/*skip_past_likely_end=*/true);
+    return;
+  }
+
+  if (PositionIs(TokenKind::OpenParen())) {
+    // The parameter list is parsed as its own subtree. Retarget this entry so
+    // that function parsing resumes once the parameter list state pops.
+    state_stack_.back().state = ParserState::FunctionParameterListDone();
+    PushState(ParserState::FunctionParameterList());
+    // Advance past the open parenthesis before continuing.
+    // TODO: When swapping () start/end, this should AddNode the open before
+    // continuing.
+    ++position_;
+    return;
+  }
+
+  CARBON_DIAGNOSTIC(ExpectedFunctionParams, Error,
+                    "Expected `(` after function name.");
+  emitter_.Emit(*position_, ExpectedFunctionParams);
+  HandleFunctionError(/*skip_past_likely_end=*/true);
+}
+
+// Handles the contents of a function's parameter list, up to and including
+// the closing parenthesis, then pops this state.
+//
+// Only empty lists are accepted so far. Anything else is diagnosed and
+// skipped, and the ParameterListEnd node is marked as an error.
+auto Parser2::HandleFunctionParameterListState() -> void {
+  // TODO: Handle non-empty lists.
+  const bool has_error = !PositionIs(TokenKind::CloseParen());
+  if (has_error) {
+    // NOTE(review): this message is shared with the missing-`(` case even
+    // though the `(` has already been consumed here -- confirm the wording.
+    CARBON_DIAGNOSTIC(ExpectedFunctionParams, Error,
+                      "Expected `(` after function name.");
+    emitter_.Emit(*position_, ExpectedFunctionParams);
+    // The state's start token is the opening parenthesis; jump to its match.
+    SkipTo(tokens_.GetMatchedClosingToken(state_stack_.back().start_token));
+  }
+
+  AddLeafNode(ParseNodeKind::ParameterListEnd(), *position_, has_error);
+  AddNode(ParseNodeKind::ParameterList(), state_stack_.back().start_token,
+          state_stack_.back().subtree_start);
+  ++position_;
+  // Pop on both paths. If this state were left on the stack after an error,
+  // it would be re-entered with the position already past the closing
+  // parenthesis, and SkipTo would then fail its forward-progress check.
+  state_stack_.pop_back();
+}
+
+// Handles what follows a function's parameter list. A semicolon completes the
+// declaration; anything else is currently reported as an error.
+auto Parser2::HandleFunctionParameterListDoneState() -> void {
+  if (PositionIs(TokenKind::Semi())) {
+    AddNode(ParseNodeKind::FunctionDeclaration(), *position_,
+            state_stack_.back().subtree_start);
+    ++position_;
+    state_stack_.pop_back();
+    return;
+  }
+
+  CARBON_DIAGNOSTIC(
+      ExpectedFunctionBodyOrSemi, Error,
+      "Expected function definition or `;` after function declaration.");
+  emitter_.Emit(*position_, ExpectedFunctionBodyOrSemi);
+
+  // TODO: OpenCurlyBrace is a definition.
+  // An open curly brace always triggers skipping. For any other token, only
+  // skip if we've not already found a new line.
+  const bool skip_past_likely_end =
+      PositionIs(TokenKind::OpenCurlyBrace()) ||
+      tokens_.GetLine(*position_) ==
+          tokens_.GetLine(state_stack_.back().start_token);
+  HandleFunctionError(skip_past_likely_end);
+}
+
+}  // namespace Carbon

+ 126 - 0
toolchain/parser/parser2.h

@@ -0,0 +1,126 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_PARSER_PARSER2_H_
+#define CARBON_TOOLCHAIN_PARSER_PARSER2_H_
+
+#include "llvm/ADT/Optional.h"
+#include "toolchain/lexer/token_kind.h"
+#include "toolchain/lexer/tokenized_buffer.h"
+#include "toolchain/parser/parse_node_kind.h"
+#include "toolchain/parser/parse_tree.h"
+#include "toolchain/parser/parser_state.h"
+
+namespace Carbon {
+
+// A parser that keeps its pending work on an explicit stack of states
+// (`state_stack_`) and dispatches to one handler per state.
+class Parser2 {
+ public:
+  // Builds a parse tree from `tokens`, reporting any problems through
+  // `emitter`.
+  //
+  // All parsing goes through this function; the parser object only lives for
+  // the duration of the call.
+  static auto Parse(TokenizedBuffer& tokens, TokenDiagnosticEmitter& emitter)
+      -> ParseTree {
+    ParseTree result(tokens);
+    Parser2 impl(result, tokens, emitter);
+    impl.Parse();
+    return result;
+  }
+
+ private:
+  // One frame of `state_stack_`.
+  struct StateStackEntry {
+    // Which handler runs while this frame is on top of the stack.
+    ParserState state;
+    // The token that was current when the frame was pushed; it marks the
+    // start of the tracked subtree.
+    TokenizedBuffer::Token start_token;
+    // The size of the parse tree when the frame was pushed, i.e. the offset at
+    // which the tracked subtree begins.
+    int32_t subtree_start;
+  };
+
+  Parser2(ParseTree& tree, TokenizedBuffer& tokens,
+          TokenDiagnosticEmitter& emitter);
+
+  // Runs the state machine until the state stack is empty, then appends the
+  // FileEnd node.
+  auto Parse() -> void;
+
+  // Appends a node with no children, so its subtree size is 1. When
+  // `has_error` is set, the tree as a whole is also marked as having errors.
+  auto AddLeafNode(ParseNodeKind kind, TokenizedBuffer::Token token,
+                   bool has_error = false) -> void;
+
+  // Appends a node whose subtree spans every node added to the tree since
+  // `subtree_start`. When `has_error` is set, the tree as a whole is also
+  // marked as having errors.
+  auto AddNode(ParseNodeKind kind, TokenizedBuffer::Token token,
+               int subtree_start, bool has_error = false) -> void;
+
+  // Runs `ConsumeIf` and, when it yields a token, `AddLeafNode` for that
+  // token. Returns false when `ConsumeIf` yields nothing.
+  auto ConsumeAndAddLeafNodeIf(TokenKind token_kind, ParseNodeKind node_kind)
+      -> bool;
+
+  // Returns the current token and advances past it when it is of the given
+  // kind. Otherwise returns an empty optional without moving.
+  auto ConsumeIf(TokenKind kind) -> llvm::Optional<TokenizedBuffer::Token>;
+
+  // Returns the kind of the token at the current position.
+  auto PositionKind() const -> TokenKind { return tokens_.GetKind(*position_); }
+
+  // Returns whether the token at the current position is of the given kind.
+  auto PositionIs(TokenKind kind) const -> bool {
+    return tokens_.GetKind(*position_) == kind;
+  }
+
+  // When the current token opens a matched group, moves just past the matched
+  // closing symbol and returns true. Otherwise returns false without moving.
+  auto SkipMatchingGroup() -> bool;
+
+  // Moves forward past the likely end of a declaration or statement.
+  //
+  // This is a heuristic intended only for recovering after a parse error.
+  // Matched symbol groups are skipped as a unit. Scanning stops when:
+  // - A close curly brace is reached, which likely ends the whole context.
+  // - A semicolon is reached, which should end the declaration or statement;
+  //   it is consumed and returned.
+  // - A line other than the one holding `skip_root` is reached whose
+  //   indentation is the same or less, which suggests a missing semicolon
+  //   because continuation lines should be indented further.
+  //
+  // Returns the semicolon token if one was the likely end, and an empty
+  // optional otherwise.
+  auto SkipPastLikelyEnd(TokenizedBuffer::Token skip_root)
+      -> llvm::Optional<TokenizedBuffer::Token>;
+
+  // Moves the current position to `t`, checking that this is a move forward.
+  auto SkipTo(TokenizedBuffer::Token t) -> void;
+
+  // Pushes a new state whose start token is the current token and whose
+  // subtree starts at the current end of the parse tree.
+  auto PushState(ParserState state) -> void {
+    StateStackEntry entry = {state, *position_, tree_.size()};
+    state_stack_.push_back(entry);
+  }
+
+  // Reports an error found before the start of a function definition by
+  // treating the function as a declaration. Recovers to a semicolon when that
+  // makes sense as a possible function end; otherwise the `fn` token is used
+  // for the error node.
+  auto HandleFunctionError(bool skip_past_likely_end) -> void;
+
+  // Declares one `Handle<Name>State` member per entry in parser_state.def.
+#define CARBON_PARSER_STATE(Name) auto Handle##Name##State()->void;
+#include "toolchain/parser/parser_state.def"
+
+  ParseTree& tree_;
+  TokenizedBuffer& tokens_;
+  TokenDiagnosticEmitter& emitter_;
+
+  // The current position within the token buffer.
+  TokenizedBuffer::TokenIterator position_;
+  // The position of the EndOfFile token.
+  TokenizedBuffer::TokenIterator end_;
+
+  // The states still to be handled; the back entry is the active one.
+  llvm::SmallVector<StateStackEntry> state_stack_;
+};
+
+}  // namespace Carbon
+
+#endif  // CARBON_TOOLCHAIN_PARSER_PARSER2_H_

+ 19 - 0
toolchain/parser/parser_state.cpp

@@ -0,0 +1,19 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "toolchain/parser/parser_state.h"
+
+#include "llvm/ADT/StringRef.h"
+
+namespace Carbon {
+
+// Returns the spelling of this state as written in parser_state.def.
+auto ParserState::name() const -> llvm::StringRef {
+  // Generated from the same list as the enumerators, so the entries line up
+  // with the enumerator values.
+  static constexpr llvm::StringLiteral StateNames[] = {
+#define CARBON_PARSER_STATE(Name) #Name,
+#include "toolchain/parser/parser_state.def"
+  };
+  const auto index = static_cast<int>(state_);
+  return StateNames[index];
+}
+
+}  // namespace Carbon

+ 21 - 0
toolchain/parser/parser_state.def

@@ -0,0 +1,21 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Note that this is an X-macro header.
+//
+// It does not use `#include` guards, and instead is designed to be `#include`ed
+// after the x-macro is defined in order for its inclusion to expand to the
+// desired output. The x-macro for this header is `CARBON_PARSER_STATE`. The
+// definition provided will be removed at the end of this file to clean up.
+
+#ifndef CARBON_PARSER_STATE
+#error "Must define the x-macro to use this file."
+#endif
+
+CARBON_PARSER_STATE(Declaration)
+CARBON_PARSER_STATE(FunctionIntroducer)
+CARBON_PARSER_STATE(FunctionParameterList)
+CARBON_PARSER_STATE(FunctionParameterListDone)
+
+#undef CARBON_PARSER_STATE

+ 67 - 0
toolchain/parser/parser_state.h

@@ -0,0 +1,67 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef CARBON_TOOLCHAIN_PARSER_PARSER_STATE_H_
+#define CARBON_TOOLCHAIN_PARSER_PARSER_STATE_H_
+
+#include <cstdint>
+#include <iterator>
+
+#include "common/ostream.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace Carbon {
+
+class ParserState {
+  // The set of parser states, generated from parser_state.def. This is declared
+  // first so that its type can be used below, e.g. in the conversion operator.
+  enum class StateEnum : uint8_t {
+#define CARBON_PARSER_STATE(Name) Name,
+#include "toolchain/parser/parser_state.def"
+  };
+
+ public:
+  // `clang-format` has a bug with spacing around `->` returns in macros. See
+  // https://bugs.llvm.org/show_bug.cgi?id=48320 for details.
+#define CARBON_PARSER_STATE(Name)             \
+  static constexpr auto Name()->ParserState { \
+    return ParserState(StateEnum::Name);      \
+  }
+#include "toolchain/parser/parser_state.def"
+
+  // The default constructor is deleted because objects of this type should
+  // always be constructed using the above factory functions for each unique
+  // kind.
+  ParserState() = delete;
+
+  friend auto operator==(ParserState lhs, ParserState rhs) -> bool {
+    return lhs.state_ == rhs.state_;
+  }
+  friend auto operator!=(ParserState lhs, ParserState rhs) -> bool {
+    return lhs.state_ != rhs.state_;
+  }
+
+  // Gets a friendly name for the state for logging or debugging.
+  [[nodiscard]] auto name() const -> llvm::StringRef;
+
+  // Enable conversion to our private enum, including in a `constexpr` context,
+  // to enable usage in `switch` and `case`. The enum remains private and
+  // nothing else should be using this function.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  constexpr operator StateEnum() const { return state_; }
+
+  void Print(llvm::raw_ostream& out) const { out << name(); }  // Writes name().
+
+ private:
+  constexpr explicit ParserState(StateEnum k) : state_(k) {}
+
+  StateEnum state_;  // The wrapped enumerator; the only data member.
+};
+
+// We expect the parser state to fit compactly into 8 bits.
+static_assert(sizeof(ParserState) == 1, "ParserState objects include padding!");
+
+}  // namespace Carbon
+
+#endif  // CARBON_TOOLCHAIN_PARSER_PARSER_STATE_H_