tomteb
/
carbon-lang


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
							// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "parser/parser_impl.h"

#include <cstdlib>

#include "lexer/token_kind.h"
#include "lexer/tokenized_buffer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/Support/raw_ostream.h"
#include "parser/parse_node_kind.h"
#include "parser/parse_tree.h"

namespace Carbon {

auto ParseTree::Parser::Parse(TokenizedBuffer& tokens,
                              DiagnosticEmitter& /*unused*/) -> ParseTree {
  ParseTree tree(tokens);

  // We expect to have a 1:1 correspondence between tokens and tree nodes, so
  // reserve the space we expect to need here to avoid allocation and copying
  // overhead.
  tree.node_impls.reserve(tokens.Size());

  Parser parser(tree, tokens);
  while (parser.position != parser.end)
    parser.ParseDeclaration();

  assert(tree.Verify() && "Parse tree built but does not verify!");
  return tree;
}

auto ParseTree::Parser::Consume(TokenKind kind) -> TokenizedBuffer::Token {
  TokenizedBuffer::Token t = *position;
  assert(tokens.GetKind(t) == kind && "The current token is the wrong kind!");
  ++position;
  return t;
}

auto ParseTree::Parser::ConsumeIf(TokenKind kind)
    -> llvm::Optional<TokenizedBuffer::Token> {
  if (tokens.GetKind(*position) != kind)
    return {};

  return *position++;
}

auto ParseTree::Parser::AddLeafNode(ParseNodeKind kind,
                                    TokenizedBuffer::Token token) -> Node {
  Node n(tree.node_impls.size());
  tree.node_impls.push_back(NodeImpl(kind, token, /*SubtreeSize=*/1));
  return n;
}

auto ParseTree::Parser::ConsumeAndAddLeafNodeIf(TokenKind t_kind,
                                                ParseNodeKind n_kind)
    -> llvm::Optional<Node> {
  auto t = ConsumeIf(t_kind);
  if (!t)
    return {};

  return AddLeafNode(n_kind, *t);
}

auto ParseTree::Parser::MarkNodeError(Node n) -> void {
  tree.node_impls[n.index].has_error = true;
  tree.has_errors = true;
}

// A marker for the start of a node's subtree.
//
// This is used to track the size of the node's subtree and ensure at least one
// parse node is added. It can be used repeatedly if multiple subtrees start at
// the same position.
struct ParseTree::Parser::SubtreeStart {
  int tree_size;
  bool node_added = false;

  ~SubtreeStart() {
    assert(node_added && "Never added a node for a subtree region!");
  }
};

auto ParseTree::Parser::StartSubtree() -> SubtreeStart {
  return {static_cast<int>(tree.node_impls.size())};
}

auto ParseTree::Parser::AddNode(ParseNodeKind n_kind, TokenizedBuffer::Token t,
                                SubtreeStart& start, bool has_error) -> Node {
  // The size of the subtree is the change in size from when we started this
  // subtree to now, but including the node we're about to add.
  int tree_stop_size = tree.node_impls.size() + 1;
  int subtree_size = tree_stop_size - start.tree_size;

  Node n(tree.node_impls.size());
  tree.node_impls.push_back(NodeImpl(n_kind, t, subtree_size));
  if (has_error)
    MarkNodeError(n);

  start.node_added = true;
  return n;
}

auto ParseTree::Parser::SkipMatchingGroup() -> bool {
  assert(position != end && "Cannot skip at the end!");
  TokenizedBuffer::Token t = *position;
  TokenKind t_kind = tokens.GetKind(t);
  if (!t_kind.IsOpeningSymbol())
    return false;

  position = std::next(
      TokenizedBuffer::TokenIterator(tokens.GetMatchedClosingToken(t)));
  return true;
}

auto ParseTree::Parser::SkipPastLikelyDeclarationEnd(
    TokenizedBuffer::Token skip_root, bool is_inside_declaration)
    -> llvm::Optional<Node> {
  if (position == end)
    return {};

  TokenizedBuffer::Line root_line = tokens.GetLine(skip_root);
  int root_line_indent = tokens.GetIndentColumnNumber(root_line);

  // We will keep scanning through tokens on the same line as the root or
  // lines with greater indentation than root's line.
  auto is_same_line_or_indent_greater_than_root =
      [&](TokenizedBuffer::Token t) {
        TokenizedBuffer::Line l = tokens.GetLine(t);
        if (l == root_line)
          return true;

        return tokens.GetIndentColumnNumber(l) > root_line_indent;
      };

  do {
    TokenKind current_kind = tokens.GetKind(*position);
    if (current_kind == TokenKind::CloseCurlyBrace())
      // Immediately bail out if we hit an unmatched close curly, this will
      // pop us up a level of the syntax grouping.
      return {};

    // If we find a semicolon, we want to parse it to end the declaration.
    if (current_kind == TokenKind::Semi()) {
      TokenizedBuffer::Token semi = *position++;

      // Add a node for the semicolon. If we're inside of a declaration, this
      // is a declaration ending semicolon, otherwise it simply forms an empty
      // declaration.
      return AddLeafNode(is_inside_declaration
                             ? ParseNodeKind::DeclarationEnd()
                             : ParseNodeKind::EmptyDeclaration(),
                         semi);
    }

    // Skip over any matching group of tokens.
    if (SkipMatchingGroup())
      continue;

    // Otherwise just step forward one token.
    ++position;
  } while (position != end &&
           is_same_line_or_indent_greater_than_root(*position));

  return {};
}

auto ParseTree::Parser::ParseFunctionSignature() -> Node {
  assert(position != end && "Cannot parse past the end!");

  TokenizedBuffer::Token open_paren = Consume(TokenKind::OpenParen());
  assert(position != end &&
         "The lexer ensures we always have a closing paren!");
  auto start = StartSubtree();

  // FIXME: Add support for parsing parameters.

  bool has_errors = false;
  auto close_paren = ConsumeIf(TokenKind::CloseParen());
  if (!close_paren) {
    llvm::errs() << "ERROR: unexpected token before the close of the "
                    "parameters on line "
                 << tokens.GetLineNumber(*position) << "!\n";
    has_errors = true;

    // We can trivially skip to the actual close parenthesis from here.
    close_paren = tokens.GetMatchedClosingToken(open_paren);
    position = std::next(TokenizedBuffer::TokenIterator(*close_paren));
  }
  AddLeafNode(ParseNodeKind::ParameterListEnd(), *close_paren);

  // FIXME: Implement parsing of a return type.

  return AddNode(ParseNodeKind::ParameterList(), open_paren, start, has_errors);
}

auto ParseTree::Parser::ParseCodeBlock() -> Node {
  assert(position != end && "Cannot parse past the end!");

  TokenizedBuffer::Token open_curly = Consume(TokenKind::OpenCurlyBrace());
  assert(position != end &&
         "The lexer ensures we always have a closing curly!");
  auto start = StartSubtree();

  bool has_errors = false;

  // Loop over all the different possibly nested elements in the code block.
  for (;;) {
    switch (tokens.GetKind(*position)) {
      default:
        // FIXME: Add support for parsing more expressions & statements.
        llvm::errs() << "ERROR: unexpected token before the close of the "
                        "function definition on line "
                     << tokens.GetLineNumber(*position) << "!\n";
        has_errors = true;

        // We can trivially skip to the actual close curly brace from here.
        position = TokenizedBuffer::TokenIterator(
            tokens.GetMatchedClosingToken(open_curly));
        // Now fall through to the close curly brace handling code.
        LLVM_FALLTHROUGH;

      case TokenKind::CloseCurlyBrace():
        break;

      case TokenKind::OpenCurlyBrace():
        // FIXME: We should consider avoiding recursion here with some side
        // stack.
        ParseCodeBlock();
        continue;
    }

    // We only continue looping with `continue` above.
    break;
  }

  // We always reach here having set our position in the token stream to the
  // close curly brace.
  AddLeafNode(ParseNodeKind::CodeBlockEnd(),
              Consume(TokenKind::CloseCurlyBrace()));

  return AddNode(ParseNodeKind::CodeBlock(), open_curly, start, has_errors);
}

auto ParseTree::Parser::ParseFunctionDeclaration() -> Node {
  assert(position != end && "Cannot parse past the end!");

  TokenizedBuffer::Token function_intro_token = Consume(TokenKind::FnKeyword());
  auto start = StartSubtree();
  auto add_error_function_node = [&] {
    return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
                   start, /*has_error=*/true);
  };

  if (position == end) {
    llvm::errs() << "ERROR: File ended with a function introducer on line "
                 << tokens.GetLineNumber(function_intro_token) << "!\n";
    return add_error_function_node();
  }

  auto name_n = ConsumeAndAddLeafNodeIf(TokenKind::Identifier(),
                                        ParseNodeKind::Identifier());
  if (!name_n) {
    llvm::errs() << "ERROR: Function declaration with no name on line "
                 << tokens.GetLineNumber(function_intro_token) << "!\n";
    // FIXME: We could change the lexer to allow us to synthesize certain
    // kinds of tokens and try to "recover" here, but unclear that this is
    // really useful.
    SkipPastLikelyDeclarationEnd(function_intro_token);
    return add_error_function_node();
  }
  if (position == end) {
    llvm::errs() << "ERROR: File ended after a function introducer and "
                    "identifier on line "
                 << tokens.GetLineNumber(function_intro_token) << "!\n";
    return add_error_function_node();
  }

  TokenizedBuffer::Token open_paren = *position;
  if (tokens.GetKind(open_paren) != TokenKind::OpenParen()) {
    llvm::errs()
        << "ERROR: Missing open parentheses in declaration of function '"
        << tokens.GetTokenText(tree.GetNodeToken(*name_n)) << "' on line "
        << tokens.GetLineNumber(function_intro_token) << "!\n";
    SkipPastLikelyDeclarationEnd(function_intro_token);
    return add_error_function_node();
  }
  assert(std::next(position) != end &&
         "Unbalanced parentheses should be rejected by the lexer.");
  TokenizedBuffer::Token close_paren =
      tokens.GetMatchedClosingToken(open_paren);

  Node signature_n = ParseFunctionSignature();
  assert(*std::prev(position) == close_paren &&
         "Should have parsed through the close paren, whether successfully "
         "or with errors.");
  if (tree.node_impls[signature_n.index].has_error) {
    // Don't try to parse more of the function declaration, but consume a
    // declaration ending semicolon if found (without going to a new line).
    SkipPastLikelyDeclarationEnd(function_intro_token);
    return add_error_function_node();
  }

  // See if we should parse a definition which is represented as a code block.
  if (tokens.GetKind(*position) == TokenKind::OpenCurlyBrace()) {
    ParseCodeBlock();
  } else if (!ConsumeAndAddLeafNodeIf(TokenKind::Semi(),
                                      ParseNodeKind::DeclarationEnd())) {
    llvm::errs() << "ERROR: Function declaration not terminated by a "
                    "semicolon on line "
                 << tokens.GetLineNumber(close_paren) << "!\n";
    if (tokens.GetLine(*position) == tokens.GetLine(close_paren))
      // Only need to skip if we've not already found a new line.
      SkipPastLikelyDeclarationEnd(function_intro_token);
    return add_error_function_node();
  }

  // Successfully parsed the function, add that node.
  return AddNode(ParseNodeKind::FunctionDeclaration(), function_intro_token,
                 start);
}

auto ParseTree::Parser::ParseEmptyDeclaration() -> Node {
  assert(position != end && "Cannot parse past the end!");
  return AddLeafNode(ParseNodeKind::EmptyDeclaration(),
                     Consume(TokenKind::Semi()));
}

auto ParseTree::Parser::ParseDeclaration() -> llvm::Optional<Node> {
  assert(position != end && "Cannot parse past the end!");
  TokenizedBuffer::Token t = *position;
  switch (tokens.GetKind(t)) {
    case TokenKind::FnKeyword():
      return ParseFunctionDeclaration();
    case TokenKind::Semi():
      return ParseEmptyDeclaration();
  }

  // We didn't recognize an introducer for a valid declaration.
  llvm::errs() << "ERROR: Unrecognized declaration introducer '"
               << tokens.GetTokenText(t) << "' on line "
               << tokens.GetLineNumber(t) << "!\n";

  // Skip forward past any end of a declaration we simply didn't understand so
  // that we can find the start of the next declaration or the end of a scope.
  if (auto found_semi_n =
          SkipPastLikelyDeclarationEnd(t, /*is_inside_declaration=*/false)) {
    MarkNodeError(*found_semi_n);
    return *found_semi_n;
  }

  // Nothing, not even a semicolon found. We still need to mark that an error
  // occurred though.
  tree.has_errors = true;
  return {};
}

}  // namespace Carbon