2 anos atrás · b2db2ca7cf
--- a/toolchain/lexer/token_kind.cpp
+++ b/toolchain/lexer/token_kind.cpp
@@ -79,6 +79,15 @@ auto TokenKind::opening_symbol() const -> TokenKind {
 
															   return result;
														
 
															 }
														
 
															+auto TokenKind::is_one_char_symbol() const -> bool {  // True only for kinds declared via CARBON_ONE_CHAR_SYMBOL_TOKEN.
														
 
															+  static constexpr bool Table[] = {  // One entry per token expansion from token_kind.def, in file order.
														
 
															+#define CARBON_TOKEN(TokenName) false,
														
 
															+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) true,
														
 
															+#include "toolchain/lexer/token_kind.def"
														
 
															+  };
														
 
															+  return Table[AsInt()];  // NOTE(review): assumes AsInt() follows the token_kind.def order -- confirm.
														
 
															+}
														
 
															+
														
 
															 auto TokenKind::is_keyword() const -> bool {
														
 
															   static constexpr bool Table[] = {
														
 
															 #define CARBON_TOKEN(TokenName) false,
														
--- a/toolchain/lexer/token_kind.def
+++ b/toolchain/lexer/token_kind.def
@@ -36,6 +36,11 @@
 
															 #define CARBON_SYMBOL_TOKEN(Name, Spelling) CARBON_TOKEN(Name)
														
 
															 #endif
														
 
															+#ifndef CARBON_ONE_CHAR_SYMBOL_TOKEN
														
 
															+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling) \
														
 
															+  CARBON_SYMBOL_TOKEN(Name, Spelling)
														
 
															+#endif
														
 
															+
														
 
															 #ifndef CARBON_TOKEN_WITH_VIRTUAL_NODE
														
 
															 #define CARBON_TOKEN_WITH_VIRTUAL_NODE(Name) Name
														
 
															 #endif
														
@@ -78,7 +83,6 @@ CARBON_SYMBOL_TOKEN(At,                  "@")
 
															 CARBON_SYMBOL_TOKEN(Backslash,           "\\")
														
 
															 CARBON_SYMBOL_TOKEN(Caret,               "^")
														
 
															 CARBON_SYMBOL_TOKEN(Colon,               ":")
														
 
															-CARBON_SYMBOL_TOKEN(Comma,               ",")
														
 
															 CARBON_SYMBOL_TOKEN(Equal,               "=")
														
 
															 CARBON_SYMBOL_TOKEN(Exclaim,             "!")
														
 
															 CARBON_SYMBOL_TOKEN(Greater,             ">")
														
@@ -89,16 +93,25 @@ CARBON_SYMBOL_TOKEN(Period,              ".")
 
															 CARBON_SYMBOL_TOKEN(Pipe,                "|")
														
 
															 CARBON_SYMBOL_TOKEN(Plus,                "+")
														
 
															 CARBON_SYMBOL_TOKEN(Question,            "?")
														
 
															-CARBON_SYMBOL_TOKEN(Semi,                ";")
														
 
															 CARBON_SYMBOL_TOKEN(Slash,               "/")
														
 
															 CARBON_SYMBOL_TOKEN(Star,                "*")
														
 
															 CARBON_SYMBOL_TOKEN(Tilde,               "~")
														
 
															+// Some Carbon symbols are by construction exactly one character and cannot be
														
 
															+// combined with any other characters to form new symbols. We can lex these
														
 
															+// without needing to max-munch any other characters. These are typically
														
 
															+// expected to be terminators or separators that need to compose with all other
														
 
															+// parts of the grammar. Group symbols are also currently one-character symbols,
														
 
															+// although we may choose to remove that if we need to add composite grouping
														
 
															+// symbols in the future.
														
 
															+CARBON_ONE_CHAR_SYMBOL_TOKEN(Comma,      ",")
														
 
															+CARBON_ONE_CHAR_SYMBOL_TOKEN(Semi,       ";")
														
 
															+
														
 
															 // clang-format on
														
 
															 #ifndef CARBON_OPENING_GROUP_SYMBOL_TOKEN
														
 
															 #define CARBON_OPENING_GROUP_SYMBOL_TOKEN(Name, Spelling, ClosingName) \
														
 
															-  CARBON_SYMBOL_TOKEN(Name, Spelling)
														
 
															+  CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling)
														
 
															 #endif
														
 
															 CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenParen, "(", CloseParen)
														
 
															 CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenCurlyBrace, "{", CloseCurlyBrace)
														
@@ -107,13 +120,14 @@ CARBON_OPENING_GROUP_SYMBOL_TOKEN(OpenSquareBracket, "[", CloseSquareBracket)
 
															 #ifndef CARBON_CLOSING_GROUP_SYMBOL_TOKEN
														
 
															 #define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(Name, Spelling, OpeningName) \
														
 
															-  CARBON_SYMBOL_TOKEN(Name, Spelling)
														
 
															+  CARBON_ONE_CHAR_SYMBOL_TOKEN(Name, Spelling)
														
 
															 #endif
														
 
															 CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseParen, ")", OpenParen)
														
 
															 CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseCurlyBrace, "}", OpenCurlyBrace)
														
 
															 CARBON_CLOSING_GROUP_SYMBOL_TOKEN(CloseSquareBracket, "]", OpenSquareBracket)
														
 
															 #undef CARBON_CLOSING_GROUP_SYMBOL_TOKEN
														
 
															+#undef CARBON_ONE_CHAR_SYMBOL_TOKEN
														
 
															 #undef CARBON_SYMBOL_TOKEN
														
 
															 #ifndef CARBON_KEYWORD_TOKEN
														
--- a/toolchain/lexer/token_kind.h
+++ b/toolchain/lexer/token_kind.h
@@ -54,6 +54,10 @@ class TokenKind : public CARBON_ENUM_BASE(TokenKind) {
 
															   // The token kind must be a closing symbol.
														
 
															   [[nodiscard]] auto opening_symbol() const -> TokenKind;
														
 
															+  // Test whether this kind of token is a one-character symbol whose character
														
 
															+  // is not part of any other symbol.
														
 
															+  [[nodiscard]] auto is_one_char_symbol() const -> bool;
														
 
															+
														
 
															   // Test whether this kind of token is a keyword.
														
 
															   [[nodiscard]] auto is_keyword() const -> bool;
														
--- a/toolchain/lexer/tokenized_buffer.cpp
+++ b/toolchain/lexer/tokenized_buffer.cpp
@@ -13,6 +13,7 @@
 
															 #include "common/check.h"
														
 
															 #include "common/string_helpers.h"
														
 
															 #include "llvm/ADT/STLExtras.h"
														
 
															+#include "llvm/ADT/Sequence.h"
														
 
															 #include "llvm/ADT/StringRef.h"
														
 
															 #include "llvm/ADT/StringSwitch.h"
														
 
															 #include "llvm/Support/ErrorHandling.h"
														
@@ -247,6 +248,10 @@ class TokenizedBuffer::Lexer {
 
															     bool formed_token_;
														
 
															   };
														
 
															+  using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef& source_text)
														
 
															+      -> LexResult;
														
 
															+  using DispatchTableT = std::array<DispatchFunctionT*, 256>;
														
 
															+
														
 
															   Lexer(TokenizedBuffer& buffer, DiagnosticConsumer& consumer)
														
 
															       : buffer_(&buffer),
														
 
															         translator_(&buffer),
														
@@ -351,7 +356,7 @@ class TokenizedBuffer::Lexer {
 
															     std::optional<LexedNumericLiteral> literal =
														
 
															         LexedNumericLiteral::Lex(source_text);
														
 
															     if (!literal) {
														
 
															-      return LexResult::NoMatch();
														
 
															+      return LexError(source_text);
														
 
															     }
														
 
															     int int_column = current_column_;
														
@@ -402,7 +407,7 @@ class TokenizedBuffer::Lexer {
 
															     std::optional<LexedStringLiteral> literal =
														
 
															         LexedStringLiteral::Lex(source_text);
														
 
															     if (!literal) {
														
 
															-      return LexResult::NoMatch();
														
 
															+      return LexError(source_text);
														
 
															     }
														
 
															     Line string_line = current_line_;
														
@@ -453,14 +458,30 @@ class TokenizedBuffer::Lexer {
 
															     }
														
 
															   }
														
 
															-  auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
														
 
															-    TokenKind kind = llvm::StringSwitch<TokenKind>(source_text)
														
 
															+  auto LexSymbolToken(llvm::StringRef& source_text,
														
 
															+                      TokenKind kind = TokenKind::Error) -> LexResult {
														
 
															+    auto compute_symbol_kind = [](llvm::StringRef source_text) {
														
 
															+      return llvm::StringSwitch<TokenKind>(source_text)
														
 
															 #define CARBON_SYMBOL_TOKEN(Name, Spelling) \
														
 
															   .StartsWith(Spelling, TokenKind::Name)
														
 
															 #include "toolchain/lexer/token_kind.def"
														
 
															-                         .Default(TokenKind::Error);
														
 
															-    if (kind == TokenKind::Error) {
														
 
															-      return LexResult::NoMatch();
														
 
															+          .Default(TokenKind::Error);
														
 
															+    };
														
 
															+
														
 
															+    // We use the `error` token as a place-holder for cases where one character
														
 
															+    // isn't enough to pick a definitive symbol token. Recompute the kind using
														
 
															+    // the full symbol set.
														
 
															+    if (LLVM_UNLIKELY(kind == TokenKind::Error)) {
														
 
															+      kind = compute_symbol_kind(source_text);
														
 
															+      if (kind == TokenKind::Error) {
														
 
															+        return LexError(source_text);
														
 
															+      }
														
 
															+    } else {
														
 
															+      // Verify in a debug build that the incoming token kind is correct.
														
 
															+      CARBON_DCHECK(kind == compute_symbol_kind(source_text))
														
 
															+          << "Incoming token kind '" << kind
														
 
															+          << "' does not match computed kind '"
														
 
															+          << compute_symbol_kind(source_text) << "'!";
														
 
															     }
														
 
															     if (!set_indent_) {
														
@@ -609,9 +630,11 @@ class TokenizedBuffer::Lexer {
 
															   }
														
 
															   auto LexKeywordOrIdentifier(llvm::StringRef& source_text) -> LexResult {
														
 
															-    if (!IsAlpha(source_text.front()) && source_text.front() != '_') {
														
 
															-      return LexResult::NoMatch();
														
 
															+    if (static_cast<unsigned char>(source_text.front()) > 0x7F) {
														
 
															+      // TODO: Need to add support for Unicode lexing.
														
 
															+      return LexError(source_text);
														
 
															     }
														
 
															+    CARBON_CHECK(IsAlpha(source_text.front()) || source_text.front() == '_');
														
 
															     if (!set_indent_) {
														
 
															       current_line_info_->indent = current_column_;
														
@@ -692,6 +715,76 @@ class TokenizedBuffer::Lexer {
 
															                        .column = current_column_});
														
 
															   }
														
 
															+  constexpr static auto MakeDispatchTable() -> DispatchTableT {  // Builds the 256-entry table of lexing routines, one slot per byte value.
														
 
															+    DispatchTableT table = {};
														
 
															+    auto dispatch_lex_error = +[](Lexer& lexer, llvm::StringRef& source_text) {  // Unary `+` turns the captureless lambda into a function pointer.
														
 
															+      return lexer.LexError(source_text);
														
 
															+    };
														
 
															+    for (int i = 0; i < 256; ++i) {  // Default every byte to the error routine; later assignments override it.
														
 
															+      table[i] = dispatch_lex_error;
														
 
															+    }
														
 
															+
														
 
															+    // Symbols have some special dispatching. First, set the first character of
														
 
															+    // each symbol token spelling to dispatch to the symbol lexer. We don't
														
 
															+    // provide a pre-computed token here, so the symbol lexer will compute the
														
 
															+    // exact symbol token kind.
														
 
															+    auto dispatch_lex_symbol = +[](Lexer& lexer, llvm::StringRef& source_text) {
														
 
															+      return lexer.LexSymbolToken(source_text);
														
 
															+    };
														
 
															+#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
														
 
															+  table[(Spelling)[0]] = dispatch_lex_symbol;
														
 
															+#include "toolchain/lexer/token_kind.def"
														
 
															+
														
 
															+    // Now special-case single-character symbols that are guaranteed to not
														
 
															+    // join with another symbol. These are grouping symbols, terminators,
														
 
															+    // or separators in the grammar and have a good reason to be
														
 
															+    // orthogonal to any other punctuation. We do this separately because this
														
 
															+    // needs to override some of the generic handling above, and provide a
														
 
															+    // custom token.
														
 
															+#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling)                  \
														
 
															+  table[(Spelling)[0]] = +[](Lexer& lexer, llvm::StringRef& source_text) { \
														
 
															+    return lexer.LexSymbolToken(source_text, TokenKind::TokenName);        \
														
 
															+  };
														
 
															+#include "toolchain/lexer/token_kind.def"
														
 
															+
														
 
															+    auto dispatch_lex_word = +[](Lexer& lexer, llvm::StringRef& source_text) {
														
 
															+      return lexer.LexKeywordOrIdentifier(source_text);
														
 
															+    };
														
 
															+    table['_'] = dispatch_lex_word;
														
 
															+    // Note that we don't use `llvm::seq` because this needs to be `constexpr`
														
 
															+    // evaluated.
														
 
															+    for (unsigned char c = 'a'; c <= 'z'; ++c) {
														
 
															+      table[c] = dispatch_lex_word;
														
 
															+    }
														
 
															+    for (unsigned char c = 'A'; c <= 'Z'; ++c) {
														
 
															+      table[c] = dispatch_lex_word;
														
 
															+    }
														
 
															+    // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
														
 
															+    // as whitespace characters should already have been skipped and the
														
 
															+    // only remaining valid Unicode characters would be part of an
														
 
															+    // identifier. That code can either accept or reject.
														
 
															+    for (int i = 0x80; i < 0x100; ++i) {  // Every byte value with the high bit set.
														
 
															+      table[i] = dispatch_lex_word;
														
 
															+    }
														
 
															+
														
 
															+    auto dispatch_lex_numeric =
														
 
															+        +[](Lexer& lexer, llvm::StringRef& source_text) {
														
 
															+          return lexer.LexNumericLiteral(source_text);
														
 
															+        };
														
 
															+    for (unsigned char c = '0'; c <= '9'; ++c) {
														
 
															+      table[c] = dispatch_lex_numeric;
														
 
															+    }
														
 
															+
														
 
															+    auto dispatch_lex_string = +[](Lexer& lexer, llvm::StringRef& source_text) {
														
 
															+      return lexer.LexStringLiteral(source_text);
														
 
															+    };
														
 
															+    table['\''] = dispatch_lex_string;
														
 
															+    table['"'] = dispatch_lex_string;
														
 
															+    table['#'] = dispatch_lex_string;  // NOTE(review): presumably '#' starts a raw string literal -- confirm.
														
 
															+
														
 
															+    return table;
														
 
															+  };
														
 
															+
														
 
															  private:
														
 
															   TokenizedBuffer* buffer_;
														
@@ -716,24 +809,40 @@ auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticConsumer& consumer)
 
															   ErrorTrackingDiagnosticConsumer error_tracking_consumer(consumer);
														
 
															   Lexer lexer(buffer, error_tracking_consumer);
														
 
															+  // Build a table of function pointers that we can use to dispatch to the
														
 
															+  // correct lexer routine based on the first byte of source text.
														
 
															+  //
														
 
															+  // While it is tempting to simply use a `switch` on the first byte and
														
 
															+  // dispatch with cases into this, in practice that doesn't produce great code.
														
 
															+  // There seem to be two issues that are the root cause.
														
 
															+  //
														
 
															+  // First, there are lots of different values of bytes that dispatch to a
														
 
															+  // fairly small set of routines, and then some byte values that dispatch
														
 
															+  // differently for each byte. This pattern isn't one that the compiler-based
														
 
															+  // lowering of switches works well with -- it tries to balance all the cases,
														
 
															+  // and in doing so emits several compares and other control flow rather than a
														
 
															+  // simple jump table.
														
 
															+  //
														
 
															+  // Second, with a `case`, it isn't as obvious how to create a single, uniform
														
 
															+  // interface that is effective for *every* byte value, and thus makes for a
														
 
															+  // single consistent table-based dispatch. By forcing these to be function
														
 
															+  // pointers, we also coerce the code to use a strictly homogeneous structure
														
 
															+  // that can form a single dispatch table.
														
 
															+  //
														
 
															+  // These two actually interact -- the second issue is part of what makes the
														
 
															+  // non-table lowering in the first one desirable for many switches and cases.
														
 
															+  //
														
 
															+  // Ultimately, when table-based dispatch is such an important technique, we
														
 
															+  // get better results by taking full control and manually creating the
														
 
															+  // dispatch structures.
														
 
															+  constexpr Lexer::DispatchTableT DispatchTable = Lexer::MakeDispatchTable();
														
 
															+
														
 
															   llvm::StringRef source_text = source.text();
														
 
															   while (lexer.SkipWhitespace(source_text)) {
														
 
															-    // Each time we find non-whitespace characters, try each kind of token we
														
 
															-    // support lexing, from simplest to most complex.
														
 
															-    Lexer::LexResult result = lexer.LexSymbolToken(source_text);
														
 
															-    if (!result) {
														
 
															-      result = lexer.LexKeywordOrIdentifier(source_text);
														
 
															-    }
														
 
															-    if (!result) {
														
 
															-      result = lexer.LexNumericLiteral(source_text);
														
 
															-    }
														
 
															-    if (!result) {
														
 
															-      result = lexer.LexStringLiteral(source_text);
														
 
															-    }
														
 
															-    if (!result) {
														
 
															-      result = lexer.LexError(source_text);
														
 
															-    }
														
 
															-    CARBON_CHECK(result) << "No token was lexed.";
														
 
															+    Lexer::LexResult result =
														
 
															+        DispatchTable[static_cast<unsigned char>(source_text.front())](
														
 
															+            lexer, source_text);
														
 
															+    CARBON_CHECK(result) << "Failed to form a token!";
														
 
															   }
														
 
															   // The end-of-file token is always considered to be whitespace.