Просмотр исходного кода

Rework raw identifier lexing to avoid slowing down regular identifiers. (#3855)

Instead of special-casing tokens starting with `r`, lex them as normal
identifiers, and add a special case to `#` handling to detect if the
previous token was an `r` identifier.

This roughly doubles the time to lex a raw identifier, because we do two
hash table insertions rather than one, and probably slightly slows down
lexing string literals starting with `#`, but should remove the 2%
overhead to identifier lexing from the previous approach.
Richard Smith 2 года назад
Родитель
Commit
45c071f2af

+ 32 - 31
toolchain/lex/lex.cpp

@@ -16,6 +16,7 @@
 #include "toolchain/lex/helpers.h"
 #include "toolchain/lex/numeric_literal.h"
 #include "toolchain/lex/string_literal.h"
+#include "toolchain/lex/token_kind.h"
 #include "toolchain/lex/tokenized_buffer.h"
 
 #if __ARM_NEON
@@ -144,8 +145,7 @@ class [[clang::internal_linkage]] Lexer {
   auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
       -> LexResult;
 
-  auto LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
-                                      ssize_t& position) -> LexResult;
+  auto LexHash(llvm::StringRef source_text, ssize_t& position) -> LexResult;
 
   auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult;
 
@@ -472,7 +472,7 @@ static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
 CARBON_DISPATCH_LEX_TOKEN(LexError)
 CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
 CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
-CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifierMaybeRaw)
+CARBON_DISPATCH_LEX_TOKEN(LexHash)
 CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
 CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
 
@@ -576,7 +576,6 @@ static constexpr auto MakeDispatchTable() -> DispatchTableT {
   for (unsigned char c = 'a'; c <= 'z'; ++c) {
     table[c] = &DispatchLexKeywordOrIdentifier;
   }
-  table['r'] = &DispatchLexKeywordOrIdentifierMaybeRaw;
   for (unsigned char c = 'A'; c <= 'Z'; ++c) {
     table[c] = &DispatchLexKeywordOrIdentifier;
   }
@@ -594,7 +593,7 @@ static constexpr auto MakeDispatchTable() -> DispatchTableT {
 
   table['\''] = &DispatchLexStringLiteral;
   table['"'] = &DispatchLexStringLiteral;
-  table['#'] = &DispatchLexStringLiteral;
+  table['#'] = &DispatchLexHash;
 
   table[' '] = &DispatchLexHorizontalWhitespace;
   table['\t'] = &DispatchLexHorizontalWhitespace;
@@ -1104,40 +1103,42 @@ auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text,
        .ident_id = buffer_.value_stores_->identifiers().Add(identifier_text)});
 }
 
-auto Lexer::LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
-                                           ssize_t& position) -> LexResult {
-  CARBON_CHECK(source_text[position] == 'r');
-  // Raw identifiers must look like `r#<valid identifier>`, otherwise it's an
-  // identifier starting with the 'r'.
-  // TODO: Need to add support for Unicode lexing.
-  if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
-                  source_text[position + 1] != '#' ||
-                  !IsIdStartByteTable[static_cast<unsigned char>(
-                      source_text[position + 2])])) {
-    // TODO: Should this print a different error when there is `r#`, but it
-    // isn't followed by identifier text? Or is it right to put it back so
-    // that the `#` could be parsed as part of a raw string literal?
-    return LexKeywordOrIdentifier(source_text, position);
+auto Lexer::LexHash(llvm::StringRef source_text, ssize_t& position)
+    -> LexResult {
+  // For `r#`, we already lexed an `r` identifier token. Detect that case and
+  // replace that token with a raw identifier. We do this to keep identifier
+  // lexing as fast as possible.
+
+  // Look for the `r` token. Note that this is always in bounds because we
+  // create a start of file token.
+  auto& prev_token_info = buffer_.token_infos_.back();
+
+  // If the previous token isn't the identifier `r`, or the character after `#`
+  // isn't the start of an identifier, this is not a raw identifier.
+  if (prev_token_info.kind != TokenKind::Identifier ||
+      source_text[position - 1] != 'r' ||
+      position + 1 == static_cast<ssize_t>(source_text.size()) ||
+      !IsIdStartByteTable[static_cast<unsigned char>(
+          source_text[position + 1])] ||
+      prev_token_info.token_line != current_line() ||
+      prev_token_info.column != ComputeColumn(position) - 1) {
+    [[clang::musttail]] return LexStringLiteral(source_text, position);
   }
-
-  int column = ComputeColumn(position);
+  CARBON_DCHECK(buffer_.value_stores_->identifiers().Get(
+                    prev_token_info.ident_id) == "r");
 
   // Take the valid characters off the front of the source buffer.
   llvm::StringRef identifier_text =
-      ScanForIdentifierPrefix(source_text.substr(position + 2));
+      ScanForIdentifierPrefix(source_text.substr(position + 1));
   CARBON_CHECK(!identifier_text.empty()) << "Must have at least one character!";
-  position += identifier_text.size() + 2;
+  position += 1 + identifier_text.size();
 
-  // Versus LexKeywordOrIdentifier, raw identifiers do not do keyword checks.
-
-  // Otherwise we have a raw identifier.
+  // Replace the `r` identifier's value with the raw identifier.
   // TODO: This token doesn't carry any indicator that it's raw, so
   // diagnostics are unclear.
-  return buffer_.AddToken(
-      {.kind = TokenKind::Identifier,
-       .token_line = current_line(),
-       .column = column,
-       .ident_id = buffer_.value_stores_->identifiers().Add(identifier_text)});
+  prev_token_info.ident_id =
+      buffer_.value_stores_->identifiers().Add(identifier_text);
+  return LexResult(TokenIndex(buffer_.token_infos_.size() - 1));
 }
 
 auto Lexer::LexError(llvm::StringRef source_text, ssize_t& position)

+ 74 - 10
toolchain/lex/testdata/fail_bad_raw_identifier.carbon

@@ -3,9 +3,12 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 // AUTOUPDATE
+
+// --- fail_bad_raw_identifier.carbon
 // CHECK:STDOUT: - filename: fail_bad_raw_identifier.carbon
 // CHECK:STDOUT:   tokens: [
-// CHECK:STDOUT:     { index: 0, kind:  'FileStart', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
+// CHECK:STDOUT:     { index:  0, kind:     'FileStart', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
+
 
 // Missing the character after `#`.
 // CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
@@ -13,8 +16,8 @@
 // CHECK:STDERR:  ^
 // CHECK:STDERR:
 r#
-// CHECK:STDOUT:     { index: 1, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
-// CHECK:STDOUT:     { index: 2, kind:      'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#', has_trailing_space: true },
+// CHECK:STDOUT:     { index:  1, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index:  2, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#', has_trailing_space: true },
 
 // Not a valid identifier.
 // CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
@@ -22,17 +25,78 @@ r#
 // CHECK:STDERR:  ^
 // CHECK:STDERR:
 r#3
-// CHECK:STDOUT:     { index: 3, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
-// CHECK:STDOUT:     { index: 4, kind:      'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
-// CHECK:STDOUT:     { index: 5, kind: 'IntLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
+// CHECK:STDOUT:     { index:  3, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index:  4, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
+// CHECK:STDOUT:     { index:  5, kind:    'IntLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
 
 // Non ascii start to identifier.
-// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
 // CHECK:STDERR: r#á
 // CHECK:STDERR:  ^
+// CHECK:STDERR:
 r#á
-// CHECK:STDOUT:     { index: 6, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
-// CHECK:STDOUT:     { index: 7, kind:      'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#á', has_trailing_space: true },
+// CHECK:STDOUT:     { index:  6, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index:  7, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#á', has_trailing_space: true },
+
+// Raw `r` identifier doesn't start a second raw identifier.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:4: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: r#r#foo
+// CHECK:STDERR:    ^
+// CHECK:STDERR:
+r#r#foo
+// CHECK:STDOUT:     { index:  8, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index:  9, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  4, indent: 1, spelling: '#' },
+// CHECK:STDOUT:     { index: 10, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  5, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
+
+// Other identifier characters don't start a raw identifier.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: s#foo
+// CHECK:STDERR:  ^
+// CHECK:STDERR:
+s#foo
+// CHECK:STDOUT:     { index: 11, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 's', identifier: 2 },
+// CHECK:STDOUT:     { index: 12, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
+// CHECK:STDOUT:     { index: 13, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
+
+// Identifier ending in `r` doesn't start a raw identifier.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:4: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: arr#foo
+// CHECK:STDERR:    ^
+// CHECK:STDERR:
+arr#foo
+// CHECK:STDOUT:     { index: 14, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'arr', identifier: 3 },
+// CHECK:STDOUT:     { index: 15, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  4, indent: 1, spelling: '#' },
+// CHECK:STDOUT:     { index: 16, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  5, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
+
+// Whitespace between `r` and `#` isn't allowed.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:3: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: r #foo
+// CHECK:STDERR:   ^
+// CHECK:STDERR:
+r #foo
+// CHECK:STDOUT:     { index: 17, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0, has_trailing_space: true },
+// CHECK:STDOUT:     { index: 18, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  3, indent: 1, spelling: '#' },
+// CHECK:STDOUT:     { index: 19, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  4, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
+
+// This is an `r` identifier followed by a string literal.
+r#"hello"#
+// CHECK:STDOUT:     { index: 20, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index: 21, kind: 'StringLiteral', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#"hello"#', value: `hello`, has_trailing_space: true },
+
+// CHECK:STDOUT:     { index: 22, kind:       'FileEnd', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
+// CHECK:STDOUT:   ]
+// --- fail_hash_at_start_of_file.carbon
+// CHECK:STDOUT: - filename: fail_hash_at_start_of_file.carbon
+// CHECK:STDOUT:   tokens: [
+// CHECK:STDOUT:     { index: 0, kind:  'FileStart', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
+
+// Ensure that we correctly handle a `#` as the first token in the file.
+// CHECK:STDERR: fail_hash_at_start_of_file.carbon:[[@LINE+3]]:1: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: #foo
+// CHECK:STDERR: ^
+#foo
+// CHECK:STDOUT:     { index: 1, kind:      'Error', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: '#' },
+// CHECK:STDOUT:     { index: 2, kind: 'Identifier', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
 
-// CHECK:STDOUT:     { index: 8, kind:    'FileEnd', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
+// CHECK:STDOUT:     { index: 3, kind:    'FileEnd', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
 // CHECK:STDOUT:   ]

+ 4 - 4
toolchain/lex/testdata/raw_identifier.carbon

@@ -9,15 +9,15 @@
 
 // A non-keyword identifier.
 r#foo
-// CHECK:STDOUT:     { index: 1, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
+// CHECK:STDOUT:     { index: 1, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
 
 // The same non-keyword identifier, for comparison.
 foo
-// CHECK:STDOUT:     { index: 2, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
+// CHECK:STDOUT:     { index: 2, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
 
 // A keyword as a raw identifier.
 r#self
-// CHECK:STDOUT:     { index: 3, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', identifier: 1, has_trailing_space: true },
+// CHECK:STDOUT:     { index: 3, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', identifier: 2, has_trailing_space: true },
 
 // The same keyword, for comparison.
 self
@@ -25,7 +25,7 @@ self
 
 // A type literal as a raw identifier.
 r#i32
-// CHECK:STDOUT:     { index: 5, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', identifier: 2, has_trailing_space: true },
+// CHECK:STDOUT:     { index: 5, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', identifier: 3, has_trailing_space: true },
 
 // The same type literal, for comparison.
 i32