2 years ago · 45c071f2af
--- a/toolchain/lex/lex.cpp
+++ b/toolchain/lex/lex.cpp
@@ -16,6 +16,7 @@
 
				 #include "toolchain/lex/helpers.h"
			
 
				 #include "toolchain/lex/numeric_literal.h"
			
 
				 #include "toolchain/lex/string_literal.h"
			
 
				+#include "toolchain/lex/token_kind.h"
			
 
				 #include "toolchain/lex/tokenized_buffer.h"
			
 
				 
			
 
				 #if __ARM_NEON
			
@@ -144,8 +145,7 @@ class [[clang::internal_linkage]] Lexer {
 
				   auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
			
 
				       -> LexResult;
			
 
				 
			
 
				-  auto LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
			
 
				-                                      ssize_t& position) -> LexResult;
			
 
				+  auto LexHash(llvm::StringRef source_text, ssize_t& position) -> LexResult;
			
 
				 
			
 
				   auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult;
			
 
				 
			
@@ -472,7 +472,7 @@ static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
 
				 CARBON_DISPATCH_LEX_TOKEN(LexError)
			
 
				 CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
			
 
				 CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
			
 
				-CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifierMaybeRaw)
			
 
				+CARBON_DISPATCH_LEX_TOKEN(LexHash)
			
 
				 CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
			
 
				 CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
			
 
				 
			
@@ -576,7 +576,6 @@ static constexpr auto MakeDispatchTable() -> DispatchTableT {
 
				   for (unsigned char c = 'a'; c <= 'z'; ++c) {
			
 
				     table[c] = &DispatchLexKeywordOrIdentifier;
			
 
				   }
			
 
				-  table['r'] = &DispatchLexKeywordOrIdentifierMaybeRaw;
			
 
				   for (unsigned char c = 'A'; c <= 'Z'; ++c) {
			
 
				     table[c] = &DispatchLexKeywordOrIdentifier;
			
 
				   }
			
@@ -594,7 +593,7 @@ static constexpr auto MakeDispatchTable() -> DispatchTableT {
 
				 
			
 
				   table['\''] = &DispatchLexStringLiteral;
			
 
				   table['"'] = &DispatchLexStringLiteral;
			
 
				-  table['#'] = &DispatchLexStringLiteral;
			
 
				+  table['#'] = &DispatchLexHash;
			
 
				 
			
 
				   table[' '] = &DispatchLexHorizontalWhitespace;
			
 
				   table['\t'] = &DispatchLexHorizontalWhitespace;
			
@@ -1104,40 +1103,42 @@ auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text,
 
				        .ident_id = buffer_.value_stores_->identifiers().Add(identifier_text)});
			
 
				 }
			
 
				 
			
 
				-auto Lexer::LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
			
 
				-                                           ssize_t& position) -> LexResult {
			
 
				-  CARBON_CHECK(source_text[position] == 'r');
			
 
				-  // Raw identifiers must look like `r#<valid identifier>`, otherwise it's an
			
 
				-  // identifier starting with the 'r'.
			
 
				-  // TODO: Need to add support for Unicode lexing.
			
 
				-  if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
			
 
				-                  source_text[position + 1] != '#' ||
			
 
				-                  !IsIdStartByteTable[static_cast<unsigned char>(
			
 
				-                      source_text[position + 2])])) {
			
 
				-    // TODO: Should this print a different error when there is `r#`, but it
			
 
				-    // isn't followed by identifier text? Or is it right to put it back so
			
 
				-    // that the `#` could be parsed as part of a raw string literal?
			
 
				-    return LexKeywordOrIdentifier(source_text, position);
			
 
				+auto Lexer::LexHash(llvm::StringRef source_text, ssize_t& position)
			
 
				+    -> LexResult {
			
 
				+  // For `r#`, we already lexed an `r` identifier token. Detect that case and
			
 
				+  // replace that token with a raw identifier. We do this to keep identifier
			
 
				+  // lexing as fast as possible.
			
 
				+
			
 
				+  // Look for the `r` token. Note that this is always in bounds because we
			
 
				+  // create a start of file token.
			
 
				+  auto& prev_token_info = buffer_.token_infos_.back();
			
 
				+
			
 
				+  // If the previous token isn't an `r` identifier ending right before this
			
 
				+  // `#`, or no identifier start follows the `#`, this is not a raw identifier.
			
 
				+  if (prev_token_info.kind != TokenKind::Identifier ||
			
 
				+      source_text[position - 1] != 'r' ||
			
 
				+      position + 1 == static_cast<ssize_t>(source_text.size()) ||
			
 
				+      !IsIdStartByteTable[static_cast<unsigned char>(
			
 
				+          source_text[position + 1])] ||
			
 
				+      prev_token_info.token_line != current_line() ||
			
 
				+      prev_token_info.column != ComputeColumn(position) - 1) {
			
 
				+    [[clang::musttail]] return LexStringLiteral(source_text, position);
			
 
				   }
			
 
				-
			
 
				-  int column = ComputeColumn(position);
			
 
				+  CARBON_DCHECK(buffer_.value_stores_->identifiers().Get(
			
 
				+                    prev_token_info.ident_id) == "r");
			
 
				 
			
 
				   // Take the valid characters off the front of the source buffer.
			
 
				   llvm::StringRef identifier_text =
			
 
				-      ScanForIdentifierPrefix(source_text.substr(position + 2));
			
 
				+      ScanForIdentifierPrefix(source_text.substr(position + 1));
			
 
				   CARBON_CHECK(!identifier_text.empty()) << "Must have at least one character!";
			
 
				-  position += identifier_text.size() + 2;
			
 
				+  position += 1 + identifier_text.size();
			
 
				 
			
 
				-  // Versus LexKeywordOrIdentifier, raw identifiers do not do keyword checks.
			
 
				-
			
 
				-  // Otherwise we have a raw identifier.
			
 
				+  // Replace the `r` identifier's value with the raw identifier.
			
 
				   // TODO: This token doesn't carry any indicator that it's raw, so
			
 
				   // diagnostics are unclear.
			
 
				-  return buffer_.AddToken(
			
 
				-      {.kind = TokenKind::Identifier,
			
 
				-       .token_line = current_line(),
			
 
				-       .column = column,
			
 
				-       .ident_id = buffer_.value_stores_->identifiers().Add(identifier_text)});
			
 
				+  prev_token_info.ident_id =
			
 
				+      buffer_.value_stores_->identifiers().Add(identifier_text);
			
 
				+  return LexResult(TokenIndex(buffer_.token_infos_.size() - 1));
			
 
				 }
			
 
				 
			
 
				 auto Lexer::LexError(llvm::StringRef source_text, ssize_t& position)
			
--- a/toolchain/lex/testdata/fail_bad_raw_identifier.carbon
+++ b/toolchain/lex/testdata/fail_bad_raw_identifier.carbon
@@ -3,9 +3,12 @@
 
				 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				 //
			
 
				 // AUTOUPDATE
			
 
				+
			
 
				+// --- fail_bad_raw_identifier.carbon
			
 
				 // CHECK:STDOUT: - filename: fail_bad_raw_identifier.carbon
			
 
				 // CHECK:STDOUT:   tokens: [
			
 
				-// CHECK:STDOUT:     { index: 0, kind:  'FileStart', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index:  0, kind:     'FileStart', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
			
 
				+
			
 
				 
			
 
				 // Missing the character after `#`.
			
 
				 // CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
			
@@ -13,8 +16,8 @@
 
				 // CHECK:STDERR:  ^
			
 
				 // CHECK:STDERR:
			
 
				 r#
			
 
				-// CHECK:STDOUT:     { index: 1, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				-// CHECK:STDOUT:     { index: 2, kind:      'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#', has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index:  1, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				+// CHECK:STDOUT:     { index:  2, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#', has_trailing_space: true },
			
 
				 
			
 
				 // Not a valid identifier.
			
 
				 // CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
			
@@ -22,17 +25,78 @@ r#
 
				 // CHECK:STDERR:  ^
			
 
				 // CHECK:STDERR:
			
 
				 r#3
			
 
				-// CHECK:STDOUT:     { index: 3, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				-// CHECK:STDOUT:     { index: 4, kind:      'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
			
 
				-// CHECK:STDOUT:     { index: 5, kind: 'IntLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index:  3, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				+// CHECK:STDOUT:     { index:  4, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
			
 
				+// CHECK:STDOUT:     { index:  5, kind:    'IntLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
			
 
				 
			
 
				 // Non ascii start to identifier.
			
 
				-// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
			
 
				 // CHECK:STDERR: r#á
			
 
				 // CHECK:STDERR:  ^
			
 
				+// CHECK:STDERR:
			
 
				 r#á
			
 
				-// CHECK:STDOUT:     { index: 6, kind: 'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				-// CHECK:STDOUT:     { index: 7, kind:      'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#á', has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index:  6, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				+// CHECK:STDOUT:     { index:  7, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#á', has_trailing_space: true },
			
 
				+
			
 
				+// Raw `r` identifier doesn't start a second raw identifier.
			
 
				+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:4: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: r#r#foo
			
 
				+// CHECK:STDERR:    ^
			
 
				+// CHECK:STDERR:
			
 
				+r#r#foo
			
 
				+// CHECK:STDOUT:     { index:  8, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				+// CHECK:STDOUT:     { index:  9, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  4, indent: 1, spelling: '#' },
			
 
				+// CHECK:STDOUT:     { index: 10, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  5, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
			
 
				+
			
 
				+// Other identifier characters don't start a raw identifier.
			
 
				+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:2: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: s#foo
			
 
				+// CHECK:STDERR:  ^
			
 
				+// CHECK:STDERR:
			
 
				+s#foo
			
 
				+// CHECK:STDOUT:     { index: 11, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 's', identifier: 2 },
			
 
				+// CHECK:STDOUT:     { index: 12, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
			
 
				+// CHECK:STDOUT:     { index: 13, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
			
 
				+
			
 
				+// Identifier ending in `r` doesn't start a raw identifier.
			
 
				+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:4: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: arr#foo
			
 
				+// CHECK:STDERR:    ^
			
 
				+// CHECK:STDERR:
			
 
				+arr#foo
			
 
				+// CHECK:STDOUT:     { index: 14, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'arr', identifier: 3 },
			
 
				+// CHECK:STDOUT:     { index: 15, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  4, indent: 1, spelling: '#' },
			
 
				+// CHECK:STDOUT:     { index: 16, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  5, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
			
 
				+
			
 
				+// Whitespace between `r` and `#` isn't allowed.
			
 
				+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+4]]:3: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: r #foo
			
 
				+// CHECK:STDERR:   ^
			
 
				+// CHECK:STDERR:
			
 
				+r #foo
			
 
				+// CHECK:STDOUT:     { index: 17, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0, has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index: 18, kind:         'Error', line: {{ *}}[[@LINE-2]], column:  3, indent: 1, spelling: '#' },
			
 
				+// CHECK:STDOUT:     { index: 19, kind:    'Identifier', line: {{ *}}[[@LINE-3]], column:  4, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
			
 
				+
			
 
				+// This is an `r` identifier followed by a string literal.
			
 
				+r#"hello"#
			
 
				+// CHECK:STDOUT:     { index: 20, kind:    'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				+// CHECK:STDOUT:     { index: 21, kind: 'StringLiteral', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#"hello"#', value: `hello`, has_trailing_space: true },
			
 
				+
			
 
				+// CHECK:STDOUT:     { index: 22, kind:       'FileEnd', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
			
 
				+// CHECK:STDOUT:   ]
			
 
				+// --- fail_hash_at_start_of_file.carbon
			
 
				+// CHECK:STDOUT: - filename: fail_hash_at_start_of_file.carbon
			
 
				+// CHECK:STDOUT:   tokens: [
			
 
				+// CHECK:STDOUT:     { index: 0, kind:  'FileStart', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
			
 
				+
			
 
				+// Ensure that we correctly handle a `#` as the first token in the file.
			
 
				+// CHECK:STDERR: fail_hash_at_start_of_file.carbon:[[@LINE+3]]:1: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: #foo
			
 
				+// CHECK:STDERR: ^
			
 
				+#foo
			
 
				+// CHECK:STDOUT:     { index: 1, kind:      'Error', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: '#' },
			
 
				+// CHECK:STDOUT:     { index: 2, kind: 'Identifier', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
			
 
				 
			
 
				-// CHECK:STDOUT:     { index: 8, kind:    'FileEnd', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
			
 
				+// CHECK:STDOUT:     { index: 3, kind:    'FileEnd', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
			
 
				 // CHECK:STDOUT:   ]
			
--- a/toolchain/lex/testdata/raw_identifier.carbon
+++ b/toolchain/lex/testdata/raw_identifier.carbon
@@ -9,15 +9,15 @@
 
				 
			
 
				 // A non-keyword identifier.
			
 
				 r#foo
			
 
				-// CHECK:STDOUT:     { index: 1, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index: 1, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
			
 
				 
			
 
				 // The same non-keyword identifier, for comparison.
			
 
				 foo
			
 
				-// CHECK:STDOUT:     { index: 2, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index: 2, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 1, has_trailing_space: true },
			
 
				 
			
 
				 // A keyword as a raw identifier.
			
 
				 r#self
			
 
				-// CHECK:STDOUT:     { index: 3, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', identifier: 1, has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index: 3, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', identifier: 2, has_trailing_space: true },
			
 
				 
			
 
				 // The same keyword, for comparison.
			
 
				 self
			
@@ -25,7 +25,7 @@ self
 
				 
			
 
				 // A type literal as a raw identifier.
			
 
				 r#i32
			
 
				-// CHECK:STDOUT:     { index: 5, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', identifier: 2, has_trailing_space: true },
			
 
				+// CHECK:STDOUT:     { index: 5, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', identifier: 3, has_trailing_space: true },
			
 
				 
			
 
				 // The same type literal, for comparison.
			
 
				 i32