2 vuotta sitten · 6742d0d048
--- a/toolchain/check/testdata/basics/raw_identifier.carbon
+++ b/toolchain/check/testdata/basics/raw_identifier.carbon
@@ -0,0 +1,45 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+//
			
 
				+// ARGS: compile --phase=check --dump-sem-ir %s
			
 
				+//
			
 
				+// Check that raw identifiers (`r#name`) refer to the same name as `name`.
			
 
				+//
			
 
				+// AUTOUPDATE
			
 
				+
			
 
				+fn A(n: i32) -> i32 {  // Parameter declared with the plain spelling `n`.
			
 
				+  return r#n;  // Raw spelling; expected to reference the parameter `n`.
			
 
				+}
			
 
				+
			
 
				+fn B(r#n: i32) -> i32 {  // Parameter declared with the raw spelling `r#n`.
			
 
				+  return n;  // Plain spelling; expected to reference the same parameter.
			
 
				+}
			
 
				+
			
 
				+fn C(r#if: i32) -> i32 {  // `r#if` uses the keyword `if` as a parameter name.
			
 
				+  return r#if;  // The keyword-named parameter must be referenced raw as well.
			
 
				+}
			
 
				+
			
 
				+// CHECK:STDOUT: file "raw_identifier.carbon" {
			
 
				+// CHECK:STDOUT:   %A: <function> = fn_decl @A
			
 
				+// CHECK:STDOUT:   %B: <function> = fn_decl @B
			
 
				+// CHECK:STDOUT:   %C: <function> = fn_decl @C
			
 
				+// CHECK:STDOUT: }
			
 
				+// CHECK:STDOUT:
			
 
				+// CHECK:STDOUT: fn @A(%n: i32) -> i32 {
			
 
				+// CHECK:STDOUT: !entry:
			
 
				+// CHECK:STDOUT:   %n.ref: i32 = name_reference "n", %n
			
 
				+// CHECK:STDOUT:   return %n.ref
			
 
				+// CHECK:STDOUT: }
			
 
				+// CHECK:STDOUT:
			
 
				+// CHECK:STDOUT: fn @B(%n: i32) -> i32 {
			
 
				+// CHECK:STDOUT: !entry:
			
 
				+// CHECK:STDOUT:   %n.ref: i32 = name_reference "n", %n
			
 
				+// CHECK:STDOUT:   return %n.ref
			
 
				+// CHECK:STDOUT: }
			
 
				+// CHECK:STDOUT:
			
 
				+// CHECK:STDOUT: fn @C(%if: i32) -> i32 {
			
 
				+// CHECK:STDOUT: !entry:
			
 
				+// CHECK:STDOUT:   %if.ref: i32 = name_reference "if", %if
			
 
				+// CHECK:STDOUT:   return %if.ref
			
 
				+// CHECK:STDOUT: }
			
--- a/toolchain/lex/testdata/fail_bad_raw_identifier.carbon
+++ b/toolchain/lex/testdata/fail_bad_raw_identifier.carbon
@@ -0,0 +1,28 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+//
			
 
				+// AUTOUPDATE
			
 
				+// CHECK:STDOUT: - filename: fail_bad_raw_identifier.carbon
			
 
				+// CHECK:STDOUT:   tokens: [
			
 
				+// CHECK:STDOUT:     { index: 0, kind:    'StartOfFile', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
			
 
				+
			
 
				+// Missing the character after `#`.
			
 
				+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: r#
			
 
				+// CHECK:STDERR:  ^
			
 
				+r#
			
 
				+// CHECK:STDOUT:     { index: 1, kind:     'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				+// CHECK:STDOUT:     { index: 2, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#', has_trailing_space: true },
			
 
				+
			
 
				+// Not a valid identifier.
			
 
				+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
			
 
				+// CHECK:STDERR: r#3
			
 
				+// CHECK:STDERR:  ^
			
 
				+r#3
			
 
				+// CHECK:STDOUT:     { index: 3, kind:     'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
			
 
				+// CHECK:STDOUT:     { index: 4, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
			
 
				+// CHECK:STDOUT:     { index: 5, kind: 'IntegerLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
			
 
				+
			
 
				+// CHECK:STDOUT:     { index: 6, kind:      'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
			
 
				+// CHECK:STDOUT:   ]
			
--- a/toolchain/lex/testdata/raw_identifier.carbon
+++ b/toolchain/lex/testdata/raw_identifier.carbon
@@ -0,0 +1,35 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+//
			
 
				+// AUTOUPDATE
			
 
				+// CHECK:STDOUT: - filename: raw_identifier.carbon
			
 
				+// CHECK:STDOUT:   tokens: [
			
 
				+// CHECK:STDOUT:     { index: 0, kind:         'StartOfFile', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
			
 
				+
			
 
				+// A non-keyword identifier.
			
 
				+r#foo
			
 
				+// CHECK:STDOUT:     { index: 1, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
			
 
				+
			
 
				+// The same non-keyword identifier, for comparison.
			
 
				+foo
			
 
				+// CHECK:STDOUT:     { index: 2, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
			
 
				+
			
 
				+// A keyword as a raw identifier.
			
 
				+r#self
			
 
				+// CHECK:STDOUT:     { index: 3, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', identifier: 1, has_trailing_space: true },
			
 
				+
			
 
				+// The same keyword, for comparison.
			
 
				+self
			
 
				+// CHECK:STDOUT:     { index: 4, kind: 'SelfValueIdentifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', has_trailing_space: true },
			
 
				+
			
 
				+// A type literal as a raw identifier.
			
 
				+r#i32
			
 
				+// CHECK:STDOUT:     { index: 5, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', identifier: 2, has_trailing_space: true },
			
 
				+
			
 
				+// The same type literal, for comparison.
			
 
				+i32
			
 
				+// CHECK:STDOUT:     { index: 6, kind:  'IntegerTypeLiteral', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', has_trailing_space: true },
			
 
				+
			
 
				+// CHECK:STDOUT:     { index: 7, kind:           'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
			
 
				+// CHECK:STDOUT:   ]
			
--- a/toolchain/lex/tokenized_buffer.cpp
+++ b/toolchain/lex/tokenized_buffer.cpp
@@ -81,13 +81,9 @@ static constexpr SIMDMaskArrayT PrefixMasks = []() constexpr {
 
				 #endif  // CARBON_USE_SIMD
			
 
				 
			
 
				 // A table of booleans that we can use to classify bytes as being valid
			
 
				-// identifier (or keyword) characters. This is used in the generic,
			
 
				-// non-vectorized fallback code to scan for length of an identifier.
			
 
				-constexpr std::array<bool, 256> IsIdByteTable = [] {
			
 
				+// identifier start. This is used by raw identifier detection.
			
 
				+constexpr std::array<bool, 256> IsIdStartByteTable = [] {
			
 
				   std::array<bool, 256> table = {};
			
 
				-  for (char c = '0'; c <= '9'; ++c) {
			
 
				-    table[c] = true;
			
 
				-  }
			
 
				   for (char c = 'A'; c <= 'Z'; ++c) {
			
 
				     table[c] = true;
			
 
				   }
			
@@ -98,6 +94,17 @@ constexpr std::array<bool, 256> IsIdByteTable = [] {
 
				   return table;
			
 
				 }();
			
 
				 
			
 
				+// Per-byte lookup table marking which byte values may appear anywhere in an
				+// identifier (or keyword). The generic, non-vectorized fallback code uses it
				+// to scan for the length of an identifier.
				+constexpr std::array<bool, 256> IsIdByteTable = [] {
				+  // Begin with every byte that may start an identifier, then additionally
				+  // accept the decimal digits.
				+  std::array<bool, 256> id_bytes = IsIdStartByteTable;
				+  for (unsigned char digit = '0'; digit <= '9'; ++digit) {
				+    id_bytes[digit] = true;
				+  }
				+  return id_bytes;
				+}();
				+
			
 
				+
			
 
				 // Baseline scalar version, also available for scalar-fallback in SIMD code.
			
 
				 // Uses `ssize_t` for performance when indexing in the loop.
			
 
				 //
			
@@ -859,8 +866,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				       // TODO: Need to add support for Unicode lexing.
			
 
				       return LexError(source_text, position);
			
 
				     }
			
 
				-    CARBON_CHECK(IsAlpha(source_text[position]) ||
			
 
				-                 source_text[position] == '_');
			
 
				+    CARBON_CHECK(IsIdStartByteTable[source_text[position]]);
			
 
				 
			
 
				     int column = ComputeColumn(position);
			
 
				 
			
@@ -894,6 +900,42 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				          .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
			
 
				   }
			
 
				 
			
 
				+  // Lexes a token starting with `r`: either a raw identifier spelled
				+  // `r#<identifier>`, or an ordinary keyword or identifier.
				+  //
				+  // If the text at `position` is not `r#` followed by an identifier-start
				+  // byte, this forwards to `LexKeywordOrIdentifier`. Otherwise it advances
				+  // `position` past the `r#` prefix and the identifier text, and adds an
				+  // `Identifier` token whose string excludes the prefix and whose column is
				+  // that of the leading `r`.
				+  auto LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
				+                                      ssize_t& position) -> LexResult {
				+    CARBON_CHECK(source_text[position] == 'r');
				+    // Raw identifiers must look like `r#<valid identifier>`, otherwise it's an
				+    // identifier starting with the 'r'.
				+    //
				+    // The candidate start byte is converted to `unsigned char` before it is
				+    // used as a table index: where `char` is signed, a byte >= 0x80 would
				+    // otherwise become an out-of-range index into the 256-entry table.
				+    // TODO: Need to add support for Unicode lexing.
				+    if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
				+                    source_text[position + 1] != '#' ||
				+                    !IsIdStartByteTable[static_cast<unsigned char>(
				+                        source_text[position + 2])])) {
				+      // TODO: Should this print a different error when there is `r#`, but it
				+      // isn't followed by identifier text? Or is it right to put it back so
				+      // that the `#` could be parsed as part of a raw string literal?
				+      return LexKeywordOrIdentifier(source_text, position);
				+    }
				+
				+    // Record the column of the leading `r` before `position` is advanced.
				+    int column = ComputeColumn(position);
				+
				+    // Take the valid characters off the front of the source buffer, skipping
				+    // the two-byte `r#` prefix.
				+    llvm::StringRef identifier_text =
				+        ScanForIdentifierPrefix(source_text.substr(position + 2));
				+    CARBON_CHECK(!identifier_text.empty())
				+        << "Must have at least one character!";
				+    position += identifier_text.size() + 2;
				+
				+    // Versus LexKeywordOrIdentifier, raw identifiers do not do keyword checks:
				+    // the text is always added as an identifier.
				+    // TODO: This token doesn't carry any indicator that it's raw, so
				+    // diagnostics are unclear.
				+    return buffer_.AddToken(
				+        {.kind = TokenKind::Identifier,
				+         .token_line = current_line(),
				+         .column = column,
				+         .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
				+  }
			
 
				+
			
 
				   auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult {
			
 
				     llvm::StringRef error_text =
			
 
				         source_text.substr(position).take_while([](char c) {
			
@@ -1018,6 +1060,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				   CARBON_DISPATCH_LEX_TOKEN(LexError)
			
 
				   CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
			
 
				   CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
			
 
				+  CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifierMaybeRaw)
			
 
				   CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
			
 
				   CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
			
 
				 
			
@@ -1147,6 +1190,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
				     for (unsigned char c = 'a'; c <= 'z'; ++c) {
			
 
				       table[c] = &DispatchLexKeywordOrIdentifier;
			
 
				     }
			
 
				+    table['r'] = &DispatchLexKeywordOrIdentifierMaybeRaw;
			
 
				     for (unsigned char c = 'A'; c <= 'Z'; ++c) {
			
 
				       table[c] = &DispatchLexKeywordOrIdentifier;
			
 
				     }
			
--- a/toolchain/lex/tokenized_buffer_benchmark.cpp
+++ b/toolchain/lex/tokenized_buffer_benchmark.cpp
@@ -427,6 +427,66 @@ void BM_ValidKeywords(benchmark::State& state) {
 
				 }
			
 
				 BENCHMARK(BM_ValidKeywords);
			
 
				 
			
 
				+// Benchmarks lexing keyword spellings written as raw identifiers: the source
				+// is `NumTokens` shuffled keyword spellings, each prefixed with `r#` and
				+// separated by single spaces.
				+void BM_ValidKeywordsAsRawIdentifiers(benchmark::State& state) {
				+  // Cycle through the keyword list until `NumTokens` spellings are collected.
				+  std::array<llvm::StringRef, NumTokens> spellings;
				+  const auto keyword_count = TokenKind::KeywordTokens.size();
				+  for (int index : llvm::seq(NumTokens)) {
				+    spellings[index] =
				+        TokenKind::KeywordTokens[index % keyword_count].fixed_spelling();
				+  }
				+
				+  absl::BitGen gen;
				+  std::shuffle(spellings.begin(), spellings.end(), gen);
				+
				+  // The separator supplies the `r#` prefix for every spelling but the first,
				+  // so that prefix is prepended explicitly.
				+  std::string source = "r#" + llvm::join(spellings, " r#");
				+
				+  LexerBenchHelper helper(source);
				+  for (auto _ : state) {
				+    TokenizedBuffer buffer = helper.Lex();
				+    CARBON_CHECK(!buffer.has_errors());
				+  }
				+
				+  state.SetBytesProcessed(state.iterations() * source.size());
				+  state.counters["tokens_per_second"] = benchmark::Counter(
				+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
				+}
				+BENCHMARK(BM_ValidKeywordsAsRawIdentifiers);
			
 
				+
			
 
				+// Benchmarks an even mix of `r#`-prefixed (raw) and plain `r`-prefixed
				+// identifiers built from the same random identifiers, to directly compare
				+// raw and non-raw performance.
				+void BM_RawIdentifierFocus(benchmark::State& state) {
				+  const std::array<std::string, NumTokens>& ids = GetRandomIdentifiers();
				+  const auto half = NumTokens / 2;
				+
				+  // Seed each slot with its prefix while sizing the vector: the first half
				+  // starts as `r#`, the second half as `r`.
				+  llvm::SmallVector<std::string> modified_ids;
				+  modified_ids.resize(half, "r#");
				+  modified_ids.resize(NumTokens, "r");
				+  for (int index : llvm::seq(half)) {
				+    const std::string& original = ids[index];
				+    // Raw form: `r#` followed by the whole identifier.
				+    modified_ids[index].append(original);
				+    // Plain form: the same identifier with its first character replaced by
				+    // the `r` already stored in the slot.
				+    modified_ids[index + half].append(llvm::StringRef(original).drop_front());
				+  }
				+
				+  // Shuffle views of the strings so the two forms are interleaved.
				+  std::array<llvm::StringRef, NumTokens> tokens;
				+  std::copy(modified_ids.begin(), modified_ids.end(), tokens.begin());
				+  absl::BitGen gen;
				+  std::shuffle(tokens.begin(), tokens.end(), gen);
				+  std::string source = llvm::join(tokens, " ");
				+
				+  LexerBenchHelper helper(source);
				+  for (auto _ : state) {
				+    TokenizedBuffer buffer = helper.Lex();
				+    CARBON_CHECK(!buffer.has_errors());
				+  }
				+
				+  state.SetBytesProcessed(state.iterations() * source.size());
				+  state.counters["tokens_per_second"] = benchmark::Counter(
				+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
				+}
				+BENCHMARK(BM_RawIdentifierFocus);
			
 
				+
			
 
				 template <int MinLength, int MaxLength, bool Uniform>
			
 
				 void BM_ValidIdentifiers(benchmark::State& state) {
			
 
				   std::string source = RandomIdentifierSeq<MinLength, MaxLength, Uniform>();