2 ani în urmă · 6742d0d048
--- a/toolchain/check/testdata/basics/raw_identifier.carbon
+++ b/toolchain/check/testdata/basics/raw_identifier.carbon
@@ -0,0 +1,45 @@
 
															+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
														
 
															+// Exceptions. See /LICENSE for license information.
														
 
															+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
														
 
															+//
														
 
															+// ARGS: compile --phase=check --dump-sem-ir %s
														
 
															+//
														
 
															+// Check that raw identifiers (`r#n`, `r#if`) work as names during checking.
														
 
															+//
														
 
															+// AUTOUPDATE
														
 
															+
														
 
															+fn A(n: i32) -> i32 {
														
 
															+  return r#n;
														
 
															+}
														
 
															+
														
 
															+fn B(r#n: i32) -> i32 {
														
 
															+  return n;
														
 
															+}
														
 
															+
														
 
															+fn C(r#if: i32) -> i32 {
														
 
															+  return r#if;
														
 
															+}
														
 
															+
														
 
															+// CHECK:STDOUT: file "raw_identifier.carbon" {
														
 
															+// CHECK:STDOUT:   %A: <function> = fn_decl @A
														
 
															+// CHECK:STDOUT:   %B: <function> = fn_decl @B
														
 
															+// CHECK:STDOUT:   %C: <function> = fn_decl @C
														
 
															+// CHECK:STDOUT: }
														
 
															+// CHECK:STDOUT:
														
 
															+// CHECK:STDOUT: fn @A(%n: i32) -> i32 {
														
 
															+// CHECK:STDOUT: !entry:
														
 
															+// CHECK:STDOUT:   %n.ref: i32 = name_reference "n", %n
														
 
															+// CHECK:STDOUT:   return %n.ref
														
 
															+// CHECK:STDOUT: }
														
 
															+// CHECK:STDOUT:
														
 
															+// CHECK:STDOUT: fn @B(%n: i32) -> i32 {
														
 
															+// CHECK:STDOUT: !entry:
														
 
															+// CHECK:STDOUT:   %n.ref: i32 = name_reference "n", %n
														
 
															+// CHECK:STDOUT:   return %n.ref
														
 
															+// CHECK:STDOUT: }
														
 
															+// CHECK:STDOUT:
														
 
															+// CHECK:STDOUT: fn @C(%if: i32) -> i32 {
														
 
															+// CHECK:STDOUT: !entry:
														
 
															+// CHECK:STDOUT:   %if.ref: i32 = name_reference "if", %if
														
 
															+// CHECK:STDOUT:   return %if.ref
														
 
															+// CHECK:STDOUT: }
														
--- a/toolchain/lex/testdata/fail_bad_raw_identifier.carbon
+++ b/toolchain/lex/testdata/fail_bad_raw_identifier.carbon
@@ -0,0 +1,28 @@
 
															+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
														
 
															+// Exceptions. See /LICENSE for license information.
														
 
															+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
														
 
															+//
														
 
															+// AUTOUPDATE
														
 
															+// CHECK:STDOUT: - filename: fail_bad_raw_identifier.carbon
														
 
															+// CHECK:STDOUT:   tokens: [
														
 
															+// CHECK:STDOUT:     { index: 0, kind:    'StartOfFile', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
														
 
															+
														
 
															+// Missing the character after `#`.
														
 
															+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
														
 
															+// CHECK:STDERR: r#
														
 
															+// CHECK:STDERR:  ^
														
 
															+r#
														
 
															+// CHECK:STDOUT:     { index: 1, kind:     'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
														
 
															+// CHECK:STDOUT:     { index: 2, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#', has_trailing_space: true },
														
 
															+
														
 
															+// Not a valid identifier.
														
 
															+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
														
 
															+// CHECK:STDERR: r#3
														
 
															+// CHECK:STDERR:  ^
														
 
															+r#3
														
 
															+// CHECK:STDOUT:     { index: 3, kind:     'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
														
 
															+// CHECK:STDOUT:     { index: 4, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
														
 
															+// CHECK:STDOUT:     { index: 5, kind: 'IntegerLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
														
 
															+
														
 
															+// CHECK:STDOUT:     { index: 6, kind:      'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
														
 
															+// CHECK:STDOUT:   ]
														
--- a/toolchain/lex/testdata/raw_identifier.carbon
+++ b/toolchain/lex/testdata/raw_identifier.carbon
@@ -0,0 +1,35 @@
 
															+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
														
 
															+// Exceptions. See /LICENSE for license information.
														
 
															+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
														
 
															+//
														
 
															+// AUTOUPDATE
														
 
															+// CHECK:STDOUT: - filename: raw_identifier.carbon
														
 
															+// CHECK:STDOUT:   tokens: [
														
 
															+// CHECK:STDOUT:     { index: 0, kind:         'StartOfFile', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
														
 
															+
														
 
															+// A non-keyword identifier.
														
 
															+r#foo
														
 
															+// CHECK:STDOUT:     { index: 1, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
														
 
															+
														
 
															+// The same non-keyword identifier, for comparison.
														
 
															+foo
														
 
															+// CHECK:STDOUT:     { index: 2, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
														
 
															+
														
 
															+// A keyword as a raw identifier.
														
 
															+r#self
														
 
															+// CHECK:STDOUT:     { index: 3, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', identifier: 1, has_trailing_space: true },
														
 
															+
														
 
															+// The same keyword, for comparison.
														
 
															+self
														
 
															+// CHECK:STDOUT:     { index: 4, kind: 'SelfValueIdentifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', has_trailing_space: true },
														
 
															+
														
 
															+// A type literal as a raw identifier.
														
 
															+r#i32
														
 
															+// CHECK:STDOUT:     { index: 5, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', identifier: 2, has_trailing_space: true },
														
 
															+
														
 
															+// The same type literal, for comparison.
														
 
															+i32
														
 
															+// CHECK:STDOUT:     { index: 6, kind:  'IntegerTypeLiteral', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', has_trailing_space: true },
														
 
															+
														
 
															+// CHECK:STDOUT:     { index: 7, kind:           'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
														
 
															+// CHECK:STDOUT:   ]
														
--- a/toolchain/lex/tokenized_buffer.cpp
+++ b/toolchain/lex/tokenized_buffer.cpp
@@ -81,13 +81,9 @@ static constexpr SIMDMaskArrayT PrefixMasks = []() constexpr {
 
															 #endif  // CARBON_USE_SIMD
														
 
															 // A table of booleans that we can use to classify bytes as being valid
														
 
															-// identifier (or keyword) characters. This is used in the generic,
														
 
															-// non-vectorized fallback code to scan for length of an identifier.
														
 
															-constexpr std::array<bool, 256> IsIdByteTable = [] {
														
 
															+// identifier start. This is used by raw identifier detection.
														
 
															+constexpr std::array<bool, 256> IsIdStartByteTable = [] {
														
 
															   std::array<bool, 256> table = {};
														
 
															-  for (char c = '0'; c <= '9'; ++c) {
														
 
															-    table[c] = true;
														
 
															-  }
														
 
															   for (char c = 'A'; c <= 'Z'; ++c) {
														
 
															     table[c] = true;
														
 
															   }
														
@@ -98,6 +94,17 @@ constexpr std::array<bool, 256> IsIdByteTable = [] {
 
															   return table;
														
 
															 }();
														
 
															+// A table of booleans that we can use to classify bytes as being valid
														
 
															+// identifier (or keyword) characters. This is used in the generic,
														
 
															+// non-vectorized fallback code to scan for length of an identifier.
														
 
															+constexpr std::array<bool, 256> IsIdByteTable = [] {
														
 
															+  std::array<bool, 256> table = IsIdStartByteTable;
														
 
															+  for (char c = '0'; c <= '9'; ++c) {
														
 
															+    table[c] = true;
														
 
															+  }
														
 
															+  return table;
														
 
															+}();
														
 
															+
														
 
															 // Baseline scalar version, also available for scalar-fallback in SIMD code.
														
 
															 // Uses `ssize_t` for performance when indexing in the loop.
														
 
															 //
														
@@ -859,8 +866,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
															       // TODO: Need to add support for Unicode lexing.
														
 
															       return LexError(source_text, position);
														
 
															     }
														
 
															-    CARBON_CHECK(IsAlpha(source_text[position]) ||
														
 
															-                 source_text[position] == '_');
														
 
															+    CARBON_CHECK(IsIdStartByteTable[source_text[position]]);
														
 
															     int column = ComputeColumn(position);
														
@@ -894,6 +900,42 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
															          .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
														
 
															   }
														
 
															+  // Lexes a token that starts with `r`: either a raw identifier of the form
														
 
															+  // `r#<identifier>`, or an ordinary keyword/identifier beginning with `r`.
														
 
															+  //
														
 
															+  // On the raw path, `position` is advanced past the `r#` prefix and the
														
 
															+  // identifier text, and an `Identifier` token is added whose string is the
														
 
															+  // text after `r#`. No keyword lookup is done on that path.
														
 
															+  auto LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
														
 
															+                                      ssize_t& position) -> LexResult {
														
 
															+    CARBON_CHECK(source_text[position] == 'r');
														
 
															+    // Raw identifiers must look like `r#<valid identifier>`, otherwise it's an
														
 
															+    // identifier starting with the 'r'.
														
 
															+    // TODO: Need to add support for Unicode lexing.
														
 
															+    //
														
 
															+    // The byte after `#` is converted to `unsigned char` before indexing the
														
 
															+    // 256-entry table: `char` may be signed, so a byte >= 0x80 would otherwise
														
 
															+    // become a negative, out-of-bounds index.
														
 
															+    if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
														
 
															+                    source_text[position + 1] != '#' ||
														
 
															+                    !IsIdStartByteTable[static_cast<unsigned char>(
														
 
															+                        source_text[position + 2])])) {
														
 
															+      // TODO: Should this print a different error when there is `r#`, but it
														
 
															+      // isn't followed by identifier text? Or is it right to put it back so
														
 
															+      // that the `#` could be parsed as part of a raw string literal?
														
 
															+      return LexKeywordOrIdentifier(source_text, position);
														
 
															+    }
														
 
															+
														
 
															+    int column = ComputeColumn(position);
														
 
															+
														
 
															+    // Take the valid characters off the front of the source buffer, skipping
														
 
															+    // the two-byte `r#` prefix.
														
 
															+    llvm::StringRef identifier_text =
														
 
															+        ScanForIdentifierPrefix(source_text.substr(position + 2));
														
 
															+    CARBON_CHECK(!identifier_text.empty())
														
 
															+        << "Must have at least one character!";
														
 
															+    position += identifier_text.size() + 2;
														
 
															+
														
 
															+    // Versus LexKeywordOrIdentifier, raw identifiers do not do keyword checks,
														
 
															+    // so the text is always recorded as a plain identifier.
														
 
															+    // TODO: This token doesn't carry any indicator that it's raw, so
														
 
															+    // diagnostics are unclear.
														
 
															+    return buffer_.AddToken(
														
 
															+        {.kind = TokenKind::Identifier,
														
 
															+         .token_line = current_line(),
														
 
															+         .column = column,
														
 
															+         .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
														
 
															+  }
														
 
															+
														
 
															   auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult {
														
 
															     llvm::StringRef error_text =
														
 
															         source_text.substr(position).take_while([](char c) {
														
@@ -1018,6 +1060,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
															   CARBON_DISPATCH_LEX_TOKEN(LexError)
														
 
															   CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
														
 
															   CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
														
 
															+  CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifierMaybeRaw)
														
 
															   CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
														
 
															   CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
														
@@ -1147,6 +1190,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
 
															     for (unsigned char c = 'a'; c <= 'z'; ++c) {
														
 
															       table[c] = &DispatchLexKeywordOrIdentifier;
														
 
															     }
														
 
															+    table['r'] = &DispatchLexKeywordOrIdentifierMaybeRaw;
														
 
															     for (unsigned char c = 'A'; c <= 'Z'; ++c) {
														
 
															       table[c] = &DispatchLexKeywordOrIdentifier;
														
 
															     }
														
--- a/toolchain/lex/tokenized_buffer_benchmark.cpp
+++ b/toolchain/lex/tokenized_buffer_benchmark.cpp
@@ -427,6 +427,66 @@ void BM_ValidKeywords(benchmark::State& state) {
 
															 }
														
 
															 BENCHMARK(BM_ValidKeywords);
														
 
															+void BM_ValidKeywordsAsRawIdentifiers(benchmark::State& state) {
														
 
															+  absl::BitGen gen;
														
 
															+  std::array<llvm::StringRef, NumTokens> tokens;
														
 
															+  for (int i : llvm::seq(NumTokens)) {
														
 
															+    tokens[i] = TokenKind::KeywordTokens[i % TokenKind::KeywordTokens.size()]
														
 
															+                    .fixed_spelling();
														
 
															+  }
														
 
															+  std::shuffle(tokens.begin(), tokens.end(), gen);
														
 
															+  std::string source("r#");
														
 
															+  source.append(llvm::join(tokens, " r#"));
														
 
															+
														
 
															+  LexerBenchHelper helper(source);
														
 
															+  for (auto _ : state) {
														
 
															+    TokenizedBuffer buffer = helper.Lex();
														
 
															+    CARBON_CHECK(!buffer.has_errors());
														
 
															+  }
														
 
															+
														
 
															+  state.SetBytesProcessed(state.iterations() * source.size());
														
 
															+  state.counters["tokens_per_second"] = benchmark::Counter(
														
 
															+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
														
 
															+}
														
 
															+BENCHMARK(BM_ValidKeywordsAsRawIdentifiers);
														
 
															+
														
 
															+// This benchmark does a 50-50 split of r-prefixed and r#-prefixed identifiers
														
 
															+// to directly compare raw and non-raw performance.
														
 
															+void BM_RawIdentifierFocus(benchmark::State& state) {
														
 
															+  const std::array<std::string, NumTokens>& ids = GetRandomIdentifiers();
														
 
															+
														
 
															+  llvm::SmallVector<std::string> modified_ids;
														
 
															+  // As we resize, start with the in-use prefix. The `r#` form keeps the whole
														
 
															+  // original identifier; the `r` form replaces its first character.
														
 
															+  modified_ids.resize(NumTokens / 2, "r#");
														
 
															+  modified_ids.resize(NumTokens, "r");
														
 
															+  for (int i : llvm::seq(NumTokens / 2)) {
														
 
															+    // Use the same identifier both ways.
														
 
															+    modified_ids[i].append(ids[i]);
														
 
															+    modified_ids[i + NumTokens / 2].append(
														
 
															+        llvm::StringRef(ids[i]).drop_front());
														
 
															+  }
														
 
															+
														
 
															+  absl::BitGen gen;
														
 
															+  std::array<llvm::StringRef, NumTokens> tokens;
														
 
															+  for (int i : llvm::seq(NumTokens)) {
														
 
															+    tokens[i] = modified_ids[i];
														
 
															+  }
														
 
															+  std::shuffle(tokens.begin(), tokens.end(), gen);
														
 
															+  std::string source = llvm::join(tokens, " ");
														
 
															+
														
 
															+  LexerBenchHelper helper(source);
														
 
															+  for (auto _ : state) {
														
 
															+    TokenizedBuffer buffer = helper.Lex();
														
 
															+    CARBON_CHECK(!buffer.has_errors());
														
 
															+  }
														
 
															+
														
 
															+  state.SetBytesProcessed(state.iterations() * source.size());
														
 
															+  state.counters["tokens_per_second"] = benchmark::Counter(
														
 
															+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
														
 
															+}
														
 
															+BENCHMARK(BM_RawIdentifierFocus);
														
 
															+
														
 
															 template <int MinLength, int MaxLength, bool Uniform>
														
 
															 void BM_ValidIdentifiers(benchmark::State& state) {
														
 
															   std::string source = RandomIdentifierSeq<MinLength, MaxLength, Uniform>();