Răsfoiți Sursa

Add partial raw identifier support. (#3344)

I'm looking at this due to the conversation on #3341. Although
diagnostics aren't where they should be, I thought it may help to start
adding raw identifier support (which may also help show how I was
thinking about this).

Regarding the TODO on how to form the token: `GetTokenText` returns
the `string_id`'s reference value for an `Identifier`. So to make
`GetTokenText` work in a way that returns `r#foo` for a raw identifier,
I think there are a few options:

1. Add additional data indicating the end of the identifier.
2. Add `RawIdentifier` as a token kind to indicate that it's raw and
should be prefixed with `r#` (but also giving later stages one more
token kind to handle)
3. Make the `string_id` correspond to `r#foo`, and have later stages add
`foo` to the strings table whenever `r#foo` is encountered (with map
lookups leading to deduplication).
4. Add `StringId::RawKeyword` special values for each keyword.
- This would mean `self` prints as `self`, `r#self` prints as `r#self`,
but `r#foo` is not a keyword so prints as `foo`.
- This means keywords would need to be listed in a place `StringId` can
depend on them, one way or the other (e.g., a `keywords.def` file in
`base/` should work).
5. Say that it _is_ an `Identifier`, and if it's a keyword spelling, it
must have been a raw identifier.
- Same limitation as above: This would mean `self` prints as `self`,
`r#self` prints as `r#self`, but `r#foo` is not a keyword so prints as
`foo`.

I'm hoping to resolve this issue separately though. :)
Jon Ross-Perkins 2 ani în urmă
părinte
comite
6742d0d048

+ 45 - 0
toolchain/check/testdata/basics/raw_identifier.carbon

@@ -0,0 +1,45 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ARGS: compile --phase=check --dump-sem-ir %s
+//
+// Check that raw identifiers (`r#n`) resolve like their non-raw spelling.
+//
+// AUTOUPDATE
+
+fn A(n: i32) -> i32 {
+  return r#n;
+}
+
+fn B(r#n: i32) -> i32 {
+  return n;
+}
+
+fn C(r#if: i32) -> i32 {
+  return r#if;
+}
+
+// CHECK:STDOUT: file "raw_identifier.carbon" {
+// CHECK:STDOUT:   %A: <function> = fn_decl @A
+// CHECK:STDOUT:   %B: <function> = fn_decl @B
+// CHECK:STDOUT:   %C: <function> = fn_decl @C
+// CHECK:STDOUT: }
+// CHECK:STDOUT:
+// CHECK:STDOUT: fn @A(%n: i32) -> i32 {
+// CHECK:STDOUT: !entry:
+// CHECK:STDOUT:   %n.ref: i32 = name_reference "n", %n
+// CHECK:STDOUT:   return %n.ref
+// CHECK:STDOUT: }
+// CHECK:STDOUT:
+// CHECK:STDOUT: fn @B(%n: i32) -> i32 {
+// CHECK:STDOUT: !entry:
+// CHECK:STDOUT:   %n.ref: i32 = name_reference "n", %n
+// CHECK:STDOUT:   return %n.ref
+// CHECK:STDOUT: }
+// CHECK:STDOUT:
+// CHECK:STDOUT: fn @C(%if: i32) -> i32 {
+// CHECK:STDOUT: !entry:
+// CHECK:STDOUT:   %if.ref: i32 = name_reference "if", %if
+// CHECK:STDOUT:   return %if.ref
+// CHECK:STDOUT: }

+ 28 - 0
toolchain/lex/testdata/fail_bad_raw_identifier.carbon

@@ -0,0 +1,28 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AUTOUPDATE
+// CHECK:STDOUT: - filename: fail_bad_raw_identifier.carbon
+// CHECK:STDOUT:   tokens: [
+// CHECK:STDOUT:     { index: 0, kind:    'StartOfFile', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
+
+// Missing the character after `#`.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: r#
+// CHECK:STDERR:  ^
+r#
+// CHECK:STDOUT:     { index: 1, kind:     'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index: 2, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#', has_trailing_space: true },
+
+// Not a valid identifier.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: r#3
+// CHECK:STDERR:  ^
+r#3
+// CHECK:STDOUT:     { index: 3, kind:     'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index: 4, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
+// CHECK:STDOUT:     { index: 5, kind: 'IntegerLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
+
+// CHECK:STDOUT:     { index: 6, kind:      'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
+// CHECK:STDOUT:   ]

+ 35 - 0
toolchain/lex/testdata/raw_identifier.carbon

@@ -0,0 +1,35 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AUTOUPDATE
+// CHECK:STDOUT: - filename: raw_identifier.carbon
+// CHECK:STDOUT:   tokens: [
+// CHECK:STDOUT:     { index: 0, kind:         'StartOfFile', line: {{ *\d+}}, column:  1, indent: 1, spelling: '', has_trailing_space: true },
+
+// A non-keyword identifier.
+r#foo
+// CHECK:STDOUT:     { index: 1, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
+
+// The same non-keyword identifier, for comparison.
+foo
+// CHECK:STDOUT:     { index: 2, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'foo', identifier: 0, has_trailing_space: true },
+
+// A keyword as a raw identifier.
+r#self
+// CHECK:STDOUT:     { index: 3, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', identifier: 1, has_trailing_space: true },
+
+// The same keyword, for comparison.
+self
+// CHECK:STDOUT:     { index: 4, kind: 'SelfValueIdentifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'self', has_trailing_space: true },
+
+// A type literal as a raw identifier.
+r#i32
+// CHECK:STDOUT:     { index: 5, kind:          'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', identifier: 2, has_trailing_space: true },
+
+// The same type literal, for comparison.
+i32
+// CHECK:STDOUT:     { index: 6, kind:  'IntegerTypeLiteral', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'i32', has_trailing_space: true },
+
+// CHECK:STDOUT:     { index: 7, kind:           'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
+// CHECK:STDOUT:   ]

+ 52 - 8
toolchain/lex/tokenized_buffer.cpp

@@ -81,13 +81,9 @@ static constexpr SIMDMaskArrayT PrefixMasks = []() constexpr {
 #endif  // CARBON_USE_SIMD
 #endif  // CARBON_USE_SIMD
 
 
 // A table of booleans that we can use to classify bytes as being valid
 // A table of booleans that we can use to classify bytes as being valid
-// identifier (or keyword) characters. This is used in the generic,
-// non-vectorized fallback code to scan for length of an identifier.
-constexpr std::array<bool, 256> IsIdByteTable = [] {
+// identifier start characters. This is used by raw identifier detection, and
+// is the starting point for `IsIdByteTable`.
+constexpr std::array<bool, 256> IsIdStartByteTable = [] {
   std::array<bool, 256> table = {};
   std::array<bool, 256> table = {};
-  for (char c = '0'; c <= '9'; ++c) {
-    table[c] = true;
-  }
   for (char c = 'A'; c <= 'Z'; ++c) {
   for (char c = 'A'; c <= 'Z'; ++c) {
     table[c] = true;
     table[c] = true;
   }
   }
@@ -98,6 +94,17 @@ constexpr std::array<bool, 256> IsIdByteTable = [] {
   return table;
   return table;
 }();
 }();
 
 
+// A table of booleans that we can use to classify bytes as being valid
+// identifier (or keyword) characters. This is used in the generic,
+// non-vectorized fallback code to scan for length of an identifier.
+constexpr std::array<bool, 256> IsIdByteTable = [] {
+  std::array<bool, 256> table = IsIdStartByteTable;
+  for (char c = '0'; c <= '9'; ++c) {
+    table[c] = true;
+  }
+  return table;
+}();
+
 // Baseline scalar version, also available for scalar-fallback in SIMD code.
 // Baseline scalar version, also available for scalar-fallback in SIMD code.
 // Uses `ssize_t` for performance when indexing in the loop.
 // Uses `ssize_t` for performance when indexing in the loop.
 //
 //
@@ -859,8 +866,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
       // TODO: Need to add support for Unicode lexing.
       // TODO: Need to add support for Unicode lexing.
       return LexError(source_text, position);
       return LexError(source_text, position);
     }
     }
-    CARBON_CHECK(IsAlpha(source_text[position]) ||
-                 source_text[position] == '_');
+    CARBON_CHECK(IsIdStartByteTable[source_text[position]]);
 
 
     int column = ComputeColumn(position);
     int column = ComputeColumn(position);
 
 
@@ -894,6 +900,42 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
          .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
          .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
   }
   }
 
 
+  // Lexes a token starting with `r`, which is either a raw identifier of the
+  // form `r#<identifier>` or an ordinary keyword/identifier beginning with `r`.
+  //
+  // For a raw identifier, `position` is advanced past the `r#` prefix and the
+  // identifier text, and the token is always an `Identifier`, even when the
+  // text is spelled like a keyword. Only the text after `r#` is added to the
+  // strings table. Anything else is forwarded to `LexKeywordOrIdentifier`.
+  auto LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
+                                      ssize_t& position) -> LexResult {
+    CARBON_CHECK(source_text[position] == 'r');
+    // Raw identifiers must look like `r#<valid identifier>`, otherwise it's an
+    // identifier starting with the 'r'.
+    // TODO: Need to add support for Unicode lexing.
+    //
+    // The table index is cast to `unsigned char` because `char` may be signed:
+    // a non-ASCII byte after `r#` would otherwise be a negative, out-of-bounds
+    // index.
+    if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
+                    source_text[position + 1] != '#' ||
+                    !IsIdStartByteTable[static_cast<unsigned char>(
+                        source_text[position + 2])])) {
+      // TODO: Should this print a different error when there is `r#`, but it
+      // isn't followed by identifier text? Or is it right to put it back so
+      // that the `#` could be parsed as part of a raw string literal?
+      return LexKeywordOrIdentifier(source_text, position);
+    }
+
+    int column = ComputeColumn(position);
+
+    // Take the valid characters off the front of the source buffer, skipping
+    // the two-character `r#` prefix.
+    llvm::StringRef identifier_text =
+        ScanForIdentifierPrefix(source_text.substr(position + 2));
+    CARBON_CHECK(!identifier_text.empty())
+        << "Must have at least one character!";
+    position += identifier_text.size() + 2;
+
+    // Unlike LexKeywordOrIdentifier, raw identifiers skip the keyword check, so
+    // a keyword spelling such as `r#self` still produces an `Identifier` token.
+    // TODO: This token doesn't carry any indicator that it's raw, so
+    // diagnostics are unclear.
+    return buffer_.AddToken(
+        {.kind = TokenKind::Identifier,
+         .token_line = current_line(),
+         .column = column,
+         .string_id = buffer_.value_stores_->strings().Add(identifier_text)});
+  }
+
   auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult {
   auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult {
     llvm::StringRef error_text =
     llvm::StringRef error_text =
         source_text.substr(position).take_while([](char c) {
         source_text.substr(position).take_while([](char c) {
@@ -1018,6 +1060,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
   CARBON_DISPATCH_LEX_TOKEN(LexError)
   CARBON_DISPATCH_LEX_TOKEN(LexError)
   CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
   CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
   CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
   CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
+  CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifierMaybeRaw)
   CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
   CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
   CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
   CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
 
 
@@ -1147,6 +1190,7 @@ class [[clang::internal_linkage]] TokenizedBuffer::Lexer {
     for (unsigned char c = 'a'; c <= 'z'; ++c) {
     for (unsigned char c = 'a'; c <= 'z'; ++c) {
       table[c] = &DispatchLexKeywordOrIdentifier;
       table[c] = &DispatchLexKeywordOrIdentifier;
     }
     }
+    table['r'] = &DispatchLexKeywordOrIdentifierMaybeRaw;
     for (unsigned char c = 'A'; c <= 'Z'; ++c) {
     for (unsigned char c = 'A'; c <= 'Z'; ++c) {
       table[c] = &DispatchLexKeywordOrIdentifier;
       table[c] = &DispatchLexKeywordOrIdentifier;
     }
     }

+ 60 - 0
toolchain/lex/tokenized_buffer_benchmark.cpp

@@ -427,6 +427,66 @@ void BM_ValidKeywords(benchmark::State& state) {
 }
 }
 BENCHMARK(BM_ValidKeywords);
 BENCHMARK(BM_ValidKeywords);
 
 
+void BM_ValidKeywordsAsRawIdentifiers(benchmark::State& state) {
+  absl::BitGen gen;
+  std::array<llvm::StringRef, NumTokens> tokens;
+  for (int i : llvm::seq(NumTokens)) {
+    tokens[i] = TokenKind::KeywordTokens[i % TokenKind::KeywordTokens.size()]
+                    .fixed_spelling();
+  }
+  std::shuffle(tokens.begin(), tokens.end(), gen);
+  std::string source("r#");
+  source.append(llvm::join(tokens, " r#"));
+
+  LexerBenchHelper helper(source);
+  for (auto _ : state) {
+    TokenizedBuffer buffer = helper.Lex();
+    CARBON_CHECK(!buffer.has_errors());
+  }
+
+  state.SetBytesProcessed(state.iterations() * source.size());
+  state.counters["tokens_per_second"] = benchmark::Counter(
+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_ValidKeywordsAsRawIdentifiers);
+
+// This benchmark does a 50-50 split of r-prefixed and r#-prefixed identifiers
+// to directly compare raw and non-raw performance.
+void BM_RawIdentifierFocus(benchmark::State& state) {
+  const std::array<std::string, NumTokens>& ids = GetRandomIdentifiers();
+
+  llvm::SmallVector<std::string> modified_ids;
+  // As we resize, start with the in-use prefix. Note that `r#` is followed by
+  // the whole original identifier, whereas `r` replaces its first character.
+  modified_ids.resize(NumTokens / 2, "r#");
+  modified_ids.resize(NumTokens, "r");
+  for (int i : llvm::seq(NumTokens / 2)) {
+    // Use the same identifier both ways.
+    modified_ids[i].append(ids[i]);
+    modified_ids[i + NumTokens / 2].append(
+        llvm::StringRef(ids[i]).drop_front());
+  }
+
+  absl::BitGen gen;
+  std::array<llvm::StringRef, NumTokens> tokens;
+  for (int i : llvm::seq(NumTokens)) {
+    tokens[i] = modified_ids[i];
+  }
+  std::shuffle(tokens.begin(), tokens.end(), gen);
+  std::string source = llvm::join(tokens, " ");
+
+  LexerBenchHelper helper(source);
+  for (auto _ : state) {
+    TokenizedBuffer buffer = helper.Lex();
+    CARBON_CHECK(!buffer.has_errors());
+  }
+
+  state.SetBytesProcessed(state.iterations() * source.size());
+  state.counters["tokens_per_second"] = benchmark::Counter(
+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_RawIdentifierFocus);
+
 template <int MinLength, int MaxLength, bool Uniform>
 template <int MinLength, int MaxLength, bool Uniform>
 void BM_ValidIdentifiers(benchmark::State& state) {
 void BM_ValidIdentifiers(benchmark::State& state) {
   std::string source = RandomIdentifierSeq<MinLength, MaxLength, Uniform>();
   std::string source = RandomIdentifierSeq<MinLength, MaxLength, Uniform>();