Просмотр исходного кода

Support lexing characters (#5893)

Adapts `StringLiteral` to lex characters. Adds a `CharLiteral` token
whose payload is a `CharLiteralValue`: the character's Unicode code
point, stored directly (suggested by zygoloid).

---------

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Jon Ross-Perkins 9 месяцев назад
Родитель
Сommit
cae8aa3adf

+ 6 - 0
toolchain/diagnostics/diagnostic_kind.def

@@ -74,6 +74,12 @@ CARBON_DIAGNOSTIC_KIND(UnrecognizedCharacters)
 CARBON_DIAGNOSTIC_KIND(UnterminatedString)
 CARBON_DIAGNOSTIC_KIND(WrongRealLiteralExponent)
 
+CARBON_DIAGNOSTIC_KIND(CharLiteralEmpty)
+CARBON_DIAGNOSTIC_KIND(CharLiteralInvalidUTF8)
+CARBON_DIAGNOSTIC_KIND(CharLiteralOverflow)
+CARBON_DIAGNOSTIC_KIND(CharLiteralRaw)
+CARBON_DIAGNOSTIC_KIND(CharLiteralUnderflow)
+
 // ============================================================================
 // Parser diagnostics
 // ============================================================================

+ 3 - 0
toolchain/lex/BUILD

@@ -129,6 +129,7 @@ cc_library(
     deps = [
         ":character_set",
         ":helpers",
+        ":token_info",
         "//common:check",
         "//toolchain/diagnostics:diagnostic_emitter",
         "@llvm-project//llvm:Support",
@@ -189,6 +190,7 @@ cc_library(
         ":numeric_literal",
         ":string_literal",
         ":token_index",
+        ":token_info",
         ":token_kind",
         ":tokenized_buffer",
         "//common:check",
@@ -196,6 +198,7 @@ cc_library(
         "//toolchain/base:kind_switch",
         "//toolchain/base:shared_value_stores",
         "//toolchain/diagnostics:diagnostic_emitter",
+        "//toolchain/diagnostics:format_providers",
         "//toolchain/source:source_buffer",
         "@llvm-project//llvm:Support",
     ],

+ 38 - 13
toolchain/lex/lex.cpp

@@ -16,11 +16,13 @@
 #include "llvm/Support/Compiler.h"
 #include "toolchain/base/kind_switch.h"
 #include "toolchain/base/shared_value_stores.h"
+#include "toolchain/diagnostics/format_providers.h"
 #include "toolchain/lex/character_set.h"
 #include "toolchain/lex/helpers.h"
 #include "toolchain/lex/numeric_literal.h"
 #include "toolchain/lex/string_literal.h"
 #include "toolchain/lex/token_index.h"
+#include "toolchain/lex/token_info.h"
 #include "toolchain/lex/token_kind.h"
 #include "toolchain/lex/tokenized_buffer.h"
 
@@ -1126,6 +1128,15 @@ auto Lexer::LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
   }
 }
 
+static auto DiagnoseUnterminatedString(
+    Diagnostics::Emitter<const char*>& emitter, const StringLiteral& literal,
+    bool is_char) -> void {
+  CARBON_DIAGNOSTIC(UnterminatedString, Error,
+                    "{0:character|string} literal is missing a terminator",
+                    Diagnostics::BoolAsSelect);
+  emitter.Emit(literal.text().begin(), UnterminatedString, is_char);
+}
+
 auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
     -> LexResult {
   std::optional<StringLiteral> literal =
@@ -1137,11 +1148,28 @@ auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
   // Capture the position before we step past the token.
   int32_t byte_offset = position;
   int string_column = byte_offset - current_line_info().start;
-  ssize_t literal_size = literal->text().size();
-  position += literal_size;
+  position += literal->text().size();
+
+  // Helper for error paths.
+  auto lex_as_error = [&]() {
+    return LexTokenWithPayload(TokenKind::Error, literal->text().size(),
+                               byte_offset);
+  };
+
+  if (literal->kind() == StringLiteral::Kind::Char) {
+    if (!literal->is_terminated()) {
+      DiagnoseUnterminatedString(emitter_, *literal, /*is_char=*/true);
+      return lex_as_error();
+    }
+    if (auto value = literal->ComputeCharValue(emitter_)) {
+      return LexTokenWithPayload(TokenKind::CharLiteral, value->value,
+                                 byte_offset);
+    }
+    return lex_as_error();
+  }
 
   // Update line and column information.
-  if (literal->is_multi_line()) {
+  if (literal->kind() != StringLiteral::Kind::SingleLine) {
     while (next_line_info().start < position) {
       ++line_index_.index;
       current_line_info().indent = string_column;
@@ -1151,17 +1179,14 @@ auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
     // last line of the multi-line literal *also* has its indent set.
   }
 
-  if (literal->is_terminated()) {
-    auto string_id = buffer_.value_stores_->string_literal_values().Add(
-        literal->ComputeValue(buffer_.allocator_, emitter_));
-    return LexTokenWithPayload(TokenKind::StringLiteral, string_id.index,
-                               byte_offset);
-  } else {
-    CARBON_DIAGNOSTIC(UnterminatedString, Error,
-                      "string is missing a terminator");
-    emitter_.Emit(literal->text().begin(), UnterminatedString);
-    return LexTokenWithPayload(TokenKind::Error, literal_size, byte_offset);
+  if (!literal->is_terminated()) {
+    DiagnoseUnterminatedString(emitter_, *literal, /*is_char=*/false);
+    return lex_as_error();
   }
+  auto string_id = buffer_.value_stores_->string_literal_values().Add(
+      literal->ComputeStringValue(buffer_.allocator_, emitter_));
+  return LexTokenWithPayload(TokenKind::StringLiteral, string_id.index,
+                             byte_offset);
 }
 
 auto Lexer::LexOneCharSymbolToken(llvm::StringRef source_text, TokenKind kind,

+ 90 - 19
toolchain/lex/string_literal.cpp

@@ -24,7 +24,7 @@ static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
 
 struct StringLiteral::Introducer {
   // The kind of string being introduced.
-  MultiLineKind kind;
+  Kind kind;
   // The terminator for the string, without any '#' suffixes.
   llvm::StringRef terminator;
   // The length of the introducer, including the file type indicator and
@@ -41,17 +41,17 @@ struct StringLiteral::Introducer {
 // recovery purposes, and reject """ literals after lexing.
 auto StringLiteral::Introducer::Lex(llvm::StringRef source_text)
     -> std::optional<Introducer> {
-  MultiLineKind kind = NotMultiLine;
+  Kind kind = Kind::SingleLine;
   llvm::StringRef indicator;
   if (source_text.starts_with(MultiLineIndicator)) {
-    kind = MultiLine;
+    kind = Kind::MultiLine;
     indicator = llvm::StringRef(MultiLineIndicator);
   } else if (source_text.starts_with(DoubleQuotedMultiLineIndicator)) {
-    kind = MultiLineWithDoubleQuotes;
+    kind = Kind::MultiLineWithDoubleQuotes;
     indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
   }
 
-  if (kind != NotMultiLine) {
+  if (kind != Kind::SingleLine) {
     // The rest of the line must be a valid file type indicator: a sequence of
     // characters containing neither '#' nor '"' followed by a newline.
     auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
@@ -64,9 +64,13 @@ auto StringLiteral::Introducer::Lex(llvm::StringRef source_text)
     }
   }
 
-  if (!source_text.empty() && source_text[0] == '"') {
+  if (source_text.starts_with('"')) {
     return Introducer{
-        .kind = NotMultiLine, .terminator = "\"", .prefix_size = 1};
+        .kind = Kind::SingleLine, .terminator = "\"", .prefix_size = 1};
+  }
+
+  if (source_text.starts_with('\'')) {
+    return Introducer{.kind = Kind::Char, .terminator = "'", .prefix_size = 1};
   }
 
   return std::nullopt;
@@ -89,6 +93,12 @@ struct alignas(8) CharSet {
 };
 }  // namespace
 
+// Determine whether this is a multi-line string literal.
+static auto IsMultiLine(StringLiteral::Kind kind) -> bool {
+  return kind == StringLiteral::Kind::MultiLine ||
+         kind == StringLiteral::Kind::MultiLineWithDoubleQuotes;
+}
+
 auto StringLiteral::Lex(llvm::StringRef source_text)
     -> std::optional<StringLiteral> {
   int64_t cursor = 0;
@@ -144,8 +154,8 @@ auto StringLiteral::Lex(llvm::StringRef source_text)
           // If there's either not a character following the escape, or it's a
           // single-line string and the escaped character is a newline, we
           // should stop here.
-          if (cursor >= source_text_size || (introducer->kind == NotMultiLine &&
-                                             source_text[cursor] == '\n')) {
+          if (cursor >= source_text_size ||
+              (!IsMultiLine(introducer->kind) && source_text[cursor] == '\n')) {
             llvm::StringRef text = source_text.take_front(cursor);
             return StringLiteral(text, text.drop_front(prefix_len),
                                  content_needs_validation, hash_level,
@@ -155,7 +165,7 @@ auto StringLiteral::Lex(llvm::StringRef source_text)
         }
         break;
       case '\n':
-        if (introducer->kind == NotMultiLine) {
+        if (!IsMultiLine(introducer->kind)) {
           llvm::StringRef text = source_text.take_front(cursor);
           return StringLiteral(text, text.drop_front(prefix_len),
                                content_needs_validation, hash_level,
@@ -466,21 +476,82 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
   }
 }
 
-auto StringLiteral::ComputeValue(llvm::BumpPtrAllocator& allocator,
-                                 DiagnosticEmitter& emitter) const
-    -> llvm::StringRef {
-  if (!is_terminated_) {
-    return "";
+auto StringLiteral::ComputeCharValue(Diagnostics::Emitter<const char*>& emitter)
+    const -> std::optional<CharLiteralValue> {
+  CARBON_DCHECK(kind_ == Kind::Char);
+  CARBON_DCHECK(is_terminated_);
+
+  if (hash_level_ != 0) {
+    CARBON_DIAGNOSTIC(CharLiteralRaw, Error,
+                      "unexpected `#` before character literal");
+    emitter.Emit(text_.begin(), CharLiteralRaw);
   }
-  if (multi_line_ == MultiLineWithDoubleQuotes) {
+
+  // Allocate a buffer sized to the content. Note it's possible this could be
+  // more efficient/faster with an `ExpandEscapeSequencesAndRemoveIndent`
+  // implementation aware of the buffer size, but this is trying to share logic
+  // with string expansion.
+  llvm::SmallVector<char> buffer;
+  buffer.resize_for_overwrite(content_.size());
+
+  auto result = ExpandEscapeSequencesAndRemoveIndent(
+      emitter, content_, 0, /*indent=*/llvm::StringRef(), buffer.data());
+  CARBON_CHECK(result.size() <= content_.size(),
+               "Content grew from {0} to {1}: `{2}`", content_.size(),
+               result.size(), content_);
+
+  llvm::UTF32 target[1];
+  const auto* source_cursor =
+      reinterpret_cast<const llvm::UTF8*>(result.begin());
+  llvm::UTF32* target_cursor = target;
+  llvm::ConversionResult conv_result = llvm::ConvertUTF8toUTF32(
+      &source_cursor, reinterpret_cast<const llvm::UTF8*>(result.end()),
+      &target_cursor, std::end(target), llvm::strictConversion);
+
+  switch (conv_result) {
+    case llvm::conversionOK: {
+      if (target_cursor == target) {
+        CARBON_DIAGNOSTIC(CharLiteralEmpty, Error, "empty character literal");
+        emitter.Emit(text_.begin(), CharLiteralEmpty);
+        return std::nullopt;
+      }
+      return CharLiteralValue{.value = static_cast<int32_t>(target[0])};
+    }
+    case llvm::sourceExhausted: {
+      CARBON_DIAGNOSTIC(CharLiteralUnderflow, Error, "incomplete UTF-8");
+      emitter.Emit(text_.begin(), CharLiteralUnderflow);
+      return std::nullopt;
+    }
+    case llvm::targetExhausted: {
+      CARBON_DIAGNOSTIC(CharLiteralOverflow, Error, "too many characters");
+      emitter.Emit(text_.begin(), CharLiteralOverflow);
+      return std::nullopt;
+    }
+    case llvm::sourceIllegal: {
+      CARBON_DIAGNOSTIC(CharLiteralInvalidUTF8, Error,
+                        "invalid UTF-8 character");
+      emitter.Emit(text_.begin(), CharLiteralInvalidUTF8);
+      return std::nullopt;
+    }
+  }
+}
+
+auto StringLiteral::ComputeStringValue(llvm::BumpPtrAllocator& allocator,
+                                       DiagnosticEmitter& emitter) const
+    -> llvm::StringRef {
+  CARBON_DCHECK(kind_ != Kind::Char);
+  CARBON_DCHECK(is_terminated_);
+
+  if (kind_ == Kind::MultiLineWithDoubleQuotes) {
     CARBON_DIAGNOSTIC(
         MultiLineStringWithDoubleQuotes, Error,
         "use `'''` delimiters for a multi-line string literal, not `\"\"\"`");
     emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
   }
-  llvm::StringRef indent =
-      multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
-  if (!content_needs_validation_ && (!multi_line_ || indent.empty())) {
+  llvm::StringRef indent = IsMultiLine(kind_)
+                               ? CheckIndent(emitter, text_, content_)
+                               : llvm::StringRef();
+  if (!content_needs_validation_ && (!IsMultiLine(kind_) || indent.empty())) {
     return content_;
   }
 

+ 35 - 13
toolchain/lex/string_literal.h

@@ -10,11 +10,27 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
+#include "toolchain/lex/token_info.h"
 
 namespace Carbon::Lex {
 
 class StringLiteral {
  public:
+  // A string literal's kind.
+  enum class Kind : int8_t {
+    // A character literal is still handled through string literal lexing.
+    Char,
+
+    // A single-line string, `"<content>"`.
+    SingleLine,
+
+    // A multi-line string, `'''<content>'''`.
+    MultiLine,
+
+    // An incorrectly double-quoted multi-line string, `"""<content>"""`.
+    MultiLineWithDoubleQuotes,
+  };
+
   // Extract a string literal token from the given text, if it has a suitable
   // form. Returning std::nullopt indicates no string literal was found;
   // returning an invalid literal indicates a string prefix was found, but it's
@@ -22,58 +38,64 @@ class StringLiteral {
   // construction.
   static auto Lex(llvm::StringRef source_text) -> std::optional<StringLiteral>;
 
+  // Expand any escape sequences and compute the resulting character. This
+  // handles error recovery internally, but can return nullopt for an invalid
+  // character.
+  auto ComputeCharValue(Diagnostics::Emitter<const char*>& emitter) const
+      -> std::optional<CharLiteralValue>;
+
   // Expand any escape sequences in the given string literal and compute the
   // resulting value. This handles error recovery internally and cannot fail.
   //
   // When content_needs_validation_ is false and the string has no indent to
   // deal with, this can return the content directly. Otherwise, the allocator
   // will be used for the StringRef.
-  auto ComputeValue(llvm::BumpPtrAllocator& allocator,
-                    Diagnostics::Emitter<const char*>& emitter) const
+  auto ComputeStringValue(llvm::BumpPtrAllocator& allocator,
+                          Diagnostics::Emitter<const char*>& emitter) const
       -> llvm::StringRef;
 
   // Get the text corresponding to this literal.
   auto text() const -> llvm::StringRef { return text_; }
 
   // Returns the kind of this literal (char, single-line, or multi-line).
-  auto is_multi_line() const -> bool { return multi_line_; }
+  auto kind() const -> Kind { return kind_; }
 
   // Returns true if the string has a valid terminator.
   auto is_terminated() const -> bool { return is_terminated_; }
 
  private:
-  enum MultiLineKind : int8_t {
-    NotMultiLine,
-    MultiLine,
-    MultiLineWithDoubleQuotes
-  };
-
   struct Introducer;
 
   explicit StringLiteral(llvm::StringRef text, llvm::StringRef content,
                          bool content_needs_validation, int hash_level,
-                         MultiLineKind multi_line, bool is_terminated)
+                         Kind kind, bool is_terminated)
       : text_(text),
         content_(content),
         content_needs_validation_(content_needs_validation),
         hash_level_(hash_level),
-        multi_line_(multi_line),
+        kind_(kind),
         is_terminated_(is_terminated) {}
 
   // The complete text of the string literal.
   llvm::StringRef text_;
+
   // The content of the literal. For a multi-line literal, this begins
   // immediately after the newline following the file type indicator, and ends
   // at the start of the closing `'''` (or `"""`). Whitespace is not trimmed
   // from either end.
   llvm::StringRef content_;
+
   // Whether content needs validation, in particular due to either an escape
   // (which needs modifications) or a tab character (which may cause a warning).
   bool content_needs_validation_;
+
   // The number of `#`s preceding the opening `"`, `'`, `'''`, or `"""`.
   int hash_level_;
-  // Whether this was a multi-line string literal.
-  MultiLineKind multi_line_;
+
+  // Whether this was a single-line string literal, multi-line string literal,
+  // or a char literal.
+  Kind kind_;
+
   // Whether the literal is valid, or should only be used for errors.
   bool is_terminated_;
 };

+ 18 - 16
toolchain/lex/string_literal_benchmark.cpp

@@ -92,54 +92,56 @@ static void BM_SimpleStringValue(benchmark::State& state, int size,
   std::string x(introducer);
   x.append(size, 'a');
   if (add_escape) {
-    // Adds a basic escape that forces ComputeValue to generate a new string.
+    // Adds a basic escape that forces ComputeStringValue to generate a new
+    // string.
     x.append("\\\\");
   }
   x.append(terminator);
   for (auto _ : state) {
-    StringLiteral::Lex(x)->ComputeValue(
+    StringLiteral::Lex(x)->ComputeStringValue(
         allocator, Diagnostics::NullEmitter<const char*>());
   }
 }
 
-static void BM_ComputeValue_NoGenerate_Short(benchmark::State& state) {
+static void BM_ComputeStringValue_NoGenerate_Short(benchmark::State& state) {
   BM_SimpleStringValue(state, 10, "\"", /*add_escape=*/false, "\"");
 }
 
-static void BM_ComputeValue_NoGenerate_Long(benchmark::State& state) {
+static void BM_ComputeStringValue_NoGenerate_Long(benchmark::State& state) {
   BM_SimpleStringValue(state, 10000, "\"", /*add_escape=*/false, "\"");
 }
 
-static void BM_ComputeValue_WillGenerate_Short(benchmark::State& state) {
+static void BM_ComputeStringValue_WillGenerate_Short(benchmark::State& state) {
   BM_SimpleStringValue(state, 10, "\"", /*add_escape=*/true, "\"");
 }
 
-static void BM_ComputeValue_WillGenerate_Long(benchmark::State& state) {
+static void BM_ComputeStringValue_WillGenerate_Long(benchmark::State& state) {
   BM_SimpleStringValue(state, 10000, "\"", /*add_escape=*/true, "\"");
 }
 
-static void BM_ComputeValue_WillGenerate_Multiline(benchmark::State& state) {
+static void BM_ComputeStringValue_WillGenerate_Multiline(
+    benchmark::State& state) {
   BM_SimpleStringValue(state, 10000, "'''\n", /*add_escape=*/true, "\n'''");
 }
 
-static void BM_ComputeValue_WillGenerate_MultilineDoubleQuote(
+static void BM_ComputeStringValue_WillGenerate_MultilineDoubleQuote(
     benchmark::State& state) {
   BM_SimpleStringValue(state, 10000, "\"\"\"\n", /*add_escape=*/true,
                        "\n\"\"\"");
 }
 
-static void BM_ComputeValue_WillGenerate_Raw(benchmark::State& state) {
+static void BM_ComputeStringValue_WillGenerate_Raw(benchmark::State& state) {
   BM_SimpleStringValue(state, 10000, "#\"", /*add_escape=*/true, "\"#");
 }
 
-BENCHMARK(BM_ComputeValue_NoGenerate_Short);
-BENCHMARK(BM_ComputeValue_NoGenerate_Long);
+BENCHMARK(BM_ComputeStringValue_NoGenerate_Short);
+BENCHMARK(BM_ComputeStringValue_NoGenerate_Long);
 
-BENCHMARK(BM_ComputeValue_WillGenerate_Short);
-BENCHMARK(BM_ComputeValue_WillGenerate_Long);
-BENCHMARK(BM_ComputeValue_WillGenerate_Multiline);
-BENCHMARK(BM_ComputeValue_WillGenerate_MultilineDoubleQuote);
-BENCHMARK(BM_ComputeValue_WillGenerate_Raw);
+BENCHMARK(BM_ComputeStringValue_WillGenerate_Short);
+BENCHMARK(BM_ComputeStringValue_WillGenerate_Long);
+BENCHMARK(BM_ComputeStringValue_WillGenerate_Multiline);
+BENCHMARK(BM_ComputeStringValue_WillGenerate_MultilineDoubleQuote);
+BENCHMARK(BM_ComputeStringValue_WillGenerate_Raw);
 
 }  // namespace
 }  // namespace Carbon::Lex

+ 28 - 11
toolchain/lex/string_literal_fuzzer.cpp

@@ -14,29 +14,46 @@ namespace Carbon::Testing {
 
 // NOLINTNEXTLINE: Match the documented fuzzer entry point declaration style.
 extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data, size_t size) {
-  auto token = Lex::StringLiteral::Lex(
+  auto literal = Lex::StringLiteral::Lex(
       llvm::StringRef(reinterpret_cast<const char*>(data), size));
-  if (!token) {
+  if (!literal) {
     // Lexically not a string literal.
     return 0;
   }
 
-  if (!token->is_terminated()) {
+  if (!literal->is_terminated()) {
     // Found errors while parsing.
     return 0;
   }
 
-  fprintf(stderr, "valid: %d\n", token->is_terminated());
-  fprintf(stderr, "size: %lu\n", token->text().size());
-  fprintf(stderr, "text: %s\n", token->text().str().c_str());
+  fprintf(stderr, "valid: %d\n", literal->is_terminated());
+  fprintf(stderr, "size: %lu\n", literal->text().size());
+  fprintf(stderr, "text: %s\n", literal->text().str().c_str());
 
   // Check the literal kind is consistent with whether the text has a newline.
-  CARBON_CHECK(token->is_multi_line() == token->text().contains('\n'));
+  switch (literal->kind()) {
+    case Lex::StringLiteral::Kind::Char:
+      break;
+
+    case Lex::StringLiteral::Kind::SingleLine:
+      CARBON_CHECK(!literal->text().contains('\n'));
+      break;
+
+    case Lex::StringLiteral::Kind::MultiLine:
+    case Lex::StringLiteral::Kind::MultiLineWithDoubleQuotes:
+      CARBON_CHECK(literal->text().contains('\n'));
+      break;
+  }
 
-  llvm::BumpPtrAllocator allocator;
-  volatile auto value =
-      token->ComputeValue(allocator, Diagnostics::NullEmitter<const char*>());
-  (void)value;
+  auto* null_emitter = &Diagnostics::NullEmitter<const char*>();
+  if (literal->kind() == Lex::StringLiteral::Kind::Char) {
+    volatile auto value = literal->ComputeCharValue(*null_emitter);
+    (void)value;
+  } else {
+    llvm::BumpPtrAllocator allocator;
+    volatile auto value = literal->ComputeStringValue(allocator, *null_emitter);
+    (void)value;
+  }
 
   return 0;
 }

+ 1 - 1
toolchain/lex/string_literal_test.cpp

@@ -32,7 +32,7 @@ class StringLiteralTest : public ::testing::Test {
   auto Parse(llvm::StringRef text) -> llvm::StringRef {
     StringLiteral token = Lex(text);
     Testing::SingleTokenDiagnosticEmitter emitter(&error_tracker_, text);
-    return token.ComputeValue(allocator_, emitter);
+    return token.ComputeStringValue(allocator_, emitter);
   }
 
   llvm::BumpPtrAllocator allocator_;

+ 63 - 0
toolchain/lex/testdata/char_literals.carbon

@@ -0,0 +1,63 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AUTOUPDATE
+// TIP: To test this file alone, run:
+// TIP:   bazel test //toolchain/testing:file_test --test_arg=--file_tests=toolchain/lex/testdata/char_literals.carbon
+// TIP: To dump output, run:
+// TIP:   bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/lex/testdata/char_literals.carbon
+
+// --- valid.carbon
+// CHECK:STDOUT: - filename: valid.carbon
+// CHECK:STDOUT:   tokens:
+
+'a'
+// CHECK:STDOUT:   - { index: 1, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'a'", has_leading_space: true }
+'\n'
+// CHECK:STDOUT:   - { index: 2, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\n'", has_leading_space: true }
+'\x7F'
+// CHECK:STDOUT:   - { index: 3, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\x7F'", has_leading_space: true }
+'\u{123}'
+// CHECK:STDOUT:   - { index: 4, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\u{123}'", has_leading_space: true }
+'\xC3\xA9'
+// CHECK:STDOUT:   - { index: 5, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\xC3\\xA9'", has_leading_space: true }
+
+// --- fail_invalid.carbon
+// CHECK:STDOUT: - filename: fail_invalid.carbon
+// CHECK:STDOUT:   tokens:
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: empty character literal [CharLiteralEmpty]
+// CHECK:STDERR: ''
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+''
+// CHECK:STDOUT:   - { index: 1, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "''", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: too many characters [CharLiteralOverflow]
+// CHECK:STDERR: 'abcde'
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'abcde'
+// CHECK:STDOUT:   - { index: 2, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'abcde'", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: incomplete UTF-8 [CharLiteralUnderflow]
+// CHECK:STDERR: '\xC3'
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'\xC3'
+// CHECK:STDOUT:   - { index: 3, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\xC3'", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: invalid UTF-8 character [CharLiteralInvalidUTF8]
+// CHECK:STDERR: '\xC3\xFF'
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'\xC3\xFF'
+// CHECK:STDOUT:   - { index: 4, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\xC3\\xFF'", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: unexpected `#` before character literal [CharLiteralRaw]
+// CHECK:STDERR: #'a'#
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+#'a'#
+// CHECK:STDOUT:   - { index: 5, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "#'a'#", has_leading_space: true }

+ 1 - 1
toolchain/lex/testdata/string_literals.carbon

@@ -23,7 +23,7 @@
 // CHECK:STDOUT: - filename: fail_unterminated.carbon
 // CHECK:STDOUT:   tokens:
 
-// CHECK:STDERR: fail_unterminated.carbon:[[@LINE+4]]:1: error: string is missing a terminator [UnterminatedString]
+// CHECK:STDERR: fail_unterminated.carbon:[[@LINE+4]]:1: error: string literal is missing a terminator [UnterminatedString]
 // CHECK:STDERR: "s
 // CHECK:STDERR: ^
 // CHECK:STDERR:

+ 13 - 0
toolchain/lex/token_info.h

@@ -13,6 +13,14 @@
 
 namespace Carbon::Lex {
 
+// A character as a unicode code point.
+//
+// Unicode requires 21 bits, which should fit inside `TokenInfo::PayloadBits`,
+// so we store the value directly.
+struct CharLiteralValue {
+  int32_t value;
+};
+
 // Storage for the information about a specific token, as an implementation
 // detail of `TokenizedBuffer`.
 //
@@ -69,6 +77,11 @@ class TokenInfo {
     return StringLiteralValueId(token_payload_);
   }
 
+  auto char_literal() const -> CharLiteralValue {
+    CARBON_DCHECK(kind() == TokenKind::CharLiteral);
+    return CharLiteralValue(token_payload_);
+  }
+
   auto int_id() const -> IntId {
     CARBON_DCHECK(kind() == TokenKind::IntLiteral ||
                   kind() == TokenKind::IntTypeLiteral ||

+ 1 - 0
toolchain/lex/token_kind.def

@@ -226,6 +226,7 @@ CARBON_TOKEN(Identifier)
 CARBON_TOKEN(IntLiteral)
 CARBON_TOKEN(RealLiteral)
 CARBON_TOKEN(StringLiteral)
+CARBON_TOKEN(CharLiteral)
 CARBON_TOKEN(IntTypeLiteral)
 CARBON_TOKEN(UnsignedIntTypeLiteral)
 CARBON_TOKEN(FloatTypeLiteral)

+ 10 - 1
toolchain/lex/tokenized_buffer.cpp

@@ -82,7 +82,8 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
 
   // Refer back to the source text to find the original spelling, including
   // escape sequences etc.
-  if (token_info.kind() == TokenKind::StringLiteral) {
+  if (token_info.kind() == TokenKind::StringLiteral ||
+      token_info.kind() == TokenKind::CharLiteral) {
     std::optional<StringLiteral> relexed_token =
         StringLiteral::Lex(source_->text().substr(token_info.byte_offset()));
     CARBON_CHECK(relexed_token, "Could not reform string literal token.");
@@ -137,6 +138,14 @@ auto TokenizedBuffer::GetStringLiteralValue(TokenIndex token) const
   return token_info.string_literal_id();
 }
 
+auto TokenizedBuffer::GetCharLiteralValue(TokenIndex token) const
+    -> CharLiteralValue {
+  const auto& token_info = token_infos_.Get(token);
+  CARBON_CHECK(token_info.kind() == TokenKind::CharLiteral, "{0}",
+               token_info.kind());
+  return token_info.char_literal();
+}
+
 auto TokenizedBuffer::GetTypeLiteralSize(TokenIndex token) const -> IntId {
   const auto& token_info = token_infos_.Get(token);
   CARBON_CHECK(token_info.kind().is_sized_type_literal(), "{0}",

+ 7 - 4
toolchain/lex/tokenized_buffer.h

@@ -138,16 +138,19 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   // an `Identifier`.
   auto GetIdentifier(TokenIndex token) const -> IdentifierId;
 
-  // Returns the value of an `IntLiteral()` token.
+  // Returns the value of an `IntLiteral` token.
   auto GetIntLiteral(TokenIndex token) const -> IntId;
 
-  // Returns the value of an `RealLiteral()` token.
+  // Returns the value of a `RealLiteral` token.
   auto GetRealLiteral(TokenIndex token) const -> RealId;
 
-  // Returns the value of a `StringLiteral()` token.
+  // Returns the value of a `StringLiteral` token.
   auto GetStringLiteralValue(TokenIndex token) const -> StringLiteralValueId;
 
-  // Returns the size specified in a `*TypeLiteral()` token.
+  // Returns the value of a `CharLiteral` token.
+  auto GetCharLiteralValue(TokenIndex token) const -> CharLiteralValue;
+
+  // Returns the size specified in a `*TypeLiteral` token.
   auto GetTypeLiteralSize(TokenIndex token) const -> IntId;
 
   // Returns the closing token matched with the given opening token.