Răsfoiți Sursa

Initial support for string literals, following #199 (#309)

Richard Smith 5 ani în urmă
părinte
comite
67ee1fcaa7

+ 24 - 0
lexer/BUILD

@@ -49,11 +49,35 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "string_literal",  # Lexing and value computation for string literals.
+    srcs = ["string_literal.cpp"],
+    hdrs = ["string_literal.h"],
+    deps = [
+        "//diagnostics:diagnostic_emitter",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+cc_test(
+    name = "string_literal_test",  # Unit tests for :string_literal.
+    srcs = ["string_literal_test.cpp"],
+    deps = [
+        ":string_literal",
+        "//diagnostics:diagnostic_emitter",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:gmock",
+        "@llvm-project//llvm:gtest",
+        "@llvm-project//llvm:gtest_main",
+    ],
+)
+
 cc_library(
     name = "tokenized_buffer",
     srcs = ["tokenized_buffer.cpp"],
     hdrs = ["tokenized_buffer.h"],
     deps = [
+        ":string_literal",
         ":token_kind",
         ":numeric_literal",
         "//diagnostics:diagnostic_emitter",

+ 388 - 0
lexer/string_literal.cpp

@@ -0,0 +1,388 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "lexer/string_literal.h"
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace Carbon {
+
+struct ContentBeforeStringTerminator
+    : SimpleDiagnostic<ContentBeforeStringTerminator> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr llvm::StringLiteral Message =
+      "Only whitespace is permitted before the closing `\"\"\"` of a "
+      "multi-line string.";
+};  // Emitted by CheckIndent.
+
+struct UnicodeEscapeTooLarge : SimpleDiagnostic<UnicodeEscapeTooLarge> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr llvm::StringLiteral Message =
+      "Code point specified by `\\u{...}` escape is greater than 0x10FFFF.";
+};  // Emitted by ExpandUnicodeEscapeSequence.
+
+struct UnicodeEscapeSurrogate : SimpleDiagnostic<UnicodeEscapeSurrogate> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr llvm::StringLiteral Message =
+      "Code point specified by `\\u{...}` escape is a surrogate character.";
+};  // Emitted by ExpandUnicodeEscapeSequence.
+
+struct UnicodeEscapeMissingBracedDigits
+    : SimpleDiagnostic<UnicodeEscapeMissingBracedDigits> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr llvm::StringLiteral Message =
+      "Escape sequence `\\u` must be followed by a braced sequence of "
+      "uppercase hexadecimal digits, for example `\\u{70AD}`.";
+};  // Emitted by ExpandAndConsumeEscapeSequence.
+
+struct HexadecimalEscapeMissingDigits
+    : SimpleDiagnostic<HexadecimalEscapeMissingDigits> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr llvm::StringLiteral Message =
+      "Escape sequence `\\x` must be followed by two "
+      "uppercase hexadecimal digits, for example `\\x0F`.";
+};  // Emitted by ExpandAndConsumeEscapeSequence.
+
+struct DecimalEscapeSequence : SimpleDiagnostic<DecimalEscapeSequence> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr llvm::StringLiteral Message =
+      "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead of "
+      "`\\0` if the next character is a digit.";
+};  // Emitted by ExpandAndConsumeEscapeSequence.
+
+struct UnknownEscapeSequence {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr const char* Message = "Unrecognized escape sequence `{0}`.";
+
+  char first;  // The character that followed the escape introducer.
+
+  auto Format() -> std::string { return llvm::formatv(Message, first).str(); }
+};  // Emitted by ExpandAndConsumeEscapeSequence.
+
+struct MismatchedIndentInString : SimpleDiagnostic<MismatchedIndentInString> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
+  static constexpr llvm::StringLiteral Message =
+      "Indentation does not match that of the closing \"\"\" in multi-line "
+      "string literal.";
+};  // Emitted by ExpandEscapeSequencesAndRemoveIndent.
+
+// Returns true if `c` is a space, a newline, or a horizontal tab.
+// TODO(zygoloid): Update this to match whatever we decide qualifies as
+// acceptable whitespace.
+static bool isSpace(char c) {
+  switch (c) {
+    case ' ':
+    case '\n':
+    case '\t':
+      return true;
+    default:
+      return false;
+  }
+}
+
+static constexpr llvm::StringLiteral HorizontalWhitespace = " \t";
+
+// Returns true if `c` is one of `0`-`9` or `A`-`F`. Lowercase `a`-`f` is
+// rejected.
+static bool isUpperHexDigit(char c) {
+  if (c >= '0' && c <= '9') {
+    return true;
+  }
+  return c >= 'A' && c <= 'F';
+}
+
+// Returns the opening of a multi-line string literal at the start of
+// `source_text` (after any '#'s have been removed): the `"""`, the file type
+// indicator, and the newline that ends that line. Returns an empty StringRef
+// if `source_text` does not start a multi-line string literal.
+static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
+    -> llvm::StringRef {
+  constexpr llvm::StringLiteral OpeningQuotes = "\"\"\"";
+  if (!source_text.startswith(OpeningQuotes)) {
+    return llvm::StringRef();
+  }
+
+  // A valid file type indicator contains neither '#' nor '"', so the first
+  // '"', '#' or newline found after the quotes has to be the newline.
+  const auto indicator_end =
+      source_text.find_first_of("\"#\n", OpeningQuotes.size());
+  if (indicator_end == llvm::StringRef::npos ||
+      source_text[indicator_end] != '\n') {
+    return llvm::StringRef();
+  }
+
+  // The prefix includes the newline itself.
+  return source_text.take_front(indicator_end + 1);
+}
+
+// If source_text begins with a string literal token, extract and return
+// information on that token; otherwise (or if unterminated) return None.
+auto StringLiteralToken::Lex(llvm::StringRef source_text)
+    -> llvm::Optional<StringLiteralToken> {
+  const char* begin = source_text.begin();
+
+  int hash_level = 0;  // Number of '#'s before the opening quote(s).
+  while (source_text.consume_front("#")) {
+    ++hash_level;
+  }
+
+  llvm::SmallString<16> terminator("\"");
+  llvm::SmallString<16> escape("\\");
+
+  llvm::StringRef multi_line_prefix =
+      TakeMultiLineStringLiteralPrefix(source_text);
+  bool multi_line = !multi_line_prefix.empty();
+  if (multi_line) {
+    source_text = source_text.drop_front(multi_line_prefix.size());
+    terminator = "\"\"\"";
+  } else if (!source_text.consume_front("\"")) {
+    return llvm::None;  // No opening quote: not a string literal.
+  }
+
+  // The terminator and escape sequence marker require a number of '#'s
+  // matching the leading sequence of '#'s.
+  terminator.resize(terminator.size() + hash_level, '#');
+  escape.resize(escape.size() + hash_level, '#');
+
+  const char* content_begin = source_text.begin();
+  const char* content_end = content_begin;
+  while (!source_text.consume_front(terminator)) {
+    // Let LexError figure out how to recover from an unterminated string
+    // literal.
+    if (source_text.empty()) {
+      return llvm::None;
+    }
+    if (!multi_line && source_text.startswith("\n")) {
+      return llvm::None;  // A single-line literal cannot span a newline.
+    }
+
+    // Consume an escape sequence marker if present.
+    (void)source_text.consume_front(escape);
+    // Then consume one more character, either of the content or of an
+    // escape sequence. This relies on multi-character escape sequences
+    // not containing an embedded and unescaped terminator or newline.
+    source_text = source_text.substr(1);
+    content_end = source_text.begin();  // Content ends before the terminator.
+  }
+
+  return StringLiteralToken(
+      llvm::StringRef(begin, source_text.begin() - begin),
+      llvm::StringRef(content_begin, content_end - content_begin), hash_level,
+      multi_line);
+}
+
+// Given a string that contains at least one newline, find the indent (the
+// leading sequence of horizontal whitespace) of its final line. The returned
+// StringRef points into `text`, so its bounds can be compared against other
+// views of the same buffer.
+static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
+  // Stay in StringRef's own size type rather than narrowing sizes to `int`.
+  const auto last_newline = text.rfind('\n');
+  if (last_newline == llvm::StringRef::npos) {
+    llvm_unreachable("Given text is required to contain a newline.");
+  }
+  // The final line cannot contain '\n', so here isSpace only ever matches the
+  // horizontal whitespace characters ' ' and '\t'.
+  return text.substr(last_newline + 1).take_while(isSpace);
+}
+
+namespace {
+// The leading whitespace in a multi-line string literal.
+struct Indent {
+  llvm::StringRef indent;  // Whitespace to strip from the start of each line.
+  bool has_errors;         // Whether a diagnostic was emitted computing it.
+};
+}  // namespace
+
+// Determine the indentation of a multi-line string literal, which is the
+// leading horizontal whitespace on the line holding its closing delimiter.
+// `text` is the whole literal and `content` is the portion between the
+// delimiters. Diagnoses any non-whitespace content that precedes the closing
+// delimiter on that final line.
+static auto CheckIndent(DiagnosticEmitter& emitter, llvm::StringRef text,
+                        llvm::StringRef content) -> Indent {
+  // Search `text` rather than `content`: for an empty literal, the final
+  // line's indent might not lie inside the content.
+  Indent checked = {.indent = ComputeIndentOfFinalLine(text),
+                    .has_errors = false};
+
+  // If the indent stops short of the end of the content, something other than
+  // indentation sits between it and the closing delimiter.
+  if (checked.indent.end() != content.end()) {
+    emitter.EmitError<ContentBeforeStringTerminator>();
+    checked.has_errors = true;
+  }
+
+  return checked;
+}
+
+// Append the UTF-8 encoding of the code point named by the hexadecimal
+// `digits` of a `\u{...}` escape sequence to `result`. Returns false, after
+// emitting a diagnostic and leaving `result` untouched, if the value is not a
+// usable code point (too large, or a surrogate).
+static auto ExpandUnicodeEscapeSequence(DiagnosticEmitter& emitter,
+                                        llvm::StringRef digits,
+                                        std::string& result) -> bool {
+  unsigned code_point;
+  // getAsInteger returns true on failure, in which case `code_point` is not
+  // read.
+  const bool unparseable = digits.getAsInteger(16, code_point);
+  if (unparseable || code_point > 0x10FFFF) {
+    emitter.EmitError<UnicodeEscapeTooLarge>();
+    return false;
+  }
+
+  const bool is_surrogate = 0xD800 <= code_point && code_point <= 0xDFFF;
+  if (is_surrogate) {
+    emitter.EmitError<UnicodeEscapeSurrogate>();
+    return false;
+  }
+
+  // Encode the single code point as UTF-8. A 6-unit buffer is large enough
+  // for any code point.
+  const llvm::UTF32 source[1] = {code_point};
+  llvm::UTF8 encoded[6];
+  const llvm::UTF32* read_pos = source;
+  llvm::UTF8* write_pos = encoded;
+  if (llvm::ConvertUTF32toUTF8(&read_pos, source + 1, &write_pos, encoded + 6,
+                               llvm::strictConversion) != llvm::conversionOK) {
+    llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
+  }
+
+  // `write_pos` now points one past the last code unit produced.
+  result.append(reinterpret_cast<char*>(encoded),
+                reinterpret_cast<char*>(write_pos));
+  return true;
+}
+
+// Expand an escape sequence, appending the expanded value to the given
+// `result` string. `content` is the string content, starting from the first
+// character after the escape sequence introducer (for example, the `n` in
+// `\n`), and is advanced past what was consumed. Returns false on any error.
+static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
+                                           llvm::StringRef& content,
+                                           std::string& result) -> bool {
+  assert(!content.empty() && "should have escaped closing delimiter");
+  char first = content.front();  // Selects which escape sequence this is.
+  content = content.drop_front(1);
+
+  switch (first) {
+    case 't':
+      result += '\t';
+      return true;
+    case 'n':
+      result += '\n';
+      return true;
+    case 'r':
+      result += '\r';
+      return true;
+    case '"':
+      result += '"';
+      return true;
+    case '\'':
+      result += '\'';
+      return true;
+    case '\\':
+      result += '\\';
+      return true;
+    case '0':
+      result += '\0';  // The nul is kept even if the check below fails.
+      if (!content.empty() && llvm::isDigit(content.front())) {
+        emitter.EmitError<DecimalEscapeSequence>();
+        return false;
+      }
+      return true;
+    case 'x':
+      if (content.size() >= 2 && isUpperHexDigit(content[0]) &&
+          isUpperHexDigit(content[1])) {
+        result +=
+            static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
+        content = content.drop_front(2);
+        return true;
+      }
+      emitter.EmitError<HexadecimalEscapeMissingDigits>();
+      break;  // Recover below; any digits stay in `content`.
+    case 'u': {
+      llvm::StringRef remaining = content;
+      if (remaining.consume_front("{")) {
+        llvm::StringRef digits = remaining.take_while(isUpperHexDigit);
+        remaining = remaining.drop_front(digits.size());
+        if (!digits.empty() && remaining.consume_front("}")) {
+          if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
+            break;  // Already diagnosed; `content` keeps the braced digits.
+          }
+          content = remaining;
+          return true;
+        }
+      }
+      emitter.EmitError<UnicodeEscapeMissingBracedDigits>();
+      break;
+    }
+    default:
+      emitter.EmitError<UnknownEscapeSequence>({.first = first});
+      break;
+  }
+
+  // If we get here, we didn't recognize this escape sequence and have already
+  // issued a diagnostic. For error recovery purposes, expand this escape
+  // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
+  result += first;
+  return false;
+}
+
+// Compute the value of a string literal from its `contents`: strip `indent`
+// from the start of each line, drop unescaped trailing whitespace from each
+// line, and expand escape sequences, which are introduced by `\` followed by
+// `hash_level` `#`s.
+//
+// Returns the computed value, with `has_errors` set if any diagnostic was
+// emitted to `emitter`. A best-effort value is still produced on error.
+static auto ExpandEscapeSequencesAndRemoveIndent(DiagnosticEmitter& emitter,
+                                                 llvm::StringRef contents,
+                                                 int hash_level,
+                                                 llvm::StringRef indent)
+    -> StringLiteralToken::ExpandedValue {
+  std::string result;
+  result.reserve(contents.size());
+  bool has_errors = false;
+
+  // Size of `result` just after the most recently processed escape sequence.
+  // Everything before this point must survive trailing-whitespace trimming:
+  // whitespace produced by an escape such as `\t` is content, not layout.
+  std::string::size_type last_escape_end = 0;
+
+  llvm::SmallString<16> escape("\\");
+  escape.resize(1 + hash_level, '#');
+
+  // Process each line of the string literal.
+  while (true) {
+    // Every non-empty line (that contains anything other than horizontal
+    // whitespace) is required to start with the string's indent. For error
+    // recovery, remove all leading whitespace if the indent doesn't match.
+    if (!contents.consume_front(indent)) {
+      contents = contents.ltrim(HorizontalWhitespace);
+      if (!contents.startswith("\n")) {
+        emitter.EmitError<MismatchedIndentInString>();
+        has_errors = true;
+      }
+    }
+
+    // Process the contents of the line.
+    while (true) {
+      // Copy text verbatim up to the next newline or backslash. If there is
+      // none, find_first_of returns npos and the rest of `contents` is copied.
+      auto end_of_regular_text = contents.find_first_of("\n\\");
+      result += contents.substr(0, end_of_regular_text);
+      contents = contents.substr(end_of_regular_text);
+
+      if (contents.empty()) {
+        return {.result = result, .has_errors = has_errors};
+      }
+
+      if (contents.consume_front("\n")) {
+        // Trailing whitespace in the source before a newline doesn't
+        // contribute to the string literal value. Whitespace produced by an
+        // escape sequence, and anything before it, is kept.
+        while (result.size() > last_escape_end && result.back() != '\n' &&
+               isSpace(result.back())) {
+          result.pop_back();
+        }
+        result += '\n';
+        // Move on to the next line.
+        break;
+      }
+
+      if (!contents.consume_front(escape)) {
+        // This is not an escape sequence, just a raw `\`.
+        result += contents.front();
+        contents = contents.drop_front(1);
+        continue;
+      }
+
+      if (contents.consume_front("\n")) {
+        // An escaped newline ends the line without producing any content and
+        // without trimming trailing whitespace.
+        last_escape_end = result.size();
+        break;
+      }
+
+      // Handle this escape sequence.
+      if (!ExpandAndConsumeEscapeSequence(emitter, contents, result)) {
+        has_errors = true;
+      }
+      // Protect whatever the escape produced (including error recovery
+      // output) from later trailing-whitespace trimming.
+      last_escape_end = result.size();
+    }
+  }
+}
+
+// Compute the value of this literal, reporting problems through `emitter`.
+// Only multi-line literals have an indent to check and strip; single-line
+// literals use an empty indent.
+auto StringLiteralToken::ComputeValue(DiagnosticEmitter& emitter) const
+    -> ExpandedValue {
+  Indent indent = {};
+  if (multi_line) {
+    indent = CheckIndent(emitter, text, content);
+  }
+
+  ExpandedValue value = ExpandEscapeSequencesAndRemoveIndent(
+      emitter, content, hash_level, indent.indent);
+  // An indentation error makes the whole literal erroneous.
+  if (indent.has_errors) {
+    value.has_errors = true;
+  }
+  return value;
+}
+
+}  // namespace Carbon

+ 58 - 0
lexer/string_literal.h

@@ -0,0 +1,58 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef LEXER_STRING_LITERAL_H_
+#define LEXER_STRING_LITERAL_H_
+
+#include <string>
+
+#include "diagnostics/diagnostic_emitter.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace Carbon {
+
+// A string literal token found at the start of some source text. Instances
+// are created by `Lex` and hold views into the text passed to it, so that
+// text must outlive the token.
+class StringLiteralToken {
+ public:
+  // Get the text corresponding to this literal.
+  auto Text() const -> llvm::StringRef { return text; }
+
+  // Determine whether this is a multi-line string literal.
+  auto IsMultiLine() const -> bool { return multi_line; }
+
+  // Extract a string literal token from the start of the given text, if it
+  // has a suitable form. Returns `llvm::None` otherwise.
+  static auto Lex(llvm::StringRef source_text)
+      -> llvm::Optional<StringLiteralToken>;
+
+  // The result of expanding escape sequences in a string literal.
+  struct ExpandedValue {
+    // The computed value of the literal.
+    std::string result;
+    // Whether any diagnostic was emitted while computing `result`.
+    bool has_errors;
+  };
+
+  // Expand any escape sequences in the given string literal and compute the
+  // resulting value. Problems are reported through `emitter`.
+  auto ComputeValue(DiagnosticEmitter& emitter) const -> ExpandedValue;
+
+ private:
+  StringLiteralToken(llvm::StringRef text, llvm::StringRef content,
+                     int hash_level, bool multi_line)
+      : text(text),
+        content(content),
+        hash_level(hash_level),
+        multi_line(multi_line) {}
+
+  // The complete text of the string literal.
+  llvm::StringRef text;
+  // The content of the literal. For a multi-line literal, this begins
+  // immediately after the newline following the file type indicator, and ends
+  // at the start of the closing `"""`. Whitespace is not trimmed from either
+  // end.
+  llvm::StringRef content;
+  // The number of `#`s preceding the opening `"` or `"""`.
+  int hash_level;
+  // Whether this was a multi-line string literal.
+  bool multi_line;
+};
+
+}  // namespace Carbon
+
+#endif  // LEXER_STRING_LITERAL_H_

+ 257 - 0
lexer/string_literal_test.cpp

@@ -0,0 +1,257 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "lexer/string_literal.h"
+
+#include "diagnostics/diagnostic_emitter.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace Carbon {
+namespace {
+
+struct StringLiteralTest : ::testing::Test {
+  auto Lex(llvm::StringRef text) -> StringLiteralToken {
+    llvm::Optional<StringLiteralToken> result = StringLiteralToken::Lex(text);
+    assert(result);  // Callers must pass text that lexes as a literal.
+    EXPECT_EQ(result->Text(), text);  // The token must span all of `text`.
+    return *result;
+  }
+
+  auto Parse(llvm::StringRef text) -> StringLiteralToken::ExpandedValue {
+    StringLiteralToken token = Lex(text);  // Lex first, then compute value.
+    return token.ComputeValue(ConsoleDiagnosticEmitter());
+  }
+};
+
+TEST_F(StringLiteralTest, StringLiteralBounds) {
+  llvm::StringLiteral valid[] = {
+      R"("")",
+      R"("""
+      """)",
+      R"("""
+      "foo"
+      """)",
+
+      // Escaped terminators don't end the string.
+      R"("\"")",
+      R"("\\")",
+      R"("\\\"")",
+      R"("""
+      \"""
+      """)",
+      R"("""
+      "\""
+      """)",
+      R"("""
+      ""\"
+      """)",
+      R"("""
+      ""\
+      """)",
+      R"(#"""
+      """\#n
+      """#)",
+
+      // Only a matching number of '#'s terminates the string.
+      R"(#""#)",
+      R"(#"xyz"foo"#)",
+      R"(##"xyz"#foo"##)",
+      R"(#"\""#)",
+
+      // Escape sequences likewise require a matching number of '#'s.
+      R"(#"\#"#"#)",
+      R"(#"\"#)",
+      R"(#"""
+      \#"""#
+      """#)",
+
+      // #"""# does not start a multiline string literal.
+      R"(#"""#)",
+      R"(##"""##)",
+  };
+
+  for (llvm::StringLiteral test : valid) {  // Must lex, spanning all of test.
+    llvm::Optional<StringLiteralToken> result = StringLiteralToken::Lex(test);
+    EXPECT_TRUE(result.hasValue()) << test;
+    if (result) {
+      EXPECT_EQ(result->Text(), test);
+    }
+  }
+
+  llvm::StringLiteral invalid[] = {
+      R"(")",
+      R"("""
+      "")",
+      R"("\)",
+      R"("\")",
+      R"("\\)",
+      R"("\\\")",
+      R"("""
+      )",
+      R"(#"""
+      """)",
+  };
+
+  for (llvm::StringLiteral test : invalid) {  // Must not lex at all.
+    EXPECT_FALSE(StringLiteralToken::Lex(test).hasValue())
+        << "`" << test << "`";
+  }
+}
+
+TEST_F(StringLiteralTest, StringLiteralContents) {
+  // Embedded nul characters below are written with withInnerNUL; ""s is unused.
+  using std::operator""s;
+
+  std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
+      // Empty strings.
+      {R"("")", ""},
+
+      {R"(
+"""
+"""
+       )",
+       ""},
+
+      // Nearly-empty strings.
+      {R"(
+"""
+
+"""
+       )",
+       "\n"},
+
+      // Indent removal.
+      {R"(
+       """file type indicator
+          indented contents \
+         """
+       )",
+       " indented contents "},
+
+      {R"(
+    """
+   hello
+  world
+
+   end of test
+  """
+       )",
+       " hello\nworld\n\n end of test\n"},
+
+      // Escape sequences.
+      {R"(
+       "\x14,\u{1234},\u{00000010},\n,\r,\t,\0,\",\',\\"
+       )",
+       llvm::StringLiteral::withInnerNUL(
+           "\x14,\xE1\x88\xB4,\x10,\x0A,\x0D,\x09,\x00,\x22,\x27,\x5C")},
+
+      {R"(
+       "\0A\x1234"
+       )",
+       llvm::StringLiteral::withInnerNUL("\0A\x12"
+                                         "34")},
+
+      {R"(
+       "\u{D7FF},\u{E000},\u{10FFFF}"
+       )",
+       "\xED\x9F\xBF,\xEE\x80\x80,\xF4\x8F\xBF\xBF"},
+
+      // Escape sequences in 'raw' strings.
+      {R"(
+       #"\#x00,\#xFF,\#u{56789},\#u{ABCD},\#u{00000000000000000EF}"#
+       )",
+       llvm::StringLiteral::withInnerNUL(
+           "\x00,\xFF,\xF1\x96\x9E\x89,\xEA\xAF\x8D,\xC3\xAF")},
+
+      {R"(
+       ##"\n,\#n,\##n,\##\##n,\##\###n"##
+       )",
+       "\\n,\\#n,\n,\\##n,\\###n"},
+
+      // Trailing whitespace handling.
+      {"\"\"\"\n  Hello \\\n  World \t \n  Bye!  \\\n  \"\"\"",
+       "Hello World\nBye!  "},
+  };
+
+  for (auto [test, contents] : testcases) {
+    auto value = Parse(test.trim());  // Drop the padding around the literal.
+    EXPECT_FALSE(value.has_errors) << "`" << test << "`";
+    EXPECT_EQ(value.result, contents);
+  }
+}
+
+TEST_F(StringLiteralTest, StringLiteralBadIndent) {
+  std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
+    // {literal, recovered value}. Indent doesn't match the last line.
+    {"\"\"\"\n \tx\n  \"\"\"", "x\n"},
+    {"\"\"\"\n x\n  \"\"\"", "x\n"},
+    {"\"\"\"\n  x\n\t\"\"\"", "x\n"},
+    {"\"\"\"\n  ok\n bad\n  \"\"\"", "ok\nbad\n"},
+    {"\"\"\"\n bad\n  ok\n  \"\"\"", "bad\nok\n"},
+    {"\"\"\"\n  escaped,\\\n bad\n  \"\"\"", "escaped,bad\n"},
+
+    // Indent on last line is followed by text.
+    {"\"\"\"\n  x\n  x\"\"\"", "x\nx"},
+    {"\"\"\"\n   x\n  x\"\"\"", " x\nx"},
+    {"\"\"\"\n x\n  x\"\"\"", "x\nx"},
+  };
+
+  for (auto [test, contents] : testcases) {
+    auto value = Parse(test);
+    EXPECT_TRUE(value.has_errors) << "`" << test << "`";
+    EXPECT_EQ(value.result, contents);  // Value produced by error recovery.
+  }
+}
+
+TEST_F(StringLiteralTest, StringLiteralBadEscapeSequence) {
+  llvm::StringLiteral testcases[] = {
+    R"("\a")",  // Unrecognized escape characters.
+    R"("\b")",
+    R"("\e")",
+    R"("\f")",
+    R"("\v")",
+    R"("\?")",
+    R"("\1")",
+    R"("\9")",
+
+    // \0 can't be followed by a decimal digit.
+    R"("\01")",
+    R"("\09")",
+
+    // \x requires two (uppercase) hexadecimal digits.
+    R"("\x")",
+    R"("\x0")",
+    R"("\x0G")",
+    R"("\xab")",
+    R"("\x\n")",
+    R"("\x\"")",
+
+    // \u requires a braced list of one or more (uppercase) hexadecimal digits.
+    R"("\u")",
+    R"("\u?")",
+    R"("\u\"")",
+    R"("\u{")",
+    R"("\u{}")",
+    R"("\u{A")",
+    R"("\u{G}")",
+    R"("\u{0000012323127z}")",
+    R"("\u{-3}")",
+
+    // \u must specify a non-surrogate code point no greater than 0x10FFFF.
+    R"("\u{110000}")",
+    R"("\u{000000000000000000000000000000000110000}")",
+    R"("\u{D800}")",
+    R"("\u{DFFF}")",
+  };
+
+  for (llvm::StringLiteral test : testcases) {
+    auto value = Parse(test);  // Each case still lexes; only its value is bad.
+    EXPECT_TRUE(value.has_errors) << "`" << test << "`";
+    // TODO: Test value produced by error recovery.
+  }
+}
+
+}  // namespace
+}  // namespace Carbon

+ 1 - 1
lexer/token_registry.def

@@ -71,7 +71,6 @@ CARBON_SYMBOL_TOKEN(Comma,               ",")
 CARBON_SYMBOL_TOKEN(Equal,               "=")
 CARBON_SYMBOL_TOKEN(Exclaim,             "!")
 CARBON_SYMBOL_TOKEN(Greater,             ">")
-CARBON_SYMBOL_TOKEN(Hash,                "#")
 CARBON_SYMBOL_TOKEN(Less,                "<")
 CARBON_SYMBOL_TOKEN(Minus,               "-")
 CARBON_SYMBOL_TOKEN(Percent,             "%")
@@ -157,6 +156,7 @@ CARBON_KEYWORD_TOKEN(XorKeyword,        "xor")
 CARBON_TOKEN(Identifier)
 CARBON_TOKEN(IntegerLiteral)
 CARBON_TOKEN(RealLiteral)
+CARBON_TOKEN(StringLiteral)
 CARBON_TOKEN(Error)
 
 #undef CARBON_TOKEN

+ 86 - 7
lexer/tokenized_buffer.cpp

@@ -10,6 +10,7 @@
 #include <string>
 
 #include "lexer/numeric_literal.h"
+#include "lexer/string_literal.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
@@ -104,6 +105,18 @@ class TokenizedBuffer::Lexer {
     explicit operator bool() const { return formed_token; }
   };
 
+  // Closes out the current line at the current column and begins a new one
+  // immediately after the newline character, resetting the column and the
+  // flag that records whether the line's indent has been set.
+  auto HandleNewline() -> void {
+    // The finished line's length does not include the newline itself.
+    current_line_info->length = current_column;
+
+    // Skip the newline (+1) to find where the next line starts. The line info
+    // pointer is fetched afresh for the newly added line.
+    const auto next_line_start = current_line_info->start + current_column + 1;
+    current_line = buffer.AddLine({next_line_start, 0, 0});
+    current_line_info = &buffer.GetLineInfo(current_line);
+
+    set_indent = false;
+    current_column = 0;
+  }
+
   auto SkipWhitespace(llvm::StringRef& source_text) -> bool {
     while (!source_text.empty()) {
       // We only support line-oriented commenting and lex comments as-if they
@@ -136,21 +149,16 @@ class TokenizedBuffer::Lexer {
           return true;
 
         case '\n':
-          // New lines are special in order to track line structure.
-          current_line_info->length = current_column;
           // If this is the last character in the source, directly return here
           // to avoid creating an empty line.
           source_text = source_text.drop_front();
           if (source_text.empty()) {
+            current_line_info->length = current_column;
             return false;
           }
 
           // Otherwise, add a line and set up to continue lexing.
-          current_line = buffer.AddLine(
-              {current_line_info->start + current_column + 1, 0, 0});
-          current_line_info = &buffer.GetLineInfo(current_line);
-          current_column = 0;
-          set_indent = false;
+          HandleNewline();
           continue;
 
         case ' ':
@@ -231,6 +239,53 @@ class TokenizedBuffer::Lexer {
     }
   }
 
+  auto LexStringLiteral(llvm::StringRef& source_text) -> LexResult {
+    llvm::Optional<StringLiteralToken> literal =
+        StringLiteralToken::Lex(source_text);
+    if (!literal) {
+      return LexResult::NoMatch();  // Nothing is consumed from source_text.
+    }
+
+    Line string_line = current_line;  // The token is placed where it starts.
+    int string_column = current_column;
+    int literal_size = literal->Text().size();
+    source_text = source_text.drop_front(literal_size);
+
+    if (!set_indent) {
+      current_line_info->indent = string_column;
+      set_indent = true;
+    }
+
+    // Advance the current line and column past the literal.
+    if (!literal->IsMultiLine()) {
+      current_column += literal_size;
+    } else {
+      for (char c : literal->Text()) {
+        if (c == '\n') {
+          HandleNewline();
+          // The indentation of all lines in a multi-line string literal is
+          // that of the first line.
+          current_line_info->indent = string_column;
+          set_indent = true;
+        } else {
+          ++current_column;
+        }
+      }
+    }
+
+    // Determine string literal value, recording any errors on the buffer.
+    auto expanded = literal->ComputeValue(emitter);
+    buffer.has_errors |= expanded.has_errors;
+
+    auto token = buffer.AddToken({.kind = TokenKind::StringLiteral(),
+                                  .token_line = string_line,
+                                  .column = string_column});
+    buffer.GetTokenInfo(token).literal_index =
+        buffer.literal_string_storage.size();  // Index of the value pushed next.
+    buffer.literal_string_storage.push_back(std::move(expanded.result));
+    return token;
+  }
+
   auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
     TokenKind kind = llvm::StringSwitch<TokenKind>(source_text)
 #define CARBON_SYMBOL_TOKEN(Name, Spelling) \
@@ -420,6 +475,9 @@ auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticEmitter& emitter)
     if (!result) {
       result = lexer.LexNumericLiteral(source_text);
     }
+    if (!result) {
+      result = lexer.LexStringLiteral(source_text);
+    }
     if (!result) {
       result = lexer.LexError(source_text);
     }
@@ -471,6 +529,17 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
     return relexed_token->Text();
   }
 
+  // Refer back to the source text to find the original spelling, including
+  // escape sequences etc.
+  if (token_info.kind == TokenKind::StringLiteral()) {
+    auto& line_info = GetLineInfo(token_info.token_line);
+    int64_t token_start = line_info.start + token_info.column;
+    llvm::Optional<StringLiteralToken> relexed_token =
+        StringLiteralToken::Lex(source->Text().substr(token_start));
+    assert(relexed_token && "Could not reform string literal token.");
+    return relexed_token->Text();
+  }
+
   assert(token_info.kind == TokenKind::Identifier() &&
          "Only identifiers have stored text!");
   return GetIdentifierText(token_info.id);
@@ -507,6 +576,13 @@ auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
   return RealLiteralValue(this, token_info.literal_index, is_decimal);
 }
 
+// Looks up the stored value of a string literal token. The token's
+// `literal_index` selects the entry in `literal_string_storage`.
+auto TokenizedBuffer::GetStringLiteral(Token token) const -> llvm::StringRef {
+  const auto& info = GetTokenInfo(token);
+  assert(info.kind == TokenKind::StringLiteral() &&
+         "The token must be a string literal!");
+  const auto& value = literal_string_storage[info.literal_index];
+  return value;
+}
+
 auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
     -> Token {
   auto& opening_token_info = GetTokenInfo(opening_token);
@@ -624,7 +700,10 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream, Token token,
     output_stream << ", closing_token: " << GetMatchedClosingToken(token).index;
   } else if (token_info.kind.IsClosingSymbol()) {
     output_stream << ", opening_token: " << GetMatchedOpeningToken(token).index;
+  } else if (token_info.kind == TokenKind::StringLiteral()) {
+    output_stream << ", value: `" << GetStringLiteral(token) << "`";
   }
+  // TODO: Include value for numeric literals.
 
   if (token_info.is_recovery) {
     output_stream << ", recovery: true";

+ 5 - 0
lexer/tokenized_buffer.h

@@ -263,6 +263,9 @@ class TokenizedBuffer {
   // Returns the value of an `RealLiteral()` token.
   [[nodiscard]] auto GetRealLiteral(Token token) const -> RealLiteralValue;
 
+  // Returns the value of a `StringLiteral()` token. The token must be one.
+  [[nodiscard]] auto GetStringLiteral(Token token) const -> llvm::StringRef;
+
   // Returns the closing token matched with the given opening token.
   //
   // The given token must be an opening token kind.
@@ -402,6 +405,8 @@ class TokenizedBuffer {
   // Storage for integers that form part of the value of a numeric literal.
   llvm::SmallVector<llvm::APInt, 16> literal_int_storage;
 
+  llvm::SmallVector<std::string, 16> literal_string_storage;
+
   llvm::DenseMap<llvm::StringRef, Identifier> identifier_map;
 
   bool has_errors = false;

+ 140 - 4
lexer/tokenized_buffer_test.cpp

@@ -51,7 +51,7 @@ TEST_F(LexerTest, HandlesEmptyBuffer) {
 }
 
 TEST_F(LexerTest, TracksLinesAndColumns) {
-  auto buffer = Lex("\n  ;;\n   ;;;\n");
+  auto buffer = Lex("\n  ;;\n   ;;;\n   x\"foo\" \"\"\"baz\n  a\n \"\"\" y");
   EXPECT_FALSE(buffer.HasErrors());
   EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::Semi(),
@@ -74,6 +74,24 @@ TEST_F(LexerTest, TracksLinesAndColumns) {
                            .line = 3,
                            .column = 6,
                            .indent_column = 4},
+                          {.kind = TokenKind::Identifier(),
+                           .line = 4,
+                           .column = 4,
+                           .indent_column = 4,
+                           .text = "x"},
+                          {.kind = TokenKind::StringLiteral(),
+                           .line = 4,
+                           .column = 5,
+                           .indent_column = 4},
+                          {.kind = TokenKind::StringLiteral(),
+                           .line = 4,
+                           .column = 11,
+                           .indent_column = 4},
+                          {.kind = TokenKind::Identifier(),
+                           .line = 6,
+                           .column = 6,
+                           .indent_column = 11,
+                           .text = "y"},
                       }));
 }
 
@@ -250,7 +268,7 @@ TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
 }
 
 TEST_F(LexerTest, HandlesGarbageCharacters) {
-  constexpr char GarbageText[] = "$$💩-$\n$\0$12$";
+  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\"\n\"\\";
   auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
   EXPECT_TRUE(buffer.HasErrors());
   EXPECT_THAT(
@@ -273,6 +291,20 @@ TEST_F(LexerTest, HandlesGarbageCharacters) {
            .column = 4,
            .text = "12"},
           {.kind = TokenKind::Error(), .line = 2, .column = 6, .text = "$"},
+          // newline
+          {.kind = TokenKind::Error(),
+           .line = 3,
+           .column = 1,
+           .text = llvm::StringRef("\"", 1)},
+          // newline
+          {.kind = TokenKind::Error(),
+           .line = 4,
+           .column = 1,
+           .text = llvm::StringRef("\"", 1)},
+          {.kind = TokenKind::Backslash(),
+           .line = 4,
+           .column = 2,
+           .text = llvm::StringRef("\\", 1)},
       }));
 }
 
@@ -302,13 +334,12 @@ TEST_F(LexerTest, Symbols) {
                           {TokenKind::Greater()},
                       }));
 
-  buffer = Lex("\\/?#@&^!");
+  buffer = Lex("\\/?@&^!");
   EXPECT_FALSE(buffer.HasErrors());
   EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {TokenKind::Backslash()},
                           {TokenKind::Slash()},
                           {TokenKind::Question()},
-                          {TokenKind::Hash()},
                           {TokenKind::At()},
                           {TokenKind::Amp()},
                           {TokenKind::Caret()},
@@ -614,6 +645,111 @@ TEST_F(LexerTest, Identifiers) {
                       }));
 }
 
+// Lexes a source text mixing single-line, multi-line (`"""`), and
+// `#`-delimited string literals, and checks the location and the computed
+// contents of every resulting token.
+TEST_F(LexerTest, StringLiterals) {
+  llvm::StringLiteral source_text = R"(
+    "hello world\n"
+
+    """foo
+      test \
+      \xAB
+     """ trailing
+
+      #"""#
+
+    "\0"
+
+    #"\0"foo"\1"#
+
+    """x"""
+  )";
+
+  // One entry per expected token, in source order. Lines and columns are
+  // 1-based positions within `source_text`, whose first line is empty.
+  const ExpectedToken expected_tokens[] = {
+      // Single-line literal; the `\n` escape becomes a real newline.
+      {.kind = TokenKind::StringLiteral(),
+       .line = 2,
+       .column = 5,
+       .indent_column = 5,
+       .string_contents = {"hello world\n"}},
+      // Multi-line literal spanning source lines 4 through 7.
+      {.kind = TokenKind::StringLiteral(),
+       .line = 4,
+       .column = 5,
+       .indent_column = 5,
+       .string_contents = {" test  \xAB\n"}},
+      // Identifier directly after the closing `"""` on the same line.
+      {.kind = TokenKind::Identifier(),
+       .line = 7,
+       .column = 10,
+       .indent_column = 5,
+       .text = "trailing"},
+      // `#"""#` yields a single double-quote character.
+      {.kind = TokenKind::StringLiteral(),
+       .line = 9,
+       .column = 7,
+       .indent_column = 7,
+       .string_contents = {"\""}},
+      // `"\0"` yields a single NUL character.
+      {.kind = TokenKind::StringLiteral(),
+       .line = 11,
+       .column = 5,
+       .indent_column = 5,
+       .string_contents = llvm::StringLiteral::withInnerNUL("\0")},
+      // Inside `#"..."#` the backslashes and inner quotes are kept verbatim.
+      {.kind = TokenKind::StringLiteral(),
+       .line = 13,
+       .column = 5,
+       .indent_column = 5,
+       .string_contents = {"\\0\"foo\"\\1"}},
+
+      // On a single line, `"""x"""` lexes as three separate literals:
+      // `""`, `"x"`, and `""`.
+      {.kind = TokenKind::StringLiteral(),
+       .line = 15,
+       .column = 5,
+       .indent_column = 5,
+       .string_contents = {""}},
+      {.kind = TokenKind::StringLiteral(),
+       .line = 15,
+       .column = 7,
+       .indent_column = 5,
+       .string_contents = {"x"}},
+      {.kind = TokenKind::StringLiteral(),
+       .line = 15,
+       .column = 10,
+       .indent_column = 5,
+       .string_contents = {""}},
+  };
+
+  auto buffer = Lex(source_text);
+  EXPECT_FALSE(buffer.HasErrors());
+  EXPECT_THAT(buffer,
+              HasTokens(llvm::ArrayRef<ExpectedToken>(expected_tokens)));
+}
+
+// Lexes a series of malformed string literals and checks that each one both
+// marks the buffer as having errors and produces at least one error token.
+TEST_F(LexerTest, InvalidStringLiterals) {
+  llvm::StringLiteral malformed_inputs[] = {
+      R"(")",
+      R"("""
+      "")",
+      R"("\)",
+      R"("\")",
+      R"("\\)",
+      R"("\\\")",
+      R"(""")",
+      R"("""
+      )",
+      R"("""\)",
+      R"(#"""
+      """)",
+  };
+
+  for (llvm::StringLiteral input : malformed_inputs) {
+    auto buffer = Lex(input);
+    EXPECT_TRUE(buffer.HasErrors()) << "`" << input << "`";
+
+    // Scan the token stream, stopping at the first error token.
+    const bool found_error = [&buffer] {
+      for (TokenizedBuffer::Token token : buffer.Tokens()) {
+        if (buffer.GetKind(token) == TokenKind::Error()) {
+          return true;
+        }
+      }
+      return false;
+    }();
+    EXPECT_TRUE(found_error) << "`" << input << "`";
+  }
+}
+
 auto GetAndDropLine(llvm::StringRef& text) -> std::string {
   auto newline_offset = text.find_first_of('\n');
   llvm::StringRef line = text.slice(0, newline_offset);

+ 19 - 2
lexer/tokenized_buffer_test_helpers.h

@@ -26,9 +26,9 @@ namespace Testing {
 struct ExpectedToken {
   friend auto operator<<(std::ostream& output, const ExpectedToken& expected)
       -> std::ostream& {
-    output << "\ntoken: { kind: '" << expected.kind.Name().str();
+    output << "\ntoken: { kind: '" << expected.kind.Name().str() << "'";
     if (expected.line != -1) {
-      output << "', line: " << expected.line;
+      output << ", line: " << expected.line;
     }
     if (expected.column != -1) {
       output << ", column " << expected.column;
@@ -39,6 +39,10 @@ struct ExpectedToken {
     if (!expected.text.empty()) {
       output << ", spelling: '" << expected.text.str() << "'";
     }
+    if (expected.string_contents) {
+      output << ", string contents: '" << expected.string_contents->str()
+             << "'";
+    }
     if (expected.recovery) {
       output << ", recovery: true";
     }
@@ -52,6 +56,7 @@ struct ExpectedToken {
   int indent_column = -1;
   bool recovery = false;
   llvm::StringRef text = "";
+  llvm::Optional<llvm::StringRef> string_contents = llvm::None;
 };
 
 // TODO: Consider rewriting this into a `TokenEq` matcher which is used inside
@@ -123,6 +128,18 @@ MATCHER_P(HasTokens, raw_all_expected, "") {
                        << expected.text.str() << "`.";
       matches = false;
     }
+
+    assert(!expected.string_contents ||
+           expected.kind == TokenKind::StringLiteral());
+    if (expected.string_contents && actual_kind == TokenKind::StringLiteral()) {
+      llvm::StringRef actual_contents = buffer.GetStringLiteral(token);
+      if (actual_contents != *expected.string_contents) {
+        *result_listener << "\nToken " << index << " has contents `"
+                         << actual_contents.str() << "`, expected `"
+                         << expected.string_contents->str() << "`.";
+        matches = false;
+      }
+    }
   }
 
   int actual_size = buffer.Tokens().end() - buffer.Tokens().begin();