5 ani în urmă · 67ee1fcaa7
--- a/lexer/BUILD
+++ b/lexer/BUILD
@@ -49,11 +49,35 @@ cc_test(
 
				     ],
			
 
				 )
			
 
				 
			
 
				+# Library that lexes string literals and computes their values.
				+cc_library(
				+    name = "string_literal",
				+    srcs = ["string_literal.cpp"],
				+    hdrs = ["string_literal.h"],
				+    deps = [
				+        "//diagnostics:diagnostic_emitter",
				+        "@llvm-project//llvm:Support",
				+    ],
				+)
			
 
				+
			
 
				+# Unit tests for the :string_literal library.
				+cc_test(
				+    name = "string_literal_test",
				+    srcs = ["string_literal_test.cpp"],
				+    deps = [
				+        ":string_literal",
				+        "//diagnostics:diagnostic_emitter",
				+        "@llvm-project//llvm:Support",
				+        "@llvm-project//llvm:gmock",
				+        "@llvm-project//llvm:gtest",
				+        "@llvm-project//llvm:gtest_main",
				+    ],
				+)
			
 
				+
			
 
				 cc_library(
			
 
				     name = "tokenized_buffer",
			
 
				     srcs = ["tokenized_buffer.cpp"],
			
 
				     hdrs = ["tokenized_buffer.h"],
			
 
				     deps = [
			
 
				+        ":string_literal",
			
 
				         ":token_kind",
			
 
				         ":numeric_literal",
			
 
				         "//diagnostics:diagnostic_emitter",
			
--- a/lexer/string_literal.cpp
+++ b/lexer/string_literal.cpp
@@ -0,0 +1,388 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "lexer/string_literal.h"
			
 
				+
			
 
				+#include "llvm/ADT/SmallString.h"
			
 
				+#include "llvm/ADT/StringExtras.h"
			
 
				+#include "llvm/Support/ConvertUTF.h"
			
 
				+#include "llvm/Support/ErrorHandling.h"
			
 
				+#include "llvm/Support/FormatVariadic.h"
			
 
				+
			
 
				+namespace Carbon {
			
 
				+
			
 
				+// Diagnostic emitted when the final line of a multi-line string literal has
				+// content other than its indentation before the closing `"""`.
				+struct ContentBeforeStringTerminator
				+    : SimpleDiagnostic<ContentBeforeStringTerminator> {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr llvm::StringLiteral Message =
				+      "Only whitespace is permitted before the closing `\"\"\"` of a "
				+      "multi-line string.";
				+};
			
 
				+
			
 
				+// Diagnostic emitted when the digits of a `\u{...}` escape cannot be parsed
				+// into an unsigned value or name a code point above 0x10FFFF.
				+struct UnicodeEscapeTooLarge : SimpleDiagnostic<UnicodeEscapeTooLarge> {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr llvm::StringLiteral Message =
				+      "Code point specified by `\\u{...}` escape is greater than 0x10FFFF.";
				+};
			
 
				+
			
 
				+// Diagnostic emitted when a `\u{...}` escape names a code point in the
				+// surrogate range [0xD800, 0xE000).
				+struct UnicodeEscapeSurrogate : SimpleDiagnostic<UnicodeEscapeSurrogate> {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr llvm::StringLiteral Message =
				+      "Code point specified by `\\u{...}` escape is a surrogate character.";
				+};
			
 
				+
			
 
				+// Diagnostic emitted when `\u` is not followed by `{`, one or more uppercase
				+// hexadecimal digits, and `}`.
				+struct UnicodeEscapeMissingBracedDigits
				+    : SimpleDiagnostic<UnicodeEscapeMissingBracedDigits> {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr llvm::StringLiteral Message =
				+      "Escape sequence `\\u` must be followed by a braced sequence of "
				+      "uppercase hexadecimal digits, for example `\\u{70AD}`.";
				+};
			
 
				+
			
 
				+// Diagnostic emitted when `\x` is not followed by exactly two uppercase
				+// hexadecimal digits.
				+struct HexadecimalEscapeMissingDigits
				+    : SimpleDiagnostic<HexadecimalEscapeMissingDigits> {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr llvm::StringLiteral Message =
				+      "Escape sequence `\\x` must be followed by two "
				+      "uppercase hexadecimal digits, for example `\\x0F`.";
				+};
			
 
				+
			
 
				+// Diagnostic emitted when a `\0` escape is immediately followed by a decimal
				+// digit.
				+struct DecimalEscapeSequence : SimpleDiagnostic<DecimalEscapeSequence> {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr llvm::StringLiteral Message =
				+      "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead of "
				+      "`\\0` if the next character is a digit.";
				+};
			
 
				+
			
 
				+// Diagnostic emitted for an escape sequence whose first character is not
				+// recognized. Unlike the other diagnostics in this file, it carries a
				+// parameter and formats its own message.
				+struct UnknownEscapeSequence {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr const char* Message = "Unrecognized escape sequence `{0}`.";
				+
				+  // The character that followed the escape sequence introducer.
				+  char first;
				+
				+  // Renders the message with `first` substituted for `{0}`.
				+  auto Format() -> std::string { return llvm::formatv(Message, first).str(); }
				+};
			
 
				+
			
 
				+// Diagnostic emitted when a line of a multi-line string literal that has
				+// content other than horizontal whitespace does not begin with the indent of
				+// the closing line.
				+struct MismatchedIndentInString : SimpleDiagnostic<MismatchedIndentInString> {
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
				+  static constexpr llvm::StringLiteral Message =
				+      "Indentation does not match that of the closing \"\"\" in multi-line "
				+      "string literal.";
				+};
			
 
				+
			
 
				+// TODO(zygoloid): Update this to match whatever we decide qualifies as
				+// acceptable whitespace.
				+//
				+// Returns true if `c` is a space, a newline, or a horizontal tab.
				+static bool isSpace(char c) {
				+  switch (c) {
				+    case ' ':
				+    case '\n':
				+    case '\t':
				+      return true;
				+    default:
				+      return false;
				+  }
				+}
			
 
				+
			
 
				+// The characters treated as horizontal whitespace: space and tab.
				+static constexpr llvm::StringLiteral HorizontalWhitespace = " \t";
			
 
				+
			
 
				+// Returns true if `c` is a decimal digit or one of the uppercase letters
				+// `A` through `F`. Lowercase hexadecimal letters are rejected.
				+static bool isUpperHexDigit(char c) {
				+  if (c >= '0' && c <= '9') {
				+    return true;
				+  }
				+  return c >= 'A' && c <= 'F';
				+}
			
 
				+
			
 
				+// Returns the opening of a multi-line string literal found at the start of
				+// `source_text` (which must already have any leading '#'s removed): the
				+// `"""`, the file type indicator, and the newline that ends that line.
				+// Returns an empty StringRef if `source_text` does not start that way.
				+static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
				+    -> llvm::StringRef {
				+  constexpr size_t QuotesSize = 3;
				+  if (!source_text.startswith("\"\"\"")) {
				+    return llvm::StringRef();
				+  }
				+
				+  // The file type indicator runs up to the first '"', '#', or newline. It is
				+  // only valid if the character that stops it is the newline.
				+  const size_t stop = source_text.find_first_of("\"#\n", QuotesSize);
				+  if (stop == llvm::StringRef::npos || source_text[stop] != '\n') {
				+    return llvm::StringRef();
				+  }
				+
				+  // Include the newline itself in the prefix.
				+  return source_text.take_front(stop + 1);
				+}
			
 
				+
			
 
				+// If source_text begins with a string literal token, extract and return
				+// information on that token.
				+//
				+// Returns llvm::None if source_text does not begin with a string literal, or
				+// if the literal is unterminated: the text ends before the terminator is
				+// found, or a single-line literal reaches a newline (escaped or not).
				+auto StringLiteralToken::Lex(llvm::StringRef source_text)
				+    -> llvm::Optional<StringLiteralToken> {
				+  const char* begin = source_text.begin();
				+
				+  int hash_level = 0;
				+  while (source_text.consume_front("#")) {
				+    ++hash_level;
				+  }
				+
				+  llvm::SmallString<16> terminator("\"");
				+  llvm::SmallString<16> escape("\\");
				+
				+  llvm::StringRef multi_line_prefix =
				+      TakeMultiLineStringLiteralPrefix(source_text);
				+  bool multi_line = !multi_line_prefix.empty();
				+  if (multi_line) {
				+    source_text = source_text.drop_front(multi_line_prefix.size());
				+    terminator = "\"\"\"";
				+  } else if (!source_text.consume_front("\"")) {
				+    return llvm::None;
				+  }
				+
				+  // The terminator and escape sequence marker require a number of '#'s
				+  // matching the leading sequence of '#'s.
				+  terminator.resize(terminator.size() + hash_level, '#');
				+  escape.resize(escape.size() + hash_level, '#');
				+
				+  const char* content_begin = source_text.begin();
				+  const char* content_end = content_begin;
				+  while (!source_text.consume_front(terminator)) {
				+    // Let LexError figure out how to recover from an unterminated string
				+    // literal.
				+    if (source_text.empty()) {
				+      return llvm::None;
				+    }
				+
				+    // Consume an escape sequence marker if present.
				+    (void)source_text.consume_front(escape);
				+
				+    // A single-line literal must not span lines. This check runs after the
				+    // escape marker has been consumed so that an escaped newline is rejected
				+    // as well as an unescaped one; escaped newlines are only meaningful in
				+    // multi-line literals.
				+    if (!multi_line && source_text.startswith("\n")) {
				+      return llvm::None;
				+    }
				+
				+    // Then consume one more character, either of the content or of an
				+    // escape sequence. This relies on multi-character escape sequences
				+    // not containing an embedded and unescaped terminator or newline.
				+    // `substr(1)` on empty text yields empty text, so a trailing escape
				+    // marker is reported as unterminated on the next iteration.
				+    source_text = source_text.substr(1);
				+    content_end = source_text.begin();
				+  }
				+
				+  return StringLiteralToken(
				+      llvm::StringRef(begin, source_text.begin() - begin),
				+      llvm::StringRef(content_begin, content_end - content_begin), hash_level,
				+      multi_line);
				+}
			
 
				+
			
 
				+// Returns the indent of the final line of `text`: the run of whitespace
				+// between the last newline and the first non-whitespace character after it.
				+// `text` is required to contain at least one newline.
				+static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
				+  const size_t last_newline = text.rfind('\n');
				+  if (last_newline == llvm::StringRef::npos) {
				+    llvm_unreachable("Given text is required to contain a newline.");
				+  }
				+
				+  // The final line holds no newline, so the leading whitespace that isSpace
				+  // accepts here can only be spaces and tabs.
				+  llvm::StringRef final_line = text.substr(last_newline + 1);
				+  return final_line.take_while(isSpace);
				+}
			
 
				+
			
 
				+namespace {
				+// The leading whitespace in a multi-line string literal.
				+struct Indent {
				+  // The whitespace expected at the start of each line of the literal.
				+  llvm::StringRef indent;
				+  // Whether a diagnostic was emitted while determining the indent.
				+  bool has_errors;
				+};
				+}  // namespace
			
 
				+
			
 
				+// Finds the leading whitespace that should be removed from each line of a
				+// multi-line string literal, and checks that the closing line of the literal
				+// holds nothing but that whitespace. `text` is the whole literal and
				+// `content` is the part between its delimiters.
				+static auto CheckIndent(DiagnosticEmitter& emitter, llvm::StringRef text,
				+                        llvm::StringRef content) -> Indent {
				+  // The indent is taken from the final line of the literal. Note that for an
				+  // empty literal, this might not be inside the content.
				+  Indent found = {.indent = ComputeIndentOfFinalLine(text),
				+                  .has_errors = false};
				+
				+  // If the indent stops short of the end of the content, the last line has
				+  // content after its indentation, which is not permitted.
				+  if (found.indent.end() != content.end()) {
				+    emitter.EmitError<ContentBeforeStringTerminator>();
				+    found.has_errors = true;
				+  }
				+
				+  return found;
				+}
			
 
				+
			
 
				+// Appends the UTF-8 encoding of the code point named by `digits` (the
				+// hexadecimal digits of a `\u{HHHHHH}` escape sequence) to `result`.
				+// Returns false, after emitting a diagnostic and leaving `result` untouched,
				+// if the value is out of range or is a surrogate.
				+static auto ExpandUnicodeEscapeSequence(DiagnosticEmitter& emitter,
				+                                        llvm::StringRef digits,
				+                                        std::string& result) -> bool {
				+  unsigned value;
				+  // getAsInteger returns true on failure, which here means the digits do
				+  // not fit in `unsigned`.
				+  const bool parse_failed = digits.getAsInteger(16, value);
				+  if (parse_failed || value > 0x10FFFF) {
				+    emitter.EmitError<UnicodeEscapeTooLarge>();
				+    return false;
				+  }
				+
				+  const bool is_surrogate = 0xD800 <= value && value < 0xE000;
				+  if (is_surrogate) {
				+    emitter.EmitError<UnicodeEscapeSurrogate>();
				+    return false;
				+  }
				+
				+  // Encode the single code point as UTF-8. Every code point fits in 6 UTF-8
				+  // code units.
				+  const llvm::UTF32 source_units[1] = {value};
				+  const llvm::UTF32* source_cursor = source_units;
				+  llvm::UTF8 encoded[6];
				+  llvm::UTF8* encoded_end = encoded;
				+  if (llvm::ConvertUTF32toUTF8(&source_cursor, source_units + 1,
				+                               &encoded_end, encoded + 6,
				+                               llvm::strictConversion) != llvm::conversionOK) {
				+    llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
				+  }
				+
				+  result.insert(result.end(), reinterpret_cast<char*>(encoded),
				+                reinterpret_cast<char*>(encoded_end));
				+  return true;
				+}
			
 
				+
			
 
				+// Expands one escape sequence and appends its value to `result`.
				+//
				+// `content` must be non-empty and start at the first character after the
				+// escape sequence introducer (for example, the `n` in `\n`). On return it
				+// has been advanced past the characters that were consumed. Returns false
				+// if a diagnostic was emitted.
				+static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
				+                                           llvm::StringRef& content,
				+                                           std::string& result) -> bool {
				+  assert(!content.empty() && "should have escaped closing delimiter");
				+  const char first = content.front();
				+  content = content.drop_front(1);
				+
				+  // Escapes that stand for exactly one fixed character.
				+  bool is_simple = true;
				+  char simple_value = 0;
				+  switch (first) {
				+    case 't':
				+      simple_value = '\t';
				+      break;
				+    case 'n':
				+      simple_value = '\n';
				+      break;
				+    case 'r':
				+      simple_value = '\r';
				+      break;
				+    case '"':
				+      simple_value = '"';
				+      break;
				+    case '\'':
				+      simple_value = '\'';
				+      break;
				+    case '\\':
				+      simple_value = '\\';
				+      break;
				+    default:
				+      is_simple = false;
				+      break;
				+  }
				+  if (is_simple) {
				+    result += simple_value;
				+    return true;
				+  }
				+
				+  if (first == '0') {
				+    // `\0` always contributes a nul character, but is an error when it is
				+    // followed by a decimal digit.
				+    result += '\0';
				+    const bool digit_follows =
				+        !content.empty() && llvm::isDigit(content.front());
				+    if (digit_follows) {
				+      emitter.EmitError<DecimalEscapeSequence>();
				+    }
				+    return !digit_follows;
				+  }
				+
				+  if (first == 'x') {
				+    const bool has_two_digits = content.size() >= 2 &&
				+                                isUpperHexDigit(content[0]) &&
				+                                isUpperHexDigit(content[1]);
				+    if (has_two_digits) {
				+      result +=
				+          static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
				+      content = content.drop_front(2);
				+      return true;
				+    }
				+    emitter.EmitError<HexadecimalEscapeMissingDigits>();
				+  } else if (first == 'u') {
				+    // Parse `{DIGITS}` on a copy so that `content` is only advanced when the
				+    // whole escape sequence is accepted.
				+    llvm::StringRef rest = content;
				+    llvm::StringRef digits;
				+    bool well_formed = rest.consume_front("{");
				+    if (well_formed) {
				+      digits = rest.take_while(isUpperHexDigit);
				+      rest = rest.drop_front(digits.size());
				+      well_formed = !digits.empty() && rest.consume_front("}");
				+    }
				+
				+    if (!well_formed) {
				+      emitter.EmitError<UnicodeEscapeMissingBracedDigits>();
				+    } else if (ExpandUnicodeEscapeSequence(emitter, digits, result)) {
				+      content = rest;
				+      return true;
				+    }
				+    // Otherwise ExpandUnicodeEscapeSequence has already issued a diagnostic.
				+  } else {
				+    emitter.EmitError<UnknownEscapeSequence>({.first = first});
				+  }
				+
				+  // If we get here, we didn't recognize this escape sequence and have already
				+  // issued a diagnostic. For error recovery purposes, expand this escape
				+  // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
				+  result += first;
				+  return false;
				+}
			
 
				+
			
 
				+// Expand any escape sequences in the given string literal.
				+//
				+// `contents` is the text between the literal's delimiters, `hash_level` is
				+// the number of `#`s that must follow a `\` for it to introduce an escape
				+// sequence, and `indent` is the whitespace to strip from the start of every
				+// line. Returns the expanded text together with a flag that is set if any
				+// diagnostic was emitted.
				+static auto ExpandEscapeSequencesAndRemoveIndent(DiagnosticEmitter& emitter,
				+                                                 llvm::StringRef contents,
				+                                                 int hash_level,
				+                                                 llvm::StringRef indent)
				+    -> StringLiteralToken::ExpandedValue {
				+  std::string result;
				+  result.reserve(contents.size());
				+  bool has_errors = false;
				+
				+  // The escape sequence introducer: a `\` followed by `hash_level` `#`s.
				+  llvm::SmallString<16> escape("\\");
				+  escape.resize(1 + hash_level, '#');
				+
				+  // Process each line of the string literal.
				+  while (true) {
				+    // Every non-empty line (that contains anything other than horizontal
				+    // whitespace) is required to start with the string's indent. For error
				+    // recovery, remove all leading whitespace if the indent doesn't match.
				+    if (!contents.consume_front(indent)) {
				+      contents = contents.ltrim(HorizontalWhitespace);
				+      if (!contents.startswith("\n")) {
				+        emitter.EmitError<MismatchedIndentInString>();
				+        has_errors = true;
				+      }
				+    }
				+
				+    // Process the contents of the line.
				+    while (true) {
				+      // Copy everything up to the next newline or `\` verbatim. If neither
				+      // is found, the rest of the contents is copied and `contents` becomes
				+      // empty.
				+      auto end_of_regular_text = contents.find_first_of("\n\\");
				+      result += contents.substr(0, end_of_regular_text);
				+      contents = contents.substr(end_of_regular_text);
				+
				+      if (contents.empty()) {
				+        return {.result = result, .has_errors = has_errors};
				+      }
				+
				+      if (contents.consume_front("\n")) {
				+        // Trailing whitespace before a newline doesn't contribute to the string
				+        // literal value.
				+        while (!result.empty() && result.back() != '\n' &&
				+               isSpace(result.back())) {
				+          result.pop_back();
				+        }
				+        result += '\n';
				+        // Move on to the next line.
				+        break;
				+      }
				+
				+      if (!contents.consume_front(escape)) {
				+        // This is not an escape sequence, just a raw `\`.
				+        result += contents.front();
				+        contents = contents.drop_front(1);
				+        continue;
				+      }
				+
				+      if (contents.consume_front("\n")) {
				+        // An escaped newline ends the line without producing any content and
				+        // without trimming trailing whitespace.
				+        break;
				+      }
				+
				+      // Handle this escape sequence.
				+      if (!ExpandAndConsumeEscapeSequence(emitter, contents, result)) {
				+        has_errors = true;
				+      }
				+    }
				+  }
				+}
			
 
				+
			
 
				+// Computes the value of this literal: checks the indent of a multi-line
				+// literal, then expands escape sequences and strips that indent. The result
				+// is flagged as having errors if either step emitted a diagnostic.
				+auto StringLiteralToken::ComputeValue(DiagnosticEmitter& emitter) const
				+    -> ExpandedValue {
				+  // Single-line literals have no indent to check or remove.
				+  Indent indent_info = Indent();
				+  if (multi_line) {
				+    indent_info = CheckIndent(emitter, text, content);
				+  }
				+
				+  ExpandedValue expanded = ExpandEscapeSequencesAndRemoveIndent(
				+      emitter, content, hash_level, indent_info.indent);
				+  if (indent_info.has_errors) {
				+    expanded.has_errors = true;
				+  }
				+  return expanded;
				+}
			
 
				+
			
 
				+}  // namespace Carbon
			
--- a/lexer/string_literal.h
+++ b/lexer/string_literal.h
@@ -0,0 +1,58 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#ifndef LEXER_STRING_LITERAL_H_
				+#define LEXER_STRING_LITERAL_H_
				+
				+#include <string>
				+
				+#include "diagnostics/diagnostic_emitter.h"
				+#include "llvm/ADT/Optional.h"
				+#include "llvm/ADT/StringRef.h"
				+
				+namespace Carbon {
				+
				+// A string literal found in source text. Instances are produced by `Lex`
				+// and hold views into the text that was lexed rather than copies of it.
				+class StringLiteralToken {
				+ public:
				+  // Get the text corresponding to this literal.
				+  auto Text() const -> llvm::StringRef { return text; }
				+
				+  // Determine whether this is a multi-line string literal.
				+  auto IsMultiLine() const -> bool { return multi_line; }
				+
				+  // Extract a string literal token from the given text, if it has a suitable
				+  // form.
				+  static auto Lex(llvm::StringRef source_text)
				+      -> llvm::Optional<StringLiteralToken>;
				+
				+  // The result of expanding escape sequences in a string literal.
				+  struct ExpandedValue {
				+    // The value of the literal after expansion.
				+    std::string result;
				+    // Whether any diagnostic was emitted while computing `result`.
				+    bool has_errors;
				+  };
				+
				+  // Expand any escape sequences in the given string literal and compute the
				+  // resulting value.
				+  auto ComputeValue(DiagnosticEmitter& emitter) const -> ExpandedValue;
				+
				+ private:
				+  StringLiteralToken(llvm::StringRef text, llvm::StringRef content,
				+                     int hash_level, bool multi_line)
				+      : text(text),
				+        content(content),
				+        hash_level(hash_level),
				+        multi_line(multi_line) {}
				+
				+  // The complete text of the string literal.
				+  llvm::StringRef text;
				+  // The content of the literal. For a multi-line literal, this begins
				+  // immediately after the newline following the file type indicator, and ends
				+  // at the start of the closing `"""`. Whitespace is not removed from
				+  // either end.
				+  llvm::StringRef content;
				+  // The number of `#`s preceding the opening `"` or `"""`.
				+  int hash_level;
				+  // Whether this was a multi-line string literal.
				+  bool multi_line;
				+};
				+
				+}  // namespace Carbon
				+
				+#endif  // LEXER_STRING_LITERAL_H_
			
--- a/lexer/string_literal_test.cpp
+++ b/lexer/string_literal_test.cpp
@@ -0,0 +1,257 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "lexer/string_literal.h"
			
 
				+
			
 
				+#include "diagnostics/diagnostic_emitter.h"
			
 
				+#include "gmock/gmock.h"
			
 
				+#include "gtest/gtest.h"
			
 
				+
			
 
				+namespace Carbon {
			
 
				+namespace {
			
 
				+
			
 
				+// Test fixture with helpers for lexing a string literal and computing its
				+// value.
				+struct StringLiteralTest : ::testing::Test {
				+  // Lexes `text` as a string literal. Lexing must succeed, and the token is
				+  // expected to cover all of `text`.
				+  auto Lex(llvm::StringRef text) -> StringLiteralToken {
				+    auto lexed = StringLiteralToken::Lex(text);
				+    assert(lexed);
				+    EXPECT_EQ(lexed->Text(), text);
				+    return *lexed;
				+  }
				+
				+  // Lexes `text` and computes the value of the resulting literal, using the
				+  // console diagnostic emitter for any diagnostics.
				+  auto Parse(llvm::StringRef text) -> StringLiteralToken::ExpandedValue {
				+    StringLiteralToken literal = Lex(text);
				+    return literal.ComputeValue(ConsoleDiagnosticEmitter());
				+  }
				+};
			
 
				+
			
 
				+// Checks where a string literal ends: each `valid` input must lex as one
				+// literal covering the whole input, and each `invalid` input must not lex
				+// as a string literal at all.
				+TEST_F(StringLiteralTest, StringLiteralBounds) {
				+  llvm::StringLiteral valid[] = {
				+      R"("")",
				+      R"("""
				+      """)",
				+      R"("""
				+      "foo"
				+      """)",
				+
				+      // Escaped terminators don't end the string.
				+      R"("\"")",
				+      R"("\\")",
				+      R"("\\\"")",
				+      R"("""
				+      \"""
				+      """)",
				+      R"("""
				+      "\""
				+      """)",
				+      R"("""
				+      ""\"
				+      """)",
				+      R"("""
				+      ""\
				+      """)",
				+      R"(#"""
				+      """\#n
				+      """#)",
				+
				+      // Only a matching number of '#'s terminates the string.
				+      R"(#""#)",
				+      R"(#"xyz"foo"#)",
				+      R"(##"xyz"#foo"##)",
				+      R"(#"\""#)",
				+
				+      // Escape sequences likewise require a matching number of '#'s.
				+      R"(#"\#"#"#)",
				+      R"(#"\"#)",
				+      R"(#"""
				+      \#"""#
				+      """#)",
				+
				+      // #"""# does not start a multiline string literal.
				+      R"(#"""#)",
				+      R"(##"""##)",
				+  };
				+
				+  for (llvm::StringLiteral test : valid) {
				+    llvm::Optional<StringLiteralToken> result = StringLiteralToken::Lex(test);
				+    EXPECT_TRUE(result.hasValue()) << test;
				+    if (result) {
				+      EXPECT_EQ(result->Text(), test);
				+    }
				+  }
				+
				+  // Inputs that start like a string literal but are never terminated.
				+  llvm::StringLiteral invalid[] = {
				+      R"(")",
				+      R"("""
				+      "")",
				+      R"("\)",
				+      R"("\")",
				+      R"("\\)",
				+      R"("\\\")",
				+      R"("""
				+      )",
				+      R"(#"""
				+      """)",
				+  };
				+
				+  for (llvm::StringLiteral test : invalid) {
				+    EXPECT_FALSE(StringLiteralToken::Lex(test).hasValue())
				+        << "`" << test << "`";
				+  }
				+}
			
 
				+
			
 
				+// Checks the value computed for well-formed string literals. Each test case
				+// pairs the source text of a literal with the value it should produce; no
				+// case is expected to report errors.
				+TEST_F(StringLiteralTest, StringLiteralContents) {
				+  // Expected values with embedded nul characters are spelled with
				+  // llvm::StringLiteral::withInnerNUL below.
				+  std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
				+      // Empty strings.
				+      {R"("")", ""},
				+
				+      {R"(
				+"""
				+"""
				+       )",
				+       ""},
				+
				+      // Nearly-empty strings.
				+      {R"(
				+"""
				+
				+"""
				+       )",
				+       "\n"},
				+
				+      // Indent removal.
				+      {R"(
				+       """file type indicator
				+          indented contents \
				+         """
				+       )",
				+       " indented contents "},
				+
				+      {R"(
				+    """
				+   hello
				+  world
				+
				+   end of test
				+  """
				+       )",
				+       " hello\nworld\n\n end of test\n"},
				+
				+      // Escape sequences.
				+      {R"(
				+       "\x14,\u{1234},\u{00000010},\n,\r,\t,\0,\",\',\\"
				+       )",
				+       llvm::StringLiteral::withInnerNUL(
				+           "\x14,\xE1\x88\xB4,\x10,\x0A,\x0D,\x09,\x00,\x22,\x27,\x5C")},
				+
				+      {R"(
				+       "\0A\x1234"
				+       )",
				+       llvm::StringLiteral::withInnerNUL("\0A\x12"
				+                                         "34")},
				+
				+      {R"(
				+       "\u{D7FF},\u{E000},\u{10FFFF}"
				+       )",
				+       "\xED\x9F\xBF,\xEE\x80\x80,\xF4\x8F\xBF\xBF"},
				+
				+      // Escape sequences in 'raw' strings.
				+      {R"(
				+       #"\#x00,\#xFF,\#u{56789},\#u{ABCD},\#u{00000000000000000EF}"#
				+       )",
				+       llvm::StringLiteral::withInnerNUL(
				+           "\x00,\xFF,\xF1\x96\x9E\x89,\xEA\xAF\x8D,\xC3\xAF")},
				+
				+      {R"(
				+       ##"\n,\#n,\##n,\##\##n,\##\###n"##
				+       )",
				+       "\\n,\\#n,\n,\\##n,\\###n"},
				+
				+      // Trailing whitespace handling.
				+      {"\"\"\"\n  Hello \\\n  World \t \n  Bye!  \\\n  \"\"\"",
				+       "Hello World\nBye!  "},
				+  };
				+
				+  for (auto [test, contents] : testcases) {
				+    // The raw strings above are padded with whitespace for readability, so
				+    // trim it before parsing.
				+    auto value = Parse(test.trim());
				+    EXPECT_FALSE(value.has_errors) << "`" << test << "`";
				+    EXPECT_EQ(value.result, contents);
				+  }
				+}
			
 
				+
			
 
				+// Checks multi-line literals with indentation problems. Every case must be
				+// reported as having errors, and must still produce the paired value.
				+TEST_F(StringLiteralTest, StringLiteralBadIndent) {
				+  std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
				+    // Indent doesn't match the last line.
				+    {"\"\"\"\n \tx\n  \"\"\"", "x\n"},
				+    {"\"\"\"\n x\n  \"\"\"", "x\n"},
				+    {"\"\"\"\n  x\n\t\"\"\"", "x\n"},
				+    {"\"\"\"\n  ok\n bad\n  \"\"\"", "ok\nbad\n"},
				+    {"\"\"\"\n bad\n  ok\n  \"\"\"", "bad\nok\n"},
				+    {"\"\"\"\n  escaped,\\\n bad\n  \"\"\"", "escaped,bad\n"},
				+
				+    // Indent on last line is followed by text.
				+    {"\"\"\"\n  x\n  x\"\"\"", "x\nx"},
				+    {"\"\"\"\n   x\n  x\"\"\"", " x\nx"},
				+    {"\"\"\"\n x\n  x\"\"\"", "x\nx"},
				+  };
				+
				+  for (auto [test, contents] : testcases) {
				+    auto value = Parse(test);
				+    EXPECT_TRUE(value.has_errors) << "`" << test << "`";
				+    EXPECT_EQ(value.result, contents);
				+  }
				+}
			
 
				+
			
 
				+// Checks that literals containing malformed escape sequences still lex, but
				+// are reported as having errors when their value is computed.
				+TEST_F(StringLiteralTest, StringLiteralBadEscapeSequence) {
				+  llvm::StringLiteral testcases[] = {
				+    // Escape sequences that are not recognized.
				+    R"("\a")",
				+    R"("\b")",
				+    R"("\e")",
				+    R"("\f")",
				+    R"("\v")",
				+    R"("\?")",
				+    R"("\1")",
				+    R"("\9")",
				+
				+    // \0 can't be followed by a decimal digit.
				+    R"("\01")",
				+    R"("\09")",
				+
				+    // \x requires two (uppercase) hexadecimal digits.
				+    R"("\x")",
				+    R"("\x0")",
				+    R"("\x0G")",
				+    R"("\xab")",
				+    R"("\x\n")",
				+    R"("\x\"")",
				+
				+    // \u requires a braced list of one or more hexadecimal digits.
				+    R"("\u")",
				+    R"("\u?")",
				+    R"("\u\"")",
				+    R"("\u{")",
				+    R"("\u{}")",
				+    R"("\u{A")",
				+    R"("\u{G}")",
				+    R"("\u{0000012323127z}")",
				+    R"("\u{-3}")",
				+
				+    // \u must specify a non-surrogate code point.
				+    R"("\u{110000}")",
				+    R"("\u{000000000000000000000000000000000110000}")",
				+    R"("\u{D800}")",
				+    R"("\u{DFFF}")",
				+  };
				+
				+  for (llvm::StringLiteral test : testcases) {
				+    auto value = Parse(test);
				+    EXPECT_TRUE(value.has_errors) << "`" << test << "`";
				+    // TODO: Test value produced by error recovery.
				+  }
				+}
			
 
				+
			
 
				+}  // namespace
			
 
				+}  // namespace Carbon
			
--- a/lexer/token_registry.def
+++ b/lexer/token_registry.def
@@ -71,7 +71,6 @@ CARBON_SYMBOL_TOKEN(Comma,               ",")
 
				 CARBON_SYMBOL_TOKEN(Equal,               "=")
			
 
				 CARBON_SYMBOL_TOKEN(Exclaim,             "!")
			
 
				 CARBON_SYMBOL_TOKEN(Greater,             ">")
			
 
				-CARBON_SYMBOL_TOKEN(Hash,                "#")
			
 
				 CARBON_SYMBOL_TOKEN(Less,                "<")
			
 
				 CARBON_SYMBOL_TOKEN(Minus,               "-")
			
 
				 CARBON_SYMBOL_TOKEN(Percent,             "%")
			
@@ -157,6 +156,7 @@ CARBON_KEYWORD_TOKEN(XorKeyword,        "xor")
 
				 CARBON_TOKEN(Identifier)
			
 
				 CARBON_TOKEN(IntegerLiteral)
			
 
				 CARBON_TOKEN(RealLiteral)
			
 
				+CARBON_TOKEN(StringLiteral)
			
 
				 CARBON_TOKEN(Error)
			
 
				 
			
 
				 #undef CARBON_TOKEN
			
--- a/lexer/tokenized_buffer.cpp
+++ b/lexer/tokenized_buffer.cpp
@@ -10,6 +10,7 @@
 
				 #include <string>
			
 
				 
			
 
				 #include "lexer/numeric_literal.h"
			
 
				+#include "lexer/string_literal.h"
			
 
				 #include "llvm/ADT/StringRef.h"
			
 
				 #include "llvm/ADT/StringSwitch.h"
			
 
				 #include "llvm/ADT/Twine.h"
			
@@ -104,6 +105,18 @@ class TokenizedBuffer::Lexer {
 
				     explicit operator bool() const { return formed_token; }
			
 
				   };
			
 
				 
			
 
				+  // Finishes the line being lexed at the newline found at `current_column`
				+  // and starts tracking the line that follows it.
				+  auto HandleNewline() -> void {
				+    // The finished line's length does not include the newline itself.
				+    current_line_info->length = current_column;
				+
				+    // The following line starts one character past the newline.
				+    const auto next_line_start = current_line_info->start + current_column + 1;
				+    current_line = buffer.AddLine({next_line_start, 0, 0});
				+    current_line_info = &buffer.GetLineInfo(current_line);
				+
				+    // Nothing has been seen on the new line yet: restart the column count
				+    // and mark its indentation as not yet recorded.
				+    current_column = 0;
				+    set_indent = false;
				+  }
			
 
				+
			
 
				   auto SkipWhitespace(llvm::StringRef& source_text) -> bool {
			
 
				     while (!source_text.empty()) {
			
 
				       // We only support line-oriented commenting and lex comments as-if they
			
@@ -136,21 +149,16 @@ class TokenizedBuffer::Lexer {
 
				           return true;
			
 
				 
			
 
				         case '\n':
			
 
				-          // New lines are special in order to track line structure.
			
 
				-          current_line_info->length = current_column;
			
 
				           // If this is the last character in the source, directly return here
			
 
				           // to avoid creating an empty line.
			
 
				           source_text = source_text.drop_front();
			
 
				           if (source_text.empty()) {
			
 
				+            current_line_info->length = current_column;
			
 
				             return false;
			
 
				           }
			
 
				 
			
 
				           // Otherwise, add a line and set up to continue lexing.
			
 
				-          current_line = buffer.AddLine(
			
 
				-              {current_line_info->start + current_column + 1, 0, 0});
			
 
				-          current_line_info = &buffer.GetLineInfo(current_line);
			
 
				-          current_column = 0;
			
 
				-          set_indent = false;
			
 
				+          HandleNewline();
			
 
				           continue;
			
 
				 
			
 
				         case ' ':
			
@@ -231,6 +239,53 @@ class TokenizedBuffer::Lexer {
 
				     }
			
 
				   }
			
 
				 
			
 
				+  // Tries to lex a string literal at the start of `source_text`.
				+  //
				+  // Returns `NoMatch` when `StringLiteralToken::Lex` finds no literal.
				+  // Otherwise drops the literal's text from `source_text`, advances the line
				+  // and column tracking past it, stores its computed value in the buffer,
				+  // and returns the new `StringLiteral` token.
				+  auto LexStringLiteral(llvm::StringRef& source_text) -> LexResult {
				+    llvm::Optional<StringLiteralToken> literal =
				+        StringLiteralToken::Lex(source_text);
				+    if (!literal) {
				+      return LexResult::NoMatch();
				+    }
				+
				+    // The token is reported where the literal starts, even when the literal
				+    // continues onto later lines.
				+    const Line first_line = current_line;
				+    const int first_column = current_column;
				+    const int consumed = literal->Text().size();
				+    source_text = source_text.drop_front(consumed);
				+
				+    if (!set_indent) {
				+      current_line_info->indent = first_column;
				+      set_indent = true;
				+    }
				+
				+    // Advance the tracked position past the literal.
				+    if (literal->IsMultiLine()) {
				+      // Walk the text so that every embedded newline starts a new line.
				+      for (char c : literal->Text()) {
				+        if (c != '\n') {
				+          ++current_column;
				+          continue;
				+        }
				+        HandleNewline();
				+        // Each line started inside a multi-line string literal records the
				+        // literal's starting column as its indentation.
				+        current_line_info->indent = first_column;
				+        set_indent = true;
				+      }
				+    } else {
				+      current_column += consumed;
				+    }
				+
				+    // Compute the literal's value; errors found while doing so mark the
				+    // whole buffer as having errors.
				+    auto expanded = literal->ComputeValue(emitter);
				+    buffer.has_errors |= expanded.has_errors;
				+
				+    auto token = buffer.AddToken({.kind = TokenKind::StringLiteral(),
				+                                  .token_line = first_line,
				+                                  .column = first_column});
				+    // The value is appended to the string storage, so its index is the
				+    // storage's size before the append.
				+    buffer.GetTokenInfo(token).literal_index =
				+        buffer.literal_string_storage.size();
				+    buffer.literal_string_storage.push_back(std::move(expanded.result));
				+    return token;
				+  }
			
 
				+
			
 
				   auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
			
 
				     TokenKind kind = llvm::StringSwitch<TokenKind>(source_text)
			
 
				 #define CARBON_SYMBOL_TOKEN(Name, Spelling) \
			
@@ -420,6 +475,9 @@ auto TokenizedBuffer::Lex(SourceBuffer& source, DiagnosticEmitter& emitter)
 
				     if (!result) {
			
 
				       result = lexer.LexNumericLiteral(source_text);
			
 
				     }
			
 
				+    if (!result) {
			
 
				+      result = lexer.LexStringLiteral(source_text);
			
 
				+    }
			
 
				     if (!result) {
			
 
				       result = lexer.LexError(source_text);
			
 
				     }
			
@@ -471,6 +529,17 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
 
				     return relexed_token->Text();
			
 
				   }
			
 
				 
			
 
				+  // Refer back to the source text to find the original spelling, including
			
 
				+  // escape sequences etc.
			
 
				+  if (token_info.kind == TokenKind::StringLiteral()) {
			
 
				+    auto& line_info = GetLineInfo(token_info.token_line);
			
 
				+    int64_t token_start = line_info.start + token_info.column;
			
 
				+    llvm::Optional<StringLiteralToken> relexed_token =
			
 
				+        StringLiteralToken::Lex(source->Text().substr(token_start));
			
 
				+    assert(relexed_token && "Could not reform string literal token.");
			
 
				+    return relexed_token->Text();
			
 
				+  }
			
 
				+
			
 
				   assert(token_info.kind == TokenKind::Identifier() &&
			
 
				          "Only identifiers have stored text!");
			
 
				   return GetIdentifierText(token_info.id);
			
@@ -507,6 +576,13 @@ auto TokenizedBuffer::GetRealLiteral(Token token) const -> RealLiteralValue {
 
				   return RealLiteralValue(this, token_info.literal_index, is_decimal);
			
 
				 }
			
 
				 
			
 
				+// Returns the value stored in `literal_string_storage` for `token`.
				+// Asserts that `token` is a `StringLiteral` token.
				+auto TokenizedBuffer::GetStringLiteral(Token token) const -> llvm::StringRef {
				+  const auto& info = GetTokenInfo(token);
				+  assert(info.kind == TokenKind::StringLiteral() &&
				+         "The token must be a string literal!");
				+  return literal_string_storage[info.literal_index];
				+}
			
 
				+
			
 
				 auto TokenizedBuffer::GetMatchedClosingToken(Token opening_token) const
			
 
				     -> Token {
			
 
				   auto& opening_token_info = GetTokenInfo(opening_token);
			
@@ -624,7 +700,10 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream, Token token,
 
				     output_stream << ", closing_token: " << GetMatchedClosingToken(token).index;
			
 
				   } else if (token_info.kind.IsClosingSymbol()) {
			
 
				     output_stream << ", opening_token: " << GetMatchedOpeningToken(token).index;
			
 
				+  } else if (token_info.kind == TokenKind::StringLiteral()) {
			
 
				+    output_stream << ", value: `" << GetStringLiteral(token) << "`";
			
 
				   }
			
 
				+  // TODO: Include value for numeric literals.
			
 
				 
			
 
				   if (token_info.is_recovery) {
			
 
				     output_stream << ", recovery: true";
			
--- a/lexer/tokenized_buffer.h
+++ b/lexer/tokenized_buffer.h
@@ -263,6 +263,9 @@ class TokenizedBuffer {
 
				   // Returns the value of an `RealLiteral()` token.
			
 
				   [[nodiscard]] auto GetRealLiteral(Token token) const -> RealLiteralValue;
			
 
				 
			
 
				+  // Returns the value of a `StringLiteral()` token.
				+  //
				+  // The given token must be a string literal token.
				+  [[nodiscard]] auto GetStringLiteral(Token token) const -> llvm::StringRef;
			
 
				+
			
 
				   // Returns the closing token matched with the given opening token.
			
 
				   //
			
 
				   // The given token must be an opening token kind.
			
@@ -402,6 +405,8 @@ class TokenizedBuffer {
 
				   // Storage for integers that form part of the value of a numeric literal.
			
 
				   llvm::SmallVector<llvm::APInt, 16> literal_int_storage;
			
 
				 
			
 
				+  // Storage for the computed values of string literals, indexed by the
			
 
				+  // `literal_index` of the corresponding token.
			
 
				+  llvm::SmallVector<std::string, 16> literal_string_storage;
			
 
				+
			
 
				   llvm::DenseMap<llvm::StringRef, Identifier> identifier_map;
			
 
				 
			
 
				   bool has_errors = false;
			
--- a/lexer/tokenized_buffer_test.cpp
+++ b/lexer/tokenized_buffer_test.cpp
@@ -51,7 +51,7 @@ TEST_F(LexerTest, HandlesEmptyBuffer) {
 
				 }
			
 
				 
			
 
				 TEST_F(LexerTest, TracksLinesAndColumns) {
			
 
				-  auto buffer = Lex("\n  ;;\n   ;;;\n");
			
 
				+  auto buffer = Lex("\n  ;;\n   ;;;\n   x\"foo\" \"\"\"baz\n  a\n \"\"\" y");
			
 
				   EXPECT_FALSE(buffer.HasErrors());
			
 
				   EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				                           {.kind = TokenKind::Semi(),
			
@@ -74,6 +74,24 @@ TEST_F(LexerTest, TracksLinesAndColumns) {
 
				                            .line = 3,
			
 
				                            .column = 6,
			
 
				                            .indent_column = 4},
			
 
				+                          {.kind = TokenKind::Identifier(),
			
 
				+                           .line = 4,
			
 
				+                           .column = 4,
			
 
				+                           .indent_column = 4,
			
 
				+                           .text = "x"},
			
 
				+                          {.kind = TokenKind::StringLiteral(),
			
 
				+                           .line = 4,
			
 
				+                           .column = 5,
			
 
				+                           .indent_column = 4},
			
 
				+                          {.kind = TokenKind::StringLiteral(),
			
 
				+                           .line = 4,
			
 
				+                           .column = 11,
			
 
				+                           .indent_column = 4},
			
 
				+                          {.kind = TokenKind::Identifier(),
			
 
				+                           .line = 6,
			
 
				+                           .column = 6,
			
 
				+                           .indent_column = 11,
			
 
				+                           .text = "y"},
			
 
				                       }));
			
 
				 }
			
 
				 
			
@@ -250,7 +268,7 @@ TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
 
				 }
			
 
				 
			
 
				 TEST_F(LexerTest, HandlesGarbageCharacters) {
			
 
				-  constexpr char GarbageText[] = "$$💩-$\n$\0$12$";
			
 
				+  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\"\n\"\\";
			
 
				   auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
			
 
				   EXPECT_TRUE(buffer.HasErrors());
			
 
				   EXPECT_THAT(
			
@@ -273,6 +291,20 @@ TEST_F(LexerTest, HandlesGarbageCharacters) {
 
				            .column = 4,
			
 
				            .text = "12"},
			
 
				           {.kind = TokenKind::Error(), .line = 2, .column = 6, .text = "$"},
			
 
				+          // newline
			
 
				+          {.kind = TokenKind::Error(),
			
 
				+           .line = 3,
			
 
				+           .column = 1,
			
 
				+           .text = llvm::StringRef("\"", 1)},
			
 
				+          // newline
			
 
				+          {.kind = TokenKind::Error(),
			
 
				+           .line = 4,
			
 
				+           .column = 1,
			
 
				+           .text = llvm::StringRef("\"", 1)},
			
 
				+          {.kind = TokenKind::Backslash(),
			
 
				+           .line = 4,
			
 
				+           .column = 2,
			
 
				+           .text = llvm::StringRef("\\", 1)},
			
 
				       }));
			
 
				 }
			
 
				 
			
@@ -302,13 +334,12 @@ TEST_F(LexerTest, Symbols) {
 
				                           {TokenKind::Greater()},
			
 
				                       }));
			
 
				 
			
 
				-  buffer = Lex("\\/?#@&^!");
			
 
				+  buffer = Lex("\\/?@&^!");
			
 
				   EXPECT_FALSE(buffer.HasErrors());
			
 
				   EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				                           {TokenKind::Backslash()},
			
 
				                           {TokenKind::Slash()},
			
 
				                           {TokenKind::Question()},
			
 
				-                          {TokenKind::Hash()},
			
 
				                           {TokenKind::At()},
			
 
				                           {TokenKind::Amp()},
			
 
				                           {TokenKind::Caret()},
			
@@ -614,6 +645,111 @@ TEST_F(LexerTest, Identifiers) {
 
				                       }));
			
 
				 }
			
 
				 
			
 
				+// Lexes a buffer containing single-line, multi-line and `#`-delimited string
			
 
				+// literals and checks the line, column, indentation and computed contents
			
 
				+// of every resulting token.
			
 
				+TEST_F(LexerTest, StringLiterals) {
			
 
				+  // The line and column numbers expected below depend on the exact
			
 
				+  // whitespace inside this text.
			
 
				+  llvm::StringLiteral testcase = R"(
			
 
				+    "hello world\n"
			
 
				+
			
 
				+    """foo
			
 
				+      test \
			
 
				+      \xAB
			
 
				+     """ trailing
			
 
				+
			
 
				+      #"""#
			
 
				+
			
 
				+    "\0"
			
 
				+
			
 
				+    #"\0"foo"\1"#
			
 
				+
			
 
				+    """x"""
			
 
				+  )";
			
 
				+
			
 
				+  auto buffer = Lex(testcase);
			
 
				+  EXPECT_FALSE(buffer.HasErrors());
			
 
				+  EXPECT_THAT(buffer,
			
 
				+              HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 2,
			
 
				+                   .column = 5,
			
 
				+                   .indent_column = 5,
			
 
				+                   .string_contents = {"hello world\n"}},
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 4,
			
 
				+                   .column = 5,
			
 
				+                   .indent_column = 5,
			
 
				+                   .string_contents = {" test  \xAB\n"}},
			
 
				+                  {.kind = TokenKind::Identifier(),
			
 
				+                   .line = 7,
			
 
				+                   .column = 10,
			
 
				+                   .indent_column = 5,
			
 
				+                   .text = "trailing"},
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 9,
			
 
				+                   .column = 7,
			
 
				+                   .indent_column = 7,
			
 
				+                   .string_contents = {"\""}},
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 11,
			
 
				+                   .column = 5,
			
 
				+                   .indent_column = 5,
			
 
				+                   .string_contents = llvm::StringLiteral::withInnerNUL("\0")},
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 13,
			
 
				+                   .column = 5,
			
 
				+                   .indent_column = 5,
			
 
				+                   .string_contents = {"\\0\"foo\"\\1"}},
			
 
				+
			
 
				+                  // """x""" is three string literals, not one.
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 15,
			
 
				+                   .column = 5,
			
 
				+                   .indent_column = 5,
			
 
				+                   .string_contents = {""}},
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 15,
			
 
				+                   .column = 7,
			
 
				+                   .indent_column = 5,
			
 
				+                   .string_contents = {"x"}},
			
 
				+                  {.kind = TokenKind::StringLiteral(),
			
 
				+                   .line = 15,
			
 
				+                   .column = 10,
			
 
				+                   .indent_column = 5,
			
 
				+                   .string_contents = {""}},
			
 
				+              }));
			
 
				+}
			
 
				+
			
 
				+// Checks that each malformed string literal below makes the lexer report
			
 
				+// errors and produce at least one `Error` token.
			
 
				+TEST_F(LexerTest, InvalidStringLiterals) {
			
 
				+  llvm::StringLiteral invalid[] = {
			
 
				+      R"(")",
			
 
				+      R"("""
			
 
				+      "")",
			
 
				+      R"("\)",
			
 
				+      R"("\")",
			
 
				+      R"("\\)",
			
 
				+      R"("\\\")",
			
 
				+      R"(""")",
			
 
				+      R"("""
			
 
				+      )",
			
 
				+      R"("""\)",
			
 
				+      R"(#"""
			
 
				+      """)",
			
 
				+  };
			
 
				+
			
 
				+  // Each input is lexed as a separate buffer.
			
 
				+  for (llvm::StringLiteral test : invalid) {
			
 
				+    auto buffer = Lex(test);
			
 
				+    EXPECT_TRUE(buffer.HasErrors()) << "`" << test << "`";
			
 
				+
			
 
				+    // We should have formed at least one error token.
			
 
				+    bool found_error = false;
			
 
				+    for (TokenizedBuffer::Token token : buffer.Tokens()) {
			
 
				+      if (buffer.GetKind(token) == TokenKind::Error()) {
			
 
				+        found_error = true;
			
 
				+        break;
			
 
				+      }
			
 
				+    }
			
 
				+    EXPECT_TRUE(found_error) << "`" << test << "`";
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				 auto GetAndDropLine(llvm::StringRef& text) -> std::string {
			
 
				   auto newline_offset = text.find_first_of('\n');
			
 
				   llvm::StringRef line = text.slice(0, newline_offset);
			
--- a/lexer/tokenized_buffer_test_helpers.h
+++ b/lexer/tokenized_buffer_test_helpers.h
@@ -26,9 +26,9 @@ namespace Testing {
 
				 struct ExpectedToken {
			
 
				   friend auto operator<<(std::ostream& output, const ExpectedToken& expected)
			
 
				       -> std::ostream& {
			
 
				-    output << "\ntoken: { kind: '" << expected.kind.Name().str();
			
 
				+    output << "\ntoken: { kind: '" << expected.kind.Name().str() << "'";
			
 
				     if (expected.line != -1) {
			
 
				-      output << "', line: " << expected.line;
			
 
				+      output << ", line: " << expected.line;
			
 
				     }
			
 
				     if (expected.column != -1) {
			
 
				       output << ", column " << expected.column;
			
@@ -39,6 +39,10 @@ struct ExpectedToken {
 
				     if (!expected.text.empty()) {
			
 
				       output << ", spelling: '" << expected.text.str() << "'";
			
 
				     }
			
 
				+    if (expected.string_contents) {
			
 
				+      output << ", string contents: '" << expected.string_contents->str()
			
 
				+             << "'";
			
 
				+    }
			
 
				     if (expected.recovery) {
			
 
				       output << ", recovery: true";
			
 
				     }
			
@@ -52,6 +56,7 @@ struct ExpectedToken {
 
				   int indent_column = -1;
			
 
				   bool recovery = false;
			
 
				   llvm::StringRef text = "";
			
 
				+  llvm::Optional<llvm::StringRef> string_contents = llvm::None;
			
 
				 };
			
 
				 
			
 
				 // TODO: Consider rewriting this into a `TokenEq` matcher which is used inside
			
@@ -123,6 +128,18 @@ MATCHER_P(HasTokens, raw_all_expected, "") {
 
				                        << expected.text.str() << "`.";
			
 
				       matches = false;
			
 
				     }
			
 
				+
			
 
				+    assert(!expected.string_contents ||
			
 
				+           expected.kind == TokenKind::StringLiteral());
			
 
				+    if (expected.string_contents && actual_kind == TokenKind::StringLiteral()) {
			
 
				+      llvm::StringRef actual_contents = buffer.GetStringLiteral(token);
			
 
				+      if (actual_contents != *expected.string_contents) {
			
 
				+        *result_listener << "\nToken " << index << " has contents `"
			
 
				+                         << actual_contents.str() << "`, expected `"
			
 
				+                         << expected.string_contents->str() << "`.";
			
 
				+        matches = false;
			
 
				+      }
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   int actual_size = buffer.Tokens().end() - buffer.Tokens().begin();