hace 4 años · 0a8c0dc271
--- a/toolchain/lexer/BUILD
+++ b/toolchain/lexer/BUILD
@@ -142,6 +142,7 @@ cc_fuzz_test(
 
				     corpus = glob(["fuzzer_corpus/string_literal/*"]),
			
 
				     deps = [
			
 
				         ":string_literal",
			
 
				+        "//common:check",
			
 
				         "//toolchain/diagnostics:diagnostic_emitter",
			
 
				         "//toolchain/diagnostics:null_diagnostics",
			
 
				         "@llvm-project//llvm:Support",
			
--- a/toolchain/lexer/string_literal.cpp
+++ b/toolchain/lexer/string_literal.cpp
@@ -138,6 +138,8 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
 
				   terminator.resize(terminator.size() + hash_level, '#');
			
 
				   escape.resize(escape.size() + hash_level, '#');
			
 
				 
			
 
				+  // TODO: Detect indent / dedent for multi-line string literals in order to
			
 
				+  // stop parsing on dedent before a terminator is found.
			
 
				   for (; cursor < source_text_size; ++cursor) {
			
 
				     // This switch and loop structure relies on multi-character terminators and
			
 
				     // escape sequences starting with a predictable character and not containing
			
@@ -152,13 +154,19 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
 
				           // should stop here.
			
 
				           if (cursor >= source_text_size ||
			
 
				               (!multi_line && source_text[cursor] == '\n')) {
			
 
				-            return llvm::None;
			
 
				+            llvm::StringRef text = source_text.take_front(cursor);
			
 
				+            return LexedStringLiteral(text, text.drop_front(prefix_len),
			
 
				+                                      hash_level, multi_line,
			
 
				+                                      /*is_terminated=*/false);
			
 
				           }
			
 
				         }
			
 
				         break;
			
 
				       case '\n':
			
 
				         if (!multi_line) {
			
 
				-          return llvm::None;
			
 
				+          llvm::StringRef text = source_text.take_front(cursor);
			
 
				+          return LexedStringLiteral(text, text.drop_front(prefix_len),
			
 
				+                                    hash_level, multi_line,
			
 
				+                                    /*is_terminated=*/false);
			
 
				         }
			
 
				         break;
			
 
				       case '\"': {
			
@@ -168,15 +176,17 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
 
				               source_text.substr(0, cursor + terminator.size());
			
 
				           llvm::StringRef content =
			
 
				               source_text.substr(prefix_len, cursor - prefix_len);
			
 
				-          return LexedStringLiteral(text, content, hash_level, multi_line);
			
 
				+          return LexedStringLiteral(text, content, hash_level, multi_line,
			
 
				+                                    /*is_terminated=*/true);
			
 
				         }
			
 
				         break;
			
 
				       }
			
 
				     }
			
 
				   }
			
 
				-  // Let LexError figure out how to recover from an unterminated string
			
 
				-  // literal.
			
 
				-  return llvm::None;
			
 
				+  // No terminator was found.
			
 
				+  return LexedStringLiteral(source_text, source_text.drop_front(prefix_len),
			
 
				+                            hash_level, multi_line,
			
 
				+                            /*is_terminated=*/false);
			
 
				 }
			
 
				 
			
 
				 // Given a string that contains at least one newline, find the indent (the
			
@@ -407,6 +417,9 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
 
				 
			
 
				 auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
			
 
				     -> std::string {
			
 
				+  if (!is_terminated_) {
			
 
				+    return "";
			
 
				+  }
			
 
				   llvm::StringRef indent =
			
 
				       multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
			
 
				   return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,
			
--- a/toolchain/lexer/string_literal.h
+++ b/toolchain/lexer/string_literal.h
@@ -13,28 +13,34 @@ namespace Carbon {
 
				 class LexedStringLiteral {
			
 
				  public:
			
 
				   // Extract a string literal token from the given text, if it has a suitable
			
 
				-  // form.
			
 
				+  // form. Returning llvm::None indicates no string literal was found; returning
			
 
				+  // an invalid literal indicates a string prefix was found, but it's malformed
			
 
				+  // and is returning a partial string literal to assist error construction.
			
 
				   static auto Lex(llvm::StringRef source_text)
			
 
				       -> llvm::Optional<LexedStringLiteral>;
			
 
				 
			
 
				-  // Get the text corresponding to this literal.
			
 
				-  [[nodiscard]] auto Text() const -> llvm::StringRef { return text_; }
			
 
				-
			
 
				-  // Determine whether this is a multi-line string literal.
			
 
				-  [[nodiscard]] auto IsMultiLine() const -> bool { return multi_line_; }
			
 
				-
			
 
				   // Expand any escape sequences in the given string literal and compute the
			
 
				   // resulting value. This handles error recovery internally and cannot fail.
			
 
				   auto ComputeValue(DiagnosticEmitter<const char*>& emitter) const
			
 
				       -> std::string;
			
 
				 
			
 
				+  // Get the text corresponding to this literal.
			
 
				+  [[nodiscard]] auto text() const -> llvm::StringRef { return text_; }
			
 
				+
			
 
				+  // Determine whether this is a multi-line string literal.
			
 
				+  [[nodiscard]] auto is_multi_line() const -> bool { return multi_line_; }
			
 
				+
			
 
				+  // Returns true if the string has a valid terminator.
			
 
				+  [[nodiscard]] auto is_terminated() const -> bool { return is_terminated_; }
			
 
				+
			
 
				  private:
			
 
				   LexedStringLiteral(llvm::StringRef text, llvm::StringRef content,
			
 
				-                     int hash_level, bool multi_line)
			
 
				+                     int hash_level, bool multi_line, bool is_terminated)
			
 
				       : text_(text),
			
 
				         content_(content),
			
 
				         hash_level_(hash_level),
			
 
				-        multi_line_(multi_line) {}
			
 
				+        multi_line_(multi_line),
			
 
				+        is_terminated_(is_terminated) {}
			
 
				 
			
 
				   // The complete text of the string literal.
			
 
				   llvm::StringRef text_;
			
@@ -47,6 +53,8 @@ class LexedStringLiteral {
 
				   int hash_level_;
			
 
				   // Whether this was a multi-line string literal.
			
 
				   bool multi_line_;
			
 
				+  // Whether the literal is valid, or should only be used for errors.
			
 
				+  bool is_terminated_;
			
 
				 };
			
 
				 
			
 
				 }  // namespace Carbon
			
--- a/toolchain/lexer/string_literal_fuzzer.cpp
+++ b/toolchain/lexer/string_literal_fuzzer.cpp
@@ -5,6 +5,7 @@
 
				 #include <cstdint>
			
 
				 #include <cstring>
			
 
				 
			
 
				+#include "common/check.h"
			
 
				 #include "llvm/ADT/StringRef.h"
			
 
				 #include "toolchain/diagnostics/diagnostic_emitter.h"
			
 
				 #include "toolchain/diagnostics/null_diagnostics.h"
			
@@ -22,11 +23,18 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
 
				     return 0;
			
 
				   }
			
 
				 
			
 
				-  // Check multiline flag was computed correctly.
			
 
				-  if (token->IsMultiLine() != token->Text().contains('\n')) {
			
 
				-    __builtin_trap();
			
 
				+  if (!token->is_terminated()) {
			
 
				+    // Found errors while parsing.
			
 
				+    return 0;
			
 
				   }
			
 
				 
			
 
				+  fprintf(stderr, "valid: %d\n", token->is_terminated());
			
 
				+  fprintf(stderr, "size: %lu\n", token->text().size());
			
 
				+  fprintf(stderr, "text: %s\n", token->text().str().c_str());
			
 
				+
			
 
				+  // Check multiline flag was computed correctly.
			
 
				+  CHECK(token->is_multi_line() == token->text().contains('\n'));
			
 
				+
			
 
				   volatile auto value =
			
 
				       token->ComputeValue(NullDiagnosticEmitter<const char*>());
			
 
				   (void)value;
			
--- a/toolchain/lexer/string_literal_test.cpp
+++ b/toolchain/lexer/string_literal_test.cpp
@@ -22,7 +22,7 @@ class StringLiteralTest : public ::testing::Test {
 
				   auto Lex(llvm::StringRef text) -> LexedStringLiteral {
			
 
				     llvm::Optional<LexedStringLiteral> result = LexedStringLiteral::Lex(text);
			
 
				     CHECK(result);
			
 
				-    EXPECT_EQ(result->Text(), text);
			
 
				+    EXPECT_EQ(result->text(), text);
			
 
				     return *result;
			
 
				   }
			
 
				 
			
@@ -84,20 +84,22 @@ TEST_F(StringLiteralTest, StringLiteralBounds) {
 
				   };
			
 
				 
			
 
				   for (llvm::StringLiteral test : valid) {
			
 
				+    SCOPED_TRACE(test);
			
 
				     llvm::Optional<LexedStringLiteral> result = LexedStringLiteral::Lex(test);
			
 
				-    EXPECT_TRUE(result.hasValue()) << test;
			
 
				+    EXPECT_TRUE(result.hasValue());
			
 
				     if (result) {
			
 
				-      EXPECT_EQ(result->Text(), test);
			
 
				+      EXPECT_EQ(result->text(), test);
			
 
				     }
			
 
				   }
			
 
				 
			
 
				   llvm::StringLiteral invalid[] = {
			
 
				+      // clang-format off
			
 
				       R"(")",
			
 
				       R"("""
			
 
				       "")",
			
 
				-      R"("\)",  //
			
 
				+      R"("\)",
			
 
				       R"("\")",
			
 
				-      R"("\\)",  //
			
 
				+      R"("\\)",
			
 
				       R"("\\\")",
			
 
				       R"("""
			
 
				       )",
			
@@ -105,11 +107,16 @@ TEST_F(StringLiteralTest, StringLiteralBounds) {
 
				       """)",
			
 
				       R"(" \
			
 
				       ")",
			
 
				+      // clang-format on
			
 
				   };
			
 
				 
			
 
				   for (llvm::StringLiteral test : invalid) {
			
 
				-    EXPECT_FALSE(LexedStringLiteral::Lex(test).hasValue())
			
 
				-        << "`" << test << "`";
			
 
				+    SCOPED_TRACE(test);
			
 
				+    llvm::Optional<LexedStringLiteral> result = LexedStringLiteral::Lex(test);
			
 
				+    EXPECT_TRUE(result.hasValue());
			
 
				+    if (result) {
			
 
				+      EXPECT_FALSE(result->is_terminated());
			
 
				+    }
			
 
				   }
			
 
				 }
			
 
				 
			
--- a/toolchain/lexer/tokenized_buffer.cpp
+++ b/toolchain/lexer/tokenized_buffer.cpp
@@ -57,6 +57,12 @@ struct UnrecognizedCharacters : DiagnosticBase<UnrecognizedCharacters> {
 
				       "Encountered unrecognized characters while parsing.";
			
 
				 };
			
 
				 
			
 
				+struct UnterminatedString : DiagnosticBase<UnterminatedString> {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-string-terminator";
			
 
				+  static constexpr llvm::StringLiteral Message =
			
 
				+      "String is missing a terminator.";
			
 
				+};
			
 
				+
			
 
				 // TODO: Move Overload and VariantMatch somewhere more central.
			
 
				 
			
 
				 // Form an overload set from a list of functions. For example:
			
@@ -267,7 +273,7 @@ class TokenizedBuffer::Lexer {
 
				 
			
 
				     Line string_line = current_line_;
			
 
				     int string_column = current_column_;
			
 
				-    int literal_size = literal->Text().size();
			
 
				+    int literal_size = literal->text().size();
			
 
				     source_text = source_text.drop_front(literal_size);
			
 
				 
			
 
				     if (!set_indent_) {
			
@@ -276,10 +282,10 @@ class TokenizedBuffer::Lexer {
 
				     }
			
 
				 
			
 
				     // Update line and column information.
			
 
				-    if (!literal->IsMultiLine()) {
			
 
				+    if (!literal->is_multi_line()) {
			
 
				       current_column_ += literal_size;
			
 
				     } else {
			
 
				-      for (char c : literal->Text()) {
			
 
				+      for (char c : literal->text()) {
			
 
				         if (c == '\n') {
			
 
				           HandleNewline();
			
 
				           // The indentation of all lines in a multi-line string literal is
			
@@ -292,13 +298,23 @@ class TokenizedBuffer::Lexer {
 
				       }
			
 
				     }
			
 
				 
			
 
				-    auto token = buffer_.AddToken({.kind = TokenKind::StringLiteral(),
			
 
				-                                   .token_line = string_line,
			
 
				-                                   .column = string_column});
			
 
				-    buffer_.GetTokenInfo(token).literal_index =
			
 
				-        buffer_.literal_string_storage_.size();
			
 
				-    buffer_.literal_string_storage_.push_back(literal->ComputeValue(emitter_));
			
 
				-    return token;
			
 
				+    if (literal->is_terminated()) {
			
 
				+      auto token =
			
 
				+          buffer_.AddToken({.kind = TokenKind::StringLiteral(),
			
 
				+                            .token_line = string_line,
			
 
				+                            .column = string_column,
			
 
				+                            .literal_index = static_cast<int32_t>(
			
 
				+                                buffer_.literal_string_storage_.size())});
			
 
				+      buffer_.literal_string_storage_.push_back(
			
 
				+          literal->ComputeValue(emitter_));
			
 
				+      return token;
			
 
				+    } else {
			
 
				+      emitter_.EmitError<UnterminatedString>(literal->text().begin());
			
 
				+      return buffer_.AddToken({.kind = TokenKind::Error(),
			
 
				+                               .token_line = string_line,
			
 
				+                               .column = string_column,
			
 
				+                               .error_length = literal_size});
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
			
@@ -506,8 +522,6 @@ class TokenizedBuffer::Lexer {
 
				       error_text = source_text.take_front(1);
			
 
				     }
			
 
				 
			
 
				-    // Longer errors get to be two tokens.
			
 
				-    error_text = error_text.substr(0, std::numeric_limits<int32_t>::max());
			
 
				     auto token = buffer_.AddToken(
			
 
				         {.kind = TokenKind::Error(),
			
 
				          .token_line = current_line_,
			
@@ -632,7 +646,7 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
 
				     llvm::Optional<LexedStringLiteral> relexed_token =
			
 
				         LexedStringLiteral::Lex(source_->Text().substr(token_start));
			
 
				     CHECK(relexed_token) << "Could not reform string literal token.";
			
 
				-    return relexed_token->Text();
			
 
				+    return relexed_token->text();
			
 
				   }
			
 
				 
			
 
				   // Refer back to the source text to avoid needing to reconstruct the
			
--- a/toolchain/lexer/tokenized_buffer_test.cpp
+++ b/toolchain/lexer/tokenized_buffer_test.cpp
@@ -280,7 +280,7 @@ TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
 
				 }
			
 
				 
			
 
				 TEST_F(LexerTest, HandlesGarbageCharacters) {
			
 
				-  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\"\n\"\\";
			
 
				+  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\\\"\\\n\"x";
			
 
				   auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
			
 
				   EXPECT_TRUE(buffer.HasErrors());
			
 
				   EXPECT_THAT(
			
@@ -289,8 +289,8 @@ TEST_F(LexerTest, HandlesGarbageCharacters) {
 
				           {.kind = TokenKind::Error(),
			
 
				            .line = 1,
			
 
				            .column = 1,
			
 
				+           // 💩 takes 4 bytes, and we count column as bytes offset.
			
 
				            .text = llvm::StringRef("$$💩", 6)},
			
 
				-          // 💩 takes 4 bytes, and we count column as bytes offset.
			
 
				           {.kind = TokenKind::Minus(), .line = 1, .column = 7},
			
 
				           {.kind = TokenKind::Error(), .line = 1, .column = 8, .text = "$"},
			
 
				           // newline
			
@@ -304,19 +304,13 @@ TEST_F(LexerTest, HandlesGarbageCharacters) {
 
				            .text = "12"},
			
 
				           {.kind = TokenKind::Error(), .line = 2, .column = 6, .text = "$"},
			
 
				           // newline
			
 
				-          {.kind = TokenKind::Error(),
			
 
				+          {.kind = TokenKind::Backslash(),
			
 
				            .line = 3,
			
 
				            .column = 1,
			
 
				-           .text = llvm::StringRef("\"", 1)},
			
 
				+           .text = "\\"},
			
 
				+          {.kind = TokenKind::Error(), .line = 3, .column = 2, .text = "\"\\"},
			
 
				           // newline
			
 
				-          {.kind = TokenKind::Error(),
			
 
				-           .line = 4,
			
 
				-           .column = 1,
			
 
				-           .text = llvm::StringRef("\"", 1)},
			
 
				-          {.kind = TokenKind::Backslash(),
			
 
				-           .line = 4,
			
 
				-           .column = 2,
			
 
				-           .text = llvm::StringRef("\\", 1)},
			
 
				+          {.kind = TokenKind::Error(), .line = 4, .column = 1, .text = "\"x"},
			
 
				           {.kind = TokenKind::EndOfFile(), .line = 4, .column = 3},
			
 
				       }));
			
 
				 }
			
@@ -797,24 +791,27 @@ TEST_F(LexerTest, StringLiterals) {
 
				 
			
 
				 TEST_F(LexerTest, InvalidStringLiterals) {
			
 
				   llvm::StringLiteral invalid[] = {
			
 
				+      // clang-format off
			
 
				       R"(")",
			
 
				       R"("""
			
 
				-      "")",        //
			
 
				-      R"("\)",     //
			
 
				-      R"("\")",    //
			
 
				-      R"("\\)",    //
			
 
				-      R"("\\\")",  //
			
 
				+      "")",
			
 
				+      R"("\)",
			
 
				+      R"("\")",
			
 
				+      R"("\\)",
			
 
				+      R"("\\\")",
			
 
				       R"(""")",
			
 
				       R"("""
			
 
				-      )",  //
			
 
				+      )",
			
 
				       R"("""\)",
			
 
				       R"(#"""
			
 
				       """)",
			
 
				+      // clang-format on
			
 
				   };
			
 
				 
			
 
				   for (llvm::StringLiteral test : invalid) {
			
 
				+    SCOPED_TRACE(test);
			
 
				     auto buffer = Lex(test);
			
 
				-    EXPECT_TRUE(buffer.HasErrors()) << "`" << test << "`";
			
 
				+    EXPECT_TRUE(buffer.HasErrors());
			
 
				 
			
 
				     // We should have formed at least one error token.
			
 
				     bool found_error = false;
			
@@ -824,7 +821,7 @@ TEST_F(LexerTest, InvalidStringLiterals) {
 
				         break;
			
 
				       }
			
 
				     }
			
 
				-    EXPECT_TRUE(found_error) << "`" << test << "`";
			
 
				+    EXPECT_TRUE(found_error);
			
 
				   }
			
 
				 }
			
 
				 
			
@@ -935,45 +932,74 @@ TEST_F(LexerTest, TypeLiterals) {
 
				   EXPECT_EQ(buffer.GetTypeLiteralSize(*token_f1), 1);
			
 
				 }
			
 
				 
			
 
				-TEST_F(LexerTest, Diagnostics) {
			
 
				+TEST_F(LexerTest, DiagnosticTrailingComment) {
			
 
				   llvm::StringLiteral testcase = R"(
			
 
				     // Hello!
			
 
				     var String x; // trailing comment
			
 
				-    //no space after comment
			
 
				-    "hello\bworld\xab"
			
 
				-    0x123abc
			
 
				-    #"
			
 
				   )";
			
 
				 
			
 
				   Testing::MockDiagnosticConsumer consumer;
			
 
				   EXPECT_CALL(consumer, HandleDiagnostic(AllOf(
			
 
				                             DiagnosticAt(3, 19),
			
 
				                             DiagnosticMessage(HasSubstr("Trailing comment")))));
			
 
				+  Lex(testcase, consumer);
			
 
				+}
			
 
				+
			
 
				+TEST_F(LexerTest, DiagnosticWhitespace) {
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				   EXPECT_CALL(consumer,
			
 
				               HandleDiagnostic(AllOf(
			
 
				-                  DiagnosticAt(4, 7),
			
 
				+                  DiagnosticAt(1, 3),
			
 
				                   DiagnosticMessage(HasSubstr("Whitespace is required")))));
			
 
				+  Lex("//no space after comment", consumer);
			
 
				+}
			
 
				+
			
 
				+TEST_F(LexerTest, DiagnosticUnrecognizedEscape) {
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				   EXPECT_CALL(
			
 
				       consumer,
			
 
				       HandleDiagnostic(AllOf(
			
 
				-          DiagnosticAt(5, 12),
			
 
				+          DiagnosticAt(1, 8),
			
 
				           DiagnosticMessage(HasSubstr("Unrecognized escape sequence `b`")))));
			
 
				+  Lex(R"("hello\bworld")", consumer);
			
 
				+}
			
 
				+
			
 
				+TEST_F(LexerTest, DiagnosticBadHex) {
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				   EXPECT_CALL(
			
 
				       consumer,
			
 
				       HandleDiagnostic(AllOf(
			
 
				-          DiagnosticAt(5, 20),
			
 
				+          DiagnosticAt(1, 9),
			
 
				           DiagnosticMessage(HasSubstr("two uppercase hexadecimal digits")))));
			
 
				+  Lex(R"("hello\xabworld")", consumer);
			
 
				+}
			
 
				+
			
 
				+TEST_F(LexerTest, DiagnosticInvalidDigit) {
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				   EXPECT_CALL(
			
 
				       consumer,
			
 
				       HandleDiagnostic(AllOf(
			
 
				-          DiagnosticAt(6, 10),
			
 
				+          DiagnosticAt(1, 6),
			
 
				           DiagnosticMessage(HasSubstr("Invalid digit 'a' in hexadecimal")))));
			
 
				+  Lex("0x123abc", consumer);
			
 
				+}
			
 
				+
			
 
				+TEST_F(LexerTest, DiagnosticMissingTerminator) {
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				+  EXPECT_CALL(consumer,
			
 
				+              HandleDiagnostic(
			
 
				+                  AllOf(DiagnosticAt(1, 1),
			
 
				+                        DiagnosticMessage(HasSubstr("missing a terminator")))));
			
 
				+  Lex(R"(#" ")", consumer);
			
 
				+}
			
 
				+
			
 
				+TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				   EXPECT_CALL(consumer,
			
 
				               HandleDiagnostic(AllOf(
			
 
				-                  DiagnosticAt(7, 5),
			
 
				+                  DiagnosticAt(1, 1),
			
 
				                   DiagnosticMessage(HasSubstr("unrecognized character")))));
			
 
				-
			
 
				-  Lex(testcase, consumer);
			
 
				+  Lex("\b", consumer);
			
 
				 }
			
 
				 
			
 
				 auto GetAndDropLine(llvm::StringRef& text) -> std::string {