Explorar el Código

Adjust string parsing to consume everything until the terminator. (#1111)

Note I've added a few TODOs, particularly that multi-line strings should only consume until the dedent.

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Jon Meow hace 4 años
padre
commit
0a8c0dc271

+ 1 - 0
toolchain/lexer/BUILD

@@ -142,6 +142,7 @@ cc_fuzz_test(
     corpus = glob(["fuzzer_corpus/string_literal/*"]),
     deps = [
         ":string_literal",
+        "//common:check",
         "//toolchain/diagnostics:diagnostic_emitter",
         "//toolchain/diagnostics:null_diagnostics",
         "@llvm-project//llvm:Support",

+ 19 - 6
toolchain/lexer/string_literal.cpp

@@ -138,6 +138,8 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
   terminator.resize(terminator.size() + hash_level, '#');
   escape.resize(escape.size() + hash_level, '#');
 
+  // TODO: Detect indent / dedent for multi-line string literals in order to
+  // stop parsing on dedent before a terminator is found.
   for (; cursor < source_text_size; ++cursor) {
     // This switch and loop structure relies on multi-character terminators and
     // escape sequences starting with a predictable character and not containing
@@ -152,13 +154,19 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
           // should stop here.
           if (cursor >= source_text_size ||
               (!multi_line && source_text[cursor] == '\n')) {
-            return llvm::None;
+            llvm::StringRef text = source_text.take_front(cursor);
+            return LexedStringLiteral(text, text.drop_front(prefix_len),
+                                      hash_level, multi_line,
+                                      /*is_terminated=*/false);
           }
         }
         break;
       case '\n':
         if (!multi_line) {
-          return llvm::None;
+          llvm::StringRef text = source_text.take_front(cursor);
+          return LexedStringLiteral(text, text.drop_front(prefix_len),
+                                    hash_level, multi_line,
+                                    /*is_terminated=*/false);
         }
         break;
       case '\"': {
@@ -168,15 +176,17 @@ auto LexedStringLiteral::Lex(llvm::StringRef source_text)
               source_text.substr(0, cursor + terminator.size());
           llvm::StringRef content =
               source_text.substr(prefix_len, cursor - prefix_len);
-          return LexedStringLiteral(text, content, hash_level, multi_line);
+          return LexedStringLiteral(text, content, hash_level, multi_line,
+                                    /*is_terminated=*/true);
         }
         break;
       }
     }
   }
-  // Let LexError figure out how to recover from an unterminated string
-  // literal.
-  return llvm::None;
+  // No terminator was found.
+  return LexedStringLiteral(source_text, source_text.drop_front(prefix_len),
+                            hash_level, multi_line,
+                            /*is_terminated=*/false);
 }
 
 // Given a string that contains at least one newline, find the indent (the
@@ -407,6 +417,9 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
 
 auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
     -> std::string {
+  if (!is_terminated_) {
+    return "";
+  }
   llvm::StringRef indent =
       multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
   return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,

+ 17 - 9
toolchain/lexer/string_literal.h

@@ -13,28 +13,34 @@ namespace Carbon {
 class LexedStringLiteral {
  public:
   // Extract a string literal token from the given text, if it has a suitable
-  // form.
+  // form. Returning llvm::None indicates no string literal was found; returning
+  // an invalid literal indicates a string prefix was found, but it's malformed
+  // and is returning a partial string literal to assist error construction.
   static auto Lex(llvm::StringRef source_text)
       -> llvm::Optional<LexedStringLiteral>;
 
-  // Get the text corresponding to this literal.
-  [[nodiscard]] auto Text() const -> llvm::StringRef { return text_; }
-
-  // Determine whether this is a multi-line string literal.
-  [[nodiscard]] auto IsMultiLine() const -> bool { return multi_line_; }
-
   // Expand any escape sequences in the given string literal and compute the
   // resulting value. This handles error recovery internally and cannot fail.
   auto ComputeValue(DiagnosticEmitter<const char*>& emitter) const
       -> std::string;
 
+  // Get the text corresponding to this literal.
+  [[nodiscard]] auto text() const -> llvm::StringRef { return text_; }
+
+  // Determine whether this is a multi-line string literal.
+  [[nodiscard]] auto is_multi_line() const -> bool { return multi_line_; }
+
+  // Returns true if the string has a valid terminator.
+  [[nodiscard]] auto is_terminated() const -> bool { return is_terminated_; }
+
  private:
   LexedStringLiteral(llvm::StringRef text, llvm::StringRef content,
-                     int hash_level, bool multi_line)
+                     int hash_level, bool multi_line, bool is_terminated)
       : text_(text),
         content_(content),
         hash_level_(hash_level),
-        multi_line_(multi_line) {}
+        multi_line_(multi_line),
+        is_terminated_(is_terminated) {}
 
   // The complete text of the string literal.
   llvm::StringRef text_;
@@ -47,6 +53,8 @@ class LexedStringLiteral {
   int hash_level_;
   // Whether this was a multi-line string literal.
   bool multi_line_;
+  // Whether the literal is valid, or should only be used for errors.
+  bool is_terminated_;
 };
 
 }  // namespace Carbon

+ 11 - 3
toolchain/lexer/string_literal_fuzzer.cpp

@@ -5,6 +5,7 @@
 #include <cstdint>
 #include <cstring>
 
+#include "common/check.h"
 #include "llvm/ADT/StringRef.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
 #include "toolchain/diagnostics/null_diagnostics.h"
@@ -22,11 +23,18 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
     return 0;
   }
 
-  // Check multiline flag was computed correctly.
-  if (token->IsMultiLine() != token->Text().contains('\n')) {
-    __builtin_trap();
+  if (!token->is_terminated()) {
+    // Found errors while parsing.
+    return 0;
   }
 
+  fprintf(stderr, "valid: %d\n", token->is_terminated());
+  fprintf(stderr, "size: %lu\n", token->text().size());
+  fprintf(stderr, "text: %s\n", token->text().str().c_str());
+
+  // Check multiline flag was computed correctly.
+  CHECK(token->is_multi_line() == token->text().contains('\n'));
+
   volatile auto value =
       token->ComputeValue(NullDiagnosticEmitter<const char*>());
   (void)value;

+ 14 - 7
toolchain/lexer/string_literal_test.cpp

@@ -22,7 +22,7 @@ class StringLiteralTest : public ::testing::Test {
   auto Lex(llvm::StringRef text) -> LexedStringLiteral {
     llvm::Optional<LexedStringLiteral> result = LexedStringLiteral::Lex(text);
     CHECK(result);
-    EXPECT_EQ(result->Text(), text);
+    EXPECT_EQ(result->text(), text);
     return *result;
   }
 
@@ -84,20 +84,22 @@ TEST_F(StringLiteralTest, StringLiteralBounds) {
   };
 
   for (llvm::StringLiteral test : valid) {
+    SCOPED_TRACE(test);
     llvm::Optional<LexedStringLiteral> result = LexedStringLiteral::Lex(test);
-    EXPECT_TRUE(result.hasValue()) << test;
+    EXPECT_TRUE(result.hasValue());
     if (result) {
-      EXPECT_EQ(result->Text(), test);
+      EXPECT_EQ(result->text(), test);
     }
   }
 
   llvm::StringLiteral invalid[] = {
+      // clang-format off
       R"(")",
       R"("""
       "")",
-      R"("\)",  //
+      R"("\)",
       R"("\")",
-      R"("\\)",  //
+      R"("\\)",
       R"("\\\")",
       R"("""
       )",
@@ -105,11 +107,16 @@ TEST_F(StringLiteralTest, StringLiteralBounds) {
       """)",
       R"(" \
       ")",
+      // clang-format on
   };
 
   for (llvm::StringLiteral test : invalid) {
-    EXPECT_FALSE(LexedStringLiteral::Lex(test).hasValue())
-        << "`" << test << "`";
+    SCOPED_TRACE(test);
+    llvm::Optional<LexedStringLiteral> result = LexedStringLiteral::Lex(test);
+    EXPECT_TRUE(result.hasValue());
+    if (result) {
+      EXPECT_FALSE(result->is_terminated());
+    }
   }
 }
 

+ 27 - 13
toolchain/lexer/tokenized_buffer.cpp

@@ -57,6 +57,12 @@ struct UnrecognizedCharacters : DiagnosticBase<UnrecognizedCharacters> {
       "Encountered unrecognized characters while parsing.";
 };
 
+struct UnterminatedString : DiagnosticBase<UnterminatedString> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-string-terminator";
+  static constexpr llvm::StringLiteral Message =
+      "String is missing a terminator.";
+};
+
 // TODO: Move Overload and VariantMatch somewhere more central.
 
 // Form an overload set from a list of functions. For example:
@@ -267,7 +273,7 @@ class TokenizedBuffer::Lexer {
 
     Line string_line = current_line_;
     int string_column = current_column_;
-    int literal_size = literal->Text().size();
+    int literal_size = literal->text().size();
     source_text = source_text.drop_front(literal_size);
 
     if (!set_indent_) {
@@ -276,10 +282,10 @@ class TokenizedBuffer::Lexer {
     }
 
     // Update line and column information.
-    if (!literal->IsMultiLine()) {
+    if (!literal->is_multi_line()) {
       current_column_ += literal_size;
     } else {
-      for (char c : literal->Text()) {
+      for (char c : literal->text()) {
         if (c == '\n') {
           HandleNewline();
           // The indentation of all lines in a multi-line string literal is
@@ -292,13 +298,23 @@ class TokenizedBuffer::Lexer {
       }
     }
 
-    auto token = buffer_.AddToken({.kind = TokenKind::StringLiteral(),
-                                   .token_line = string_line,
-                                   .column = string_column});
-    buffer_.GetTokenInfo(token).literal_index =
-        buffer_.literal_string_storage_.size();
-    buffer_.literal_string_storage_.push_back(literal->ComputeValue(emitter_));
-    return token;
+    if (literal->is_terminated()) {
+      auto token =
+          buffer_.AddToken({.kind = TokenKind::StringLiteral(),
+                            .token_line = string_line,
+                            .column = string_column,
+                            .literal_index = static_cast<int32_t>(
+                                buffer_.literal_string_storage_.size())});
+      buffer_.literal_string_storage_.push_back(
+          literal->ComputeValue(emitter_));
+      return token;
+    } else {
+      emitter_.EmitError<UnterminatedString>(literal->text().begin());
+      return buffer_.AddToken({.kind = TokenKind::Error(),
+                               .token_line = string_line,
+                               .column = string_column,
+                               .error_length = literal_size});
+    }
   }
 
   auto LexSymbolToken(llvm::StringRef& source_text) -> LexResult {
@@ -506,8 +522,6 @@ class TokenizedBuffer::Lexer {
       error_text = source_text.take_front(1);
     }
 
-    // Longer errors get to be two tokens.
-    error_text = error_text.substr(0, std::numeric_limits<int32_t>::max());
     auto token = buffer_.AddToken(
         {.kind = TokenKind::Error(),
          .token_line = current_line_,
@@ -632,7 +646,7 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
     llvm::Optional<LexedStringLiteral> relexed_token =
         LexedStringLiteral::Lex(source_->Text().substr(token_start));
     CHECK(relexed_token) << "Could not reform string literal token.";
-    return relexed_token->Text();
+    return relexed_token->text();
   }
 
   // Refer back to the source text to avoid needing to reconstruct the

+ 58 - 32
toolchain/lexer/tokenized_buffer_test.cpp

@@ -280,7 +280,7 @@ TEST_F(LexerTest, SplitsNumericLiteralsProperly) {
 }
 
 TEST_F(LexerTest, HandlesGarbageCharacters) {
-  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\"\n\"\\";
+  constexpr char GarbageText[] = "$$💩-$\n$\0$12$\n\\\"\\\n\"x";
   auto buffer = Lex(llvm::StringRef(GarbageText, sizeof(GarbageText) - 1));
   EXPECT_TRUE(buffer.HasErrors());
   EXPECT_THAT(
@@ -289,8 +289,8 @@ TEST_F(LexerTest, HandlesGarbageCharacters) {
           {.kind = TokenKind::Error(),
            .line = 1,
            .column = 1,
+           // 💩 takes 4 bytes, and we count column as bytes offset.
            .text = llvm::StringRef("$$💩", 6)},
-          // 💩 takes 4 bytes, and we count column as bytes offset.
           {.kind = TokenKind::Minus(), .line = 1, .column = 7},
           {.kind = TokenKind::Error(), .line = 1, .column = 8, .text = "$"},
           // newline
@@ -304,19 +304,13 @@ TEST_F(LexerTest, HandlesGarbageCharacters) {
            .text = "12"},
           {.kind = TokenKind::Error(), .line = 2, .column = 6, .text = "$"},
           // newline
-          {.kind = TokenKind::Error(),
+          {.kind = TokenKind::Backslash(),
            .line = 3,
            .column = 1,
-           .text = llvm::StringRef("\"", 1)},
+           .text = "\\"},
+          {.kind = TokenKind::Error(), .line = 3, .column = 2, .text = "\"\\"},
           // newline
-          {.kind = TokenKind::Error(),
-           .line = 4,
-           .column = 1,
-           .text = llvm::StringRef("\"", 1)},
-          {.kind = TokenKind::Backslash(),
-           .line = 4,
-           .column = 2,
-           .text = llvm::StringRef("\\", 1)},
+          {.kind = TokenKind::Error(), .line = 4, .column = 1, .text = "\"x"},
           {.kind = TokenKind::EndOfFile(), .line = 4, .column = 3},
       }));
 }
@@ -797,24 +791,27 @@ TEST_F(LexerTest, StringLiterals) {
 
 TEST_F(LexerTest, InvalidStringLiterals) {
   llvm::StringLiteral invalid[] = {
+      // clang-format off
       R"(")",
       R"("""
-      "")",        //
-      R"("\)",     //
-      R"("\")",    //
-      R"("\\)",    //
-      R"("\\\")",  //
+      "")",
+      R"("\)",
+      R"("\")",
+      R"("\\)",
+      R"("\\\")",
       R"(""")",
       R"("""
-      )",  //
+      )",
       R"("""\)",
       R"(#"""
       """)",
+      // clang-format on
   };
 
   for (llvm::StringLiteral test : invalid) {
+    SCOPED_TRACE(test);
     auto buffer = Lex(test);
-    EXPECT_TRUE(buffer.HasErrors()) << "`" << test << "`";
+    EXPECT_TRUE(buffer.HasErrors());
 
     // We should have formed at least one error token.
     bool found_error = false;
@@ -824,7 +821,7 @@ TEST_F(LexerTest, InvalidStringLiterals) {
         break;
       }
     }
-    EXPECT_TRUE(found_error) << "`" << test << "`";
+    EXPECT_TRUE(found_error);
   }
 }
 
@@ -935,45 +932,74 @@ TEST_F(LexerTest, TypeLiterals) {
   EXPECT_EQ(buffer.GetTypeLiteralSize(*token_f1), 1);
 }
 
-TEST_F(LexerTest, Diagnostics) {
+TEST_F(LexerTest, DiagnosticTrailingComment) {
   llvm::StringLiteral testcase = R"(
     // Hello!
     var String x; // trailing comment
-    //no space after comment
-    "hello\bworld\xab"
-    0x123abc
-    #"
   )";
 
   Testing::MockDiagnosticConsumer consumer;
   EXPECT_CALL(consumer, HandleDiagnostic(AllOf(
                             DiagnosticAt(3, 19),
                             DiagnosticMessage(HasSubstr("Trailing comment")))));
+  Lex(testcase, consumer);
+}
+
+TEST_F(LexerTest, DiagnosticWhitespace) {
+  Testing::MockDiagnosticConsumer consumer;
   EXPECT_CALL(consumer,
               HandleDiagnostic(AllOf(
-                  DiagnosticAt(4, 7),
+                  DiagnosticAt(1, 3),
                   DiagnosticMessage(HasSubstr("Whitespace is required")))));
+  Lex("//no space after comment", consumer);
+}
+
+TEST_F(LexerTest, DiagnosticUnrecognizedEscape) {
+  Testing::MockDiagnosticConsumer consumer;
   EXPECT_CALL(
       consumer,
       HandleDiagnostic(AllOf(
-          DiagnosticAt(5, 12),
+          DiagnosticAt(1, 8),
           DiagnosticMessage(HasSubstr("Unrecognized escape sequence `b`")))));
+  Lex(R"("hello\bworld")", consumer);
+}
+
+TEST_F(LexerTest, DiagnosticBadHex) {
+  Testing::MockDiagnosticConsumer consumer;
   EXPECT_CALL(
       consumer,
       HandleDiagnostic(AllOf(
-          DiagnosticAt(5, 20),
+          DiagnosticAt(1, 9),
           DiagnosticMessage(HasSubstr("two uppercase hexadecimal digits")))));
+  Lex(R"("hello\xabworld")", consumer);
+}
+
+TEST_F(LexerTest, DiagnosticInvalidDigit) {
+  Testing::MockDiagnosticConsumer consumer;
   EXPECT_CALL(
       consumer,
       HandleDiagnostic(AllOf(
-          DiagnosticAt(6, 10),
+          DiagnosticAt(1, 6),
           DiagnosticMessage(HasSubstr("Invalid digit 'a' in hexadecimal")))));
+  Lex("0x123abc", consumer);
+}
+
+TEST_F(LexerTest, DiagnosticMissingTerminator) {
+  Testing::MockDiagnosticConsumer consumer;
+  EXPECT_CALL(consumer,
+              HandleDiagnostic(
+                  AllOf(DiagnosticAt(1, 1),
+                        DiagnosticMessage(HasSubstr("missing a terminator")))));
+  Lex(R"(#" ")", consumer);
+}
+
+TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
+  Testing::MockDiagnosticConsumer consumer;
   EXPECT_CALL(consumer,
               HandleDiagnostic(AllOf(
-                  DiagnosticAt(7, 5),
+                  DiagnosticAt(1, 1),
                   DiagnosticMessage(HasSubstr("unrecognized character")))));
-
-  Lex(testcase, consumer);
+  Lex("\b", consumer);
 }
 
 auto GetAndDropLine(llvm::StringRef& text) -> std::string {