Просмотр исходного кода

Fix string literal parsing escaped whitespace (#2617)

The toolchain misinterprets escaped whitespace characters as unescaped when trimming trailing whitespace on a line. This PR adds a variable that tracks the length of the result string right after the most recent escape sequence was expanded, making sure we don't back up past that character, whatever it is.

I did try the approach mentioned in #2132 of not backing up `end_of_regular_text` number of characters, but this caused problems on lines like <kbd>tab</kbd> <kbd>space</kbd> <kbd>tab</kbd> (those characters literally, not escaped) where it would leave the first tab since that is processed in a different iteration of that loop.

I added a test case for this bug. I kept mixing up which value in the test output was the expected one, so I also renamed a variable there for clarity.

Fixes #2132.
Calvin 3 года назад
Родитель
Commit
a57c3d9801
2 изменённых файла: 14 добавлений и 5 удалений
  1. 10 3
      toolchain/lexer/string_literal.cpp
  2. 4 2
      toolchain/lexer/string_literal_test.cpp

+ 10 - 3
toolchain/lexer/string_literal.cpp

@@ -363,8 +363,13 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
       }
     }
 
+    // Tracks the length of the result at the last time we expanded an escape
+    // to ensure we don't misinterpret it as unescaped when backtracking.
+    size_t last_escape_length = 0;
+
     // Process the contents of the line.
     while (true) {
+      // Append the next segment of plain text.
       auto end_of_regular_text = contents.find_if([](char c) {
         return c == '\n' || c == '\\' ||
                (IsHorizontalWhitespace(c) && c != ' ');
@@ -377,10 +382,11 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
       }
 
       if (contents.consume_front("\n")) {
-        // Trailing whitespace before a newline doesn't contribute to the string
-        // literal value.
+        // Trailing whitespace in the source before a newline doesn't contribute
+        // to the string literal value. However, escaped whitespace (like `\t`)
+        // and any whitespace just before that does contribute.
         while (!result.empty() && result.back() != '\n' &&
-               IsSpace(result.back())) {
+               IsSpace(result.back()) && result.length() > last_escape_length) {
           result.pop_back();
         }
         result += '\n';
@@ -425,6 +431,7 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
 
       // Handle this escape sequence.
       ExpandAndConsumeEscapeSequence(emitter, contents, result);
+      last_escape_length = result.length();
     }
   }
 }

+ 4 - 2
toolchain/lexer/string_literal_test.cpp

@@ -202,13 +202,15 @@ TEST_F(StringLiteralTest, StringLiteralContents) {
       // Trailing whitespace handling.
       {"'''\n  Hello \\\n  World \t \n  Bye!  \\\n  '''",
        "Hello World\nBye!  "},
+      {"'''\n\\t\n'''", "\t\n"},
+      {"'''\n\\t \n'''", "\t\n"},
   };
 
-  for (auto [test, contents] : testcases) {
+  for (auto [test, expected] : testcases) {
     error_tracker.Reset();
     auto value = Parse(test.trim());
     EXPECT_FALSE(error_tracker.seen_error()) << "`" << test << "`";
-    EXPECT_EQ(value, contents);
+    EXPECT_EQ(value, expected);
   }
 }