4 лет назад · 49013ae1cc
--- a/.bazelignore
+++ b/.bazelignore
@@ -1,6 +0,0 @@
 
				-# Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				-# Exceptions. See /LICENSE for license information.
			
 
				-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				-
			
 
				-# Carbon creates some non-standard bazel directories, so ignore them.
			
 
				-bazel-carbon-lang
			
--- a/common/string_helpers.cpp
+++ b/common/string_helpers.cpp
@@ -27,58 +27,68 @@ auto UnescapeStringLiteral(llvm::StringRef source)
 
				   size_t i = 0;
			
 
				   while (i < source.size()) {
			
 
				     char c = source[i];
			
 
				-    if (c == '\\') {
			
 
				-      ++i;
			
 
				-      if (i == source.size()) {
			
 
				-        return std::nullopt;
			
 
				-      }
			
 
				-      switch (source[i]) {
			
 
				-        case 'n':
			
 
				-          ret.push_back('\n');
			
 
				-          break;
			
 
				-        case 'r':
			
 
				-          ret.push_back('\r');
			
 
				-          break;
			
 
				-        case 't':
			
 
				-          ret.push_back('\t');
			
 
				-          break;
			
 
				-        case '0':
			
 
				-          if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
			
 
				-            // \0[0-9] is reserved.
			
 
				-            return std::nullopt;
			
 
				-          }
			
 
				-          ret.push_back('\0');
			
 
				-          break;
			
 
				-        case '"':
			
 
				-          ret.push_back('"');
			
 
				-          break;
			
 
				-        case '\'':
			
 
				-          ret.push_back('\'');
			
 
				-          break;
			
 
				-        case '\\':
			
 
				-          ret.push_back('\\');
			
 
				-          break;
			
 
				-        case 'x': {
			
 
				-          i += 2;
			
 
				-          if (i >= source.size()) {
			
 
				-            return std::nullopt;
			
 
				+    switch (c) {
			
 
				+      case '\\':
			
 
				+        ++i;
			
 
				+        if (i == source.size()) {
			
 
				+          return std::nullopt;
			
 
				+        }
			
 
				+        switch (source[i]) {
			
 
				+          case 'n':
			
 
				+            ret.push_back('\n');
			
 
				+            break;
			
 
				+          case 'r':
			
 
				+            ret.push_back('\r');
			
 
				+            break;
			
 
				+          case 't':
			
 
				+            ret.push_back('\t');
			
 
				+            break;
			
 
				+          case '0':
			
 
				+            if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
			
 
				+              // \0[0-9] is reserved.
			
 
				+              return std::nullopt;
			
 
				+            }
			
 
				+            ret.push_back('\0');
			
 
				+            break;
			
 
				+          case '"':
			
 
				+            ret.push_back('"');
			
 
				+            break;
			
 
				+          case '\'':
			
 
				+            ret.push_back('\'');
			
 
				+            break;
			
 
				+          case '\\':
			
 
				+            ret.push_back('\\');
			
 
				+            break;
			
 
				+          case 'x': {
			
 
				+            i += 2;
			
 
				+            if (i >= source.size()) {
			
 
				+              return std::nullopt;
			
 
				+            }
			
 
				+            std::optional<char> c1 = FromHex(source[i - 1]);
			
 
				+            std::optional<char> c2 = FromHex(source[i]);
			
 
				+            if (c1 == std::nullopt || c2 == std::nullopt) {
			
 
				+              return std::nullopt;
			
 
				+            }
			
 
				+            ret.push_back(16 * *c1 + *c2);
			
 
				+            break;
			
 
				           }
			
 
				-          std::optional<char> c1 = FromHex(source[i - 1]);
			
 
				-          std::optional<char> c2 = FromHex(source[i]);
			
 
				-          if (c1 == std::nullopt || c2 == std::nullopt) {
			
 
				+          case 'u':
			
 
				+            FATAL() << "\\u is not yet supported in string literals";
			
 
				+          default:
			
 
				+            // Unsupported.
			
 
				             return std::nullopt;
			
 
				-          }
			
 
				-          ret.push_back(16 * *c1 + *c2);
			
 
				-          break;
			
 
				         }
			
 
				-        case 'u':
			
 
				-          FATAL() << "\\u is not yet supported in string literals";
			
 
				-        default:
			
 
				-          // Unsupported.
			
 
				-          return std::nullopt;
			
 
				-      }
			
 
				-    } else {
			
 
				-      ret.push_back(c);
			
 
				+        break;
			
 
				+
			
 
				+      case '\t':
			
 
				+        // Disallow non-` ` horizontal whitespace:
			
 
				+        // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/whitespace.md
			
 
				+        // TODO: This doesn't handle unicode whitespace.
			
 
				+        return std::nullopt;
			
 
				+
			
 
				+      default:
			
 
				+        ret.push_back(c);
			
 
				+        break;
			
 
				     }
			
 
				     ++i;
			
 
				   }
			
--- a/common/string_helpers_test.cpp
+++ b/common/string_helpers_test.cpp
@@ -17,6 +17,8 @@ namespace {
 
				 
			
 
				 TEST(UnescapeStringLiteral, Valid) {
			
 
				   EXPECT_THAT(UnescapeStringLiteral("test"), Optional(Eq("test")));
			
 
				+  EXPECT_THAT(UnescapeStringLiteral("okay whitespace"),
			
 
				+              Optional(Eq("okay whitespace")));
			
 
				   EXPECT_THAT(UnescapeStringLiteral("test\n"), Optional(Eq("test\n")));
			
 
				   EXPECT_THAT(UnescapeStringLiteral("test\\n"), Optional(Eq("test\n")));
			
 
				   EXPECT_THAT(UnescapeStringLiteral("abc\\ndef"), Optional(Eq("abc\ndef")));
			
--- a/docs/design/lexical_conventions/string_literals.md
+++ b/docs/design/lexical_conventions/string_literals.md
@@ -83,8 +83,9 @@ A _simple string literal_ is formed of a sequence of:
 
				 
			
 
				 -   Characters other than `\` and `"`.
			
 
				     -   Only space characters (U+0020) are valid whitespace in a string literal.
			
 
				-        Other whitespace, including tabs and newlines, are disallowed but parse
			
 
				-        as part of the string for error recovery purposes.
			
 
				+    -   Other [horizontal whitespace](whitespace.md), including tabs, are
			
 
				+        disallowed but parse as part of the string for error recovery purposes.
			
 
				+    -   Vertical whitespace will not parse as part of a simple string literal.
			
 
				 -   [Escape sequences](#escape-sequences).
			
 
				     -   Each escape sequence is replaced with the corresponding character
			
 
				         sequence or code unit sequence.
			
--- a/docs/design/lexical_conventions/whitespace.md
+++ b/docs/design/lexical_conventions/whitespace.md
@@ -27,17 +27,19 @@ Unicode Annex #31 suggests selecting whitespace characters based on the
 
				 characters with Unicode property `Pattern_White_Space`, which is currently these
			
 
				 11 characters:
			
 
				 
			
 
				--   U+0009 CHARACTER TABULATION (horizontal tab)
			
 
				--   U+000A LINE FEED (traditional newline)
			
 
				--   U+000B LINE TABULATION (vertical tab)
			
 
				--   U+000C FORM FEED (page break)
			
 
				--   U+000D CARRIAGE RETURN
			
 
				--   U+0020 SPACE
			
 
				--   U+0085 NEXT LINE (Unicode newline)
			
 
				--   U+200E LEFT-TO-RIGHT MARK
			
 
				--   U+200F RIGHT-TO-LEFT MARK
			
 
				--   U+2028 LINE SEPARATOR
			
 
				--   U+2029 PARAGRAPH SEPARATOR
			
 
				+-   Horizontal whitespace:
			
 
				+    -   U+0009 CHARACTER TABULATION (horizontal tab)
			
 
				+    -   U+0020 SPACE
			
 
				+    -   U+200E LEFT-TO-RIGHT MARK
			
 
				+    -   U+200F RIGHT-TO-LEFT MARK
			
 
				+-   Vertical whitespace:
			
 
				+    -   U+000A LINE FEED (traditional newline)
			
 
				+    -   U+000B LINE TABULATION (vertical tab)
			
 
				+    -   U+000C FORM FEED (page break)
			
 
				+    -   U+000D CARRIAGE RETURN
			
 
				+    -   U+0085 NEXT LINE (Unicode newline)
			
 
				+    -   U+2028 LINE SEPARATOR
			
 
				+    -   U+2029 PARAGRAPH SEPARATOR
			
 
				 
			
 
				 The quantity and kind of whitespace separating tokens is ignored except where
			
 
				 otherwise specified.
			
--- a/executable_semantics/syntax/lexer.lpp
+++ b/executable_semantics/syntax/lexer.lpp
@@ -79,12 +79,14 @@ WHILE                "while"
 
				 identifier            [A-Za-z_][A-Za-z0-9_]*
			
 
				 sized_type_literal    [iuf][1-9][0-9]*
			
 
				 integer_literal       [0-9]+
			
 
				-string_literal        \"([^\\\"\n\t]|\\.)*\"
			
 
				 horizontal_whitespace [ \t\r]
			
 
				 whitespace            [ \t\r\n]
			
 
				 one_line_comment      \/\/[^\n]*\n
			
 
				 operand_start         [(A-Za-z0-9_"]
			
 
				 
			
 
				+/* Single-line string literals should reject vertical whitespace. */
			
 
				+string_literal        \"([^\\\"\n\v\f\r]|\\.)*\"
			
 
				+
			
 
				 %{
			
 
				   // This macro is expanded immediately before each action specified below.
			
 
				   //
			
--- a/executable_semantics/testdata/string_fail6.golden
+++ b/executable_semantics/testdata/string_fail6.golden
@@ -1,2 +1,2 @@
 
				-COMPILATION ERROR: executable_semantics/testdata/string_fail6.carbon:6: invalid character '\x22' in source file.
			
 
				+COMPILATION ERROR: executable_semantics/testdata/string_fail6.carbon:6: Invalid escaping in string: "new	line"
			
 
				 EXIT CODE: 255