Prechádzať zdrojové kódy

Initial implementation of block string literals following lexical_conventions/string_literals.md. (#1028)

* Initial implementation of block string literals following lexical_conventions/string_literals.md.

Enabled yyinput() in flex to implement parsing.
Added ParseBlockStringLiteral() helper to handle further transformations such as indenting.
Modified format_grammar to support single-quoted strings to prevent a failure on lexer.lpp.

* Fixed _find_string_end quote parameter type int -> str.

* Update executable_semantics/syntax/BUILD

Co-authored-by: Geoff Romer <gromer@google.com>

* Addressed code review comments - split table-driven test into individual tests, renamed constants to match style guide.

* Addressed code review comments -- using EXPECT_THAT_EXPECTED() in tests, lexer comments and code cleanup.

Co-authored-by: Geoff Romer <gromer@google.com>
pk19604014 4 rokov pred
rodič
commit
2017eb4da0

+ 2 - 0
common/BUILD

@@ -60,5 +60,7 @@ cc_test(
     deps = [
         ":string_helpers",
         "@com_google_googletest//:gtest_main",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:TestingSupport",
     ],
 )

+ 80 - 2
common/string_helpers.cpp

@@ -4,13 +4,22 @@
 
 #include "common/string_helpers.h"
 
+#include <algorithm>
+#include <optional>
+
 #include "common/check.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 
 namespace Carbon {
 
+namespace {
+
+constexpr llvm::StringRef TripleQuotes = "\"\"\"";
+constexpr llvm::StringRef HorizontalWhitespaceChars = " \t";
+
 // Carbon only takes uppercase hex input.
-static auto FromHex(char c) -> std::optional<char> {
+auto FromHex(char c) -> std::optional<char> {
   if (c >= '0' && c <= '9') {
     return c - '0';
   }
@@ -20,7 +29,14 @@ static auto FromHex(char c) -> std::optional<char> {
   return std::nullopt;
 }
 
-auto UnescapeStringLiteral(llvm::StringRef source)
+// Creates an error instance with the specified `message`.
+llvm::Expected<std::string> MakeError(llvm::Twine message) {
+  return llvm::createStringError(llvm::inconvertibleErrorCode(), message);
+}
+
+}  // namespace
+
+auto UnescapeStringLiteral(llvm::StringRef source, bool is_block_string)
     -> std::optional<std::string> {
   std::string ret;
   ret.reserve(source.size());
@@ -74,6 +90,11 @@ auto UnescapeStringLiteral(llvm::StringRef source)
           }
           case 'u':
             FATAL() << "\\u is not yet supported in string literals";
+          case '\n':
+            if (!is_block_string) {
+              return std::nullopt;
+            }
+            break;
           default:
             // Unsupported.
             return std::nullopt;
@@ -95,4 +116,61 @@ auto UnescapeStringLiteral(llvm::StringRef source)
   return ret;
 }
 
+// Parses the block string literal in `source`, which must span from the
+// opening triple quotes through the closing triple quotes.
+//
+// The first line holds the opening quotes and an optional file type
+// indicator. The last line holds the closing quotes, and the horizontal
+// whitespace in front of them defines the indent that is stripped from every
+// content line. Each content line is unescaped and contributes a trailing
+// newline unless it ends in an escaped newline.
+//
+// Returns the parsed text, or an error describing the first problem found.
+auto ParseBlockStringLiteral(llvm::StringRef source)
+    -> llvm::Expected<std::string> {
+  llvm::SmallVector<llvm::StringRef> lines;
+  source.split(lines, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/true);
+  if (lines.size() < 2) {
+    return MakeError("Too few lines");
+  }
+
+  llvm::StringRef first = lines[0];
+  if (!first.consume_front(TripleQuotes)) {
+    return MakeError("Should start with triple quotes: " + first);
+  }
+  first = first.rtrim(HorizontalWhitespaceChars);
+  // Remaining chars, if any, are a file type indicator.
+  if (first.find_first_of("\"#") != llvm::StringRef::npos ||
+      first.find_first_of(HorizontalWhitespaceChars) != llvm::StringRef::npos) {
+    return MakeError("Invalid characters in file type indicator: " + first);
+  }
+
+  // Keep the indent as text rather than as a count, so that content lines can
+  // be required to start with exactly the same whitespace characters.
+  const llvm::StringRef last_line = lines[lines.size() - 1];
+  const llvm::StringRef last = last_line.ltrim(HorizontalWhitespaceChars);
+  const llvm::StringRef indent = last_line.drop_back(last.size());
+  if (last != TripleQuotes) {
+    return MakeError("Should end with triple quotes: " + last);
+  }
+
+  std::string parsed;
+  for (size_t i = 1; i < lines.size() - 1; ++i) {
+    llvm::StringRef line = lines[i];
+    if (line.find_first_not_of(HorizontalWhitespaceChars) ==
+        llvm::StringRef::npos) {
+      // Empty or whitespace-only line: exempt from the indent requirement.
+      line = "";
+    } else {
+      // consume_front() leaves `line` untouched on failure, so the message
+      // still shows the complete offending line.
+      if (!line.consume_front(indent)) {
+        return MakeError("Wrong indent for line: " + line + ", expected " +
+                         llvm::Twine(indent.size()));
+      }
+      line = line.rtrim(HorizontalWhitespaceChars);
+    }
+    // Unescaping with \n appended to handle things like \\<newline>.
+    llvm::SmallVector<char> buffer;
+    std::optional<std::string> unescaped = UnescapeStringLiteral(
+        (line + "\n").toStringRef(buffer), /*is_block_string=*/true);
+    if (!unescaped.has_value()) {
+      return MakeError("Invalid escaping in " + line);
+    }
+    // A \<newline> string collapses into nothing, so this may append nothing.
+    parsed.append(*unescaped);
+  }
+  return parsed;
+}
+
 }  // namespace Carbon

+ 8 - 2
common/string_helpers.h

@@ -9,6 +9,7 @@
 #include <string>
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
 
 namespace Carbon {
 
@@ -16,10 +17,15 @@ namespace Carbon {
 // complementary to this.
 
 // Unescapes Carbon escape sequences in the source string. Returns std::nullopt
-// on bad input.
-auto UnescapeStringLiteral(llvm::StringRef source)
+// on bad input. `is_block_string` enables escaping unique to block string
+// literals, such as \<newline>.
+auto UnescapeStringLiteral(llvm::StringRef source, bool is_block_string = false)
     -> std::optional<std::string>;
 
+// Parses a block string literal in `source`.
+auto ParseBlockStringLiteral(llvm::StringRef source)
+    -> llvm::Expected<std::string>;
+
 }  // namespace Carbon
 
 #endif  // COMMON_STRING_HELPERS_H_

+ 139 - 0
common/string_helpers_test.cpp

@@ -9,6 +9,11 @@
 
 #include <string>
 
+#include "llvm/Support/Error.h"
+#include "llvm/Testing/Support/Error.h"
+
+using ::llvm::FailedWithMessage;
+using ::llvm::HasValue;
 using ::testing::Eq;
 using ::testing::Optional;
 
@@ -52,5 +57,139 @@ TEST(UnescapeStringLiteral, Nul) {
   EXPECT_THAT((*str)[2], Eq('b'));
 }
 
+TEST(ParseBlockStringLiteral, FailTooFewLines) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(""),
+                       FailedWithMessage("Too few lines"));
+}
+
+TEST(ParseBlockStringLiteral, FailNoLeadingTripleQuotes) {
+  EXPECT_THAT_EXPECTED(
+      ParseBlockStringLiteral("'a'\n"),
+      FailedWithMessage("Should start with triple quotes: 'a'"));
+}
+
+// Whitespace inside the file type indicator is rejected.
+TEST(ParseBlockStringLiteral, FailInvalidFiletypeIndicator) {
+  EXPECT_THAT_EXPECTED(
+      ParseBlockStringLiteral("\"\"\"carbon file\n"),
+      FailedWithMessage(
+          "Invalid characters in file type indicator: carbon file"));
+}
+
+TEST(ParseBlockStringLiteral, FailEndingTripleQuotes) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral("\"\"\"\n"),
+                       FailedWithMessage("Should end with triple quotes: "));
+}
+
+TEST(ParseBlockStringLiteral, FailWrongIndent) {
+  EXPECT_THAT_EXPECTED(
+      ParseBlockStringLiteral(R"("""
+     A block string literal
+    with wrong indent
+     """)"),
+      FailedWithMessage(
+          "Wrong indent for line:     with wrong indent, expected 5"));
+}
+
+TEST(ParseBlockStringLiteral, FailInvalidEscaping) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     \q
+     """)"),
+                       FailedWithMessage("Invalid escaping in \\q"));
+}
+
+TEST(ParseBlockStringLiteral, OkEmptyString) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+""")"),
+                       HasValue(""));
+}
+
+TEST(ParseBlockStringLiteral, OkOneLineString) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     A block string literal
+     """)"),
+                       HasValue(R"(A block string literal
+)"));
+}
+
+TEST(ParseBlockStringLiteral, OkTwoLineString) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     A block string literal
+       with indent.
+     """)"),
+                       HasValue(R"(A block string literal
+  with indent.
+)"));
+}
+
+TEST(ParseBlockStringLiteral, OkWithFileTypeIndicator) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""carbon
+     A block string literal
+       with file type indicator.
+     """)"),
+                       HasValue(R"(A block string literal
+  with file type indicator.
+)"));
+}
+
+// Horizontal whitespace after the opening quotes is ignored.
+TEST(ParseBlockStringLiteral, OkWhitespaceAfterOpeningQuotes) {
+  // The trailing blanks are written with explicit escapes in an ordinary
+  // string literal: inside a raw string they would be invisible and easily
+  // stripped by an editor, leaving the test with nothing to check.
+  EXPECT_THAT_EXPECTED(
+      ParseBlockStringLiteral("\"\"\"  \t\n"
+                              "     A block string literal\n"
+                              "     \"\"\""),
+      HasValue("A block string literal\n"));
+}
+
+TEST(ParseBlockStringLiteral, OkWithEmptyLines) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     A block string literal
+
+       with
+
+       empty
+
+       lines.
+     """)"),
+                       HasValue(R"(A block string literal
+
+  with
+
+  empty
+
+  lines.
+)"));
+}
+
+TEST(ParseBlockStringLiteral, OkWithSlashNewlineEscape) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     A block string literal\
+     """)"),
+                       HasValue("A block string literal"));
+}
+
+TEST(ParseBlockStringLiteral, OkWithDoubleSlashNewline) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     A block string literal\\
+     """)"),
+                       HasValue(R"(A block string literal\
+)"));
+}
+
+TEST(ParseBlockStringLiteral, OkWithTripleSlashNewline) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     A block string literal\\\
+     """)"),
+                       HasValue(R"(A block string literal\)"));
+}
+
+TEST(ParseBlockStringLiteral, OkMultipleSlashes) {
+  EXPECT_THAT_EXPECTED(ParseBlockStringLiteral(R"("""
+     A block string literal\
+     \
+     \
+     \
+     """)"),
+                       HasValue("A block string literal"));
+}
+
 }  // namespace
 }  // namespace Carbon

+ 1 - 0
executable_semantics/syntax/BUILD

@@ -49,6 +49,7 @@ cc_library(
     ],
     # Disable warnings for generated code.
     copts = [
+        "-Wno-implicit-fallthrough",  # Needed to make yyinput() code compile.
         "-Wno-unneeded-internal-declaration",
         "-Wno-unused-function",
         "-Wno-writable-strings",

+ 7 - 6
executable_semantics/syntax/format_grammar.py

@@ -94,12 +94,13 @@ def _clang_format(code: str, base_style: str, cols: int) -> str:
 
 def _find_string_end(content: str, start: int) -> int:
     """Returns the end of a string, skipping escapes."""
-    i = start
+    quote = content[start]
+    i = start + 1
     while i < len(content):
         c = content[i]
         if c == "\\":
             i += 1
-        elif c == '"':
+        elif c == quote:
             return i
         i += 1
     exit("failed to find end of string: %s" % content[start : start + 20])
@@ -113,9 +114,9 @@ def _find_brace_end(content: str, has_percent: bool, start: int) -> int:
     i = start
     while i < len(content):
         c = content[i]
-        if c == '"':
+        if c == '"' or c == "'":
             # Skip over strings.
-            i = _find_string_end(content, i + 1)
+            i = _find_string_end(content, i)
         elif c == "/" and content[i + 1 : i + 2] == "/":
             # Skip over line comments.
             i = content.find("\n", i + 2)
@@ -287,9 +288,9 @@ def _parse_segments(
     table_segments: List[_Table] = []
     while i < len(content):
         c = content[i]
-        if c == '"':
+        if c == '"' or c == "'":
             # Skip over strings.
-            i = _find_string_end(content, i + 1)
+            i = _find_string_end(content, i)
         elif c == "/" and content[i + 1 : i + 2] == "*":
             text_segment_start, i = _parse_comment(
                 content=content,

+ 58 - 14
executable_semantics/syntax/lexer.lpp

@@ -12,10 +12,20 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   #include "executable_semantics/syntax/parse_and_lex_context.h"
   #include "executable_semantics/syntax/parser.h"
   #include "llvm/ADT/StringExtras.h"
+
+  // Prints a newline in trace mode because trace prints an incomplete line
+  // "Reading a token: " which can prevent LIT from finding expected patterns.
+  #define FATAL_SYNTAX_ERROR(context)                                       \
+    RAW_EXITING_STREAM() << (context.trace() ? "\n" : "")                   \
+                         << "COMPILATION ERROR: " << (context.source_loc()) \
+                         << ": "
+
+  // Reads and returns a single character. Fails on EOF.
+  char ReadChar(yyscan_t yyscanner, const Carbon::ParseAndLexContext& context);
 %}
 
 /* Turn off legacy bits we don't need. */
-%option noyywrap nounput nodefault noinput
+%option noyywrap nounput nodefault
 
 %option reentrant
 
@@ -256,17 +266,48 @@ string_literal        \"([^\\\"\n\v\f\r]|\\.)*\"
   CHECK(str.consume_front("\"") && str.consume_back("\""));
   std::optional<std::string> unescaped = Carbon::UnescapeStringLiteral(str);
   if (unescaped == std::nullopt) {
-    if (context.trace()) {
-      // Print a newline because trace prints an incomplete line
-      // "Reading a token: ".
-      llvm::errs() << "\n";
-    }
-    FATAL_COMPILATION_ERROR(context.source_loc())
-        << "Invalid escaping in string: " << yytext;
+    FATAL_SYNTAX_ERROR(context) << "Invalid escaping in string: " << yytext;
   }
   return ARG_TOKEN(string_literal, *unescaped);
 }
 
+\"\"\" {
+  // Block string literal. yytext holds only the opening quotes; the rest of
+  // the literal is read from the input one character at a time.
+  std::string s(yytext);
+  // Scans for the closing """: three consecutive quotes, none of them
+  // escaped. A backslash always consumes the character after it, so \" never
+  // counts towards the terminator, even when the backslash directly follows
+  // one or two unescaped quotes (as in ""\""").
+  int consecutive_quotes = 0;
+  while (consecutive_quotes < 3) {
+    const char c = ReadChar(yyscanner, context);
+    s.push_back(c);
+    if (c == '"') {
+      ++consecutive_quotes;
+      continue;
+    }
+    consecutive_quotes = 0;
+    if (c == '\\') {
+      // Copy the escaped character verbatim; the escape sequence itself is
+      // decoded by ParseBlockStringLiteral below.
+      s.push_back(ReadChar(yyscanner, context));
+    }
+  }
+  llvm::Expected<std::string> block_string = Carbon::ParseBlockStringLiteral(s);
+  if (!block_string) {
+    FATAL_SYNTAX_ERROR(context)
+        << "Invalid block string: " << toString(block_string.takeError());
+  }
+  return ARG_TOKEN(string_literal, *block_string);
+}
+
 {one_line_comment} {
   // Advance end by 1 line, resetting the column to zero.
   context.current_token_position.lines(1);
@@ -289,14 +330,17 @@ string_literal        \"([^\\\"\n\v\f\r]|\\.)*\"
 }
 
 . {
-  if (context.trace()) {
-    // Print a newline because trace prints an incomplete line
-    // "Reading a token: ".
-    llvm::errs() << "\n";
-  }
-  FATAL_COMPILATION_ERROR(context.source_loc())
+  FATAL_SYNTAX_ERROR(context)
       << "invalid character '\\x" << llvm::toHex(llvm::StringRef(yytext, 1))
       << "' in source file.";
 }
 
 %%
+
+// Reads and returns a single character from the scanner's input. Reports a
+// fatal syntax error when the input is exhausted.
+char ReadChar(yyscan_t yyscanner, const Carbon::ParseAndLexContext& context) {
+  const int c = yyinput(yyscanner);
+  // NOTE(review): some flex versions appear to report end of input from
+  // yyinput() as 0 rather than EOF -- confirm against the flex version in
+  // use. Both are treated as end of file so that an unterminated literal
+  // cannot keep the caller reading forever.
+  if (c == EOF || c == 0) {
+    FATAL_SYNTAX_ERROR(context) << "Unexpected end of file";
+  }
+  return static_cast<char>(c);
+}

+ 2 - 2
executable_semantics/syntax/parse_and_lex_context.h

@@ -23,12 +23,12 @@ class ParseAndLexContext {
   // Writes a syntax error diagnostic containing message to standard error.
   auto PrintDiagnostic(const std::string& message) -> void;
 
-  auto source_loc() -> SourceLocation {
+  auto source_loc() const -> SourceLocation {
     return SourceLocation(input_file_name_,
                           static_cast<int>(current_token_position.begin.line));
   }
 
-  auto trace() -> bool { return trace_; }
+  auto trace() const -> bool { return trace_; }
 
   // The source range of the token being (or just) lex'd.
   location current_token_position;

+ 24 - 0
executable_semantics/testdata/string/block.carbon

@@ -0,0 +1,24 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// RUN: %{executable_semantics} %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes=false %s
+// RUN: %{executable_semantics} --trace %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes %s
+// AUTOUPDATE: %{executable_semantics} %s
+// CHECK: result: 0
+
+package ExecutableSemanticsTest api;
+
+fn Main() -> i32 {
+  var s: String = """
+    A "block" ""string"" literal
+      with indent.
+    """;
+  if (s == "A \"block\" \"\"string\"\" literal\n  with indent.\n") {
+    return 0;
+  } else {
+    return 1;
+  }
+}

+ 24 - 0
executable_semantics/testdata/string/block_escaped_triple_quotes.carbon

@@ -0,0 +1,24 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// RUN: %{executable_semantics} %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes=false %s
+// RUN: %{executable_semantics} --trace %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes %s
+// AUTOUPDATE: %{executable_semantics} %s
+// CHECK: result: 0
+
+package ExecutableSemanticsTest api;
+
+fn Main() -> i32 {
+  var s: String = """
+    A block string literal
+    \"""
+    """;
+  if (s == "A block string literal\n\"\"\"\n") {
+    return 0;
+  } else {
+    return 1;
+  }
+}

+ 20 - 0
executable_semantics/testdata/string/fail_block_quotes_not_on_own_line.carbon

@@ -0,0 +1,20 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// RUN: %{not} %{executable_semantics} %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes=false %s
+// RUN: %{not} %{executable_semantics} --trace %s 2>&1 | \
+// RUN:   %{FileCheck} --match-full-lines --allow-unused-prefixes %s
+// AUTOUPDATE: %{executable_semantics} %s
+// CHECK: COMPILATION ERROR: {{.*}}/executable_semantics/testdata/string/fail_block_quotes_not_on_own_line.carbon:15: Invalid block string: Should end with triple quotes: error: closing """
+
+package ExecutableSemanticsTest api;
+
+fn Main() -> i32 {
+  var s: String = """
+    error: closing """ is not on its own line.
+  """;
+
+  return 0;
+}