// tomteb/carbon-lang

// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "toolchain/lexer/string_literal.h"

#include <cstdio>
#include <cstdlib>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "toolchain/diagnostics/diagnostic_emitter.h"
#include "toolchain/lexer/test_helpers.h"

namespace Carbon {
namespace {

// Test fixture with helpers to lex and evaluate a single string literal while
// tracking whether any error diagnostic was reported along the way.
struct StringLiteralTest : ::testing::Test {
  StringLiteralTest() : error_tracker(ConsoleDiagnosticConsumer()) {}

  // Diagnostic consumer handed to the emitter in `Parse`. Tests query it with
  // `SeenError()` and clear it between cases with `Reset()`.
  ErrorTrackingDiagnosticConsumer error_tracker;

  // Lexes `text` as a string literal and returns the resulting token. Also
  // expects the token to span all of `text`.
  //
  // `text` must be a lexable string literal. If it is not, the process is
  // aborted after printing the offending input. This is checked explicitly
  // rather than with `assert` so that builds defining `NDEBUG` do not go on to
  // dereference an empty optional.
  auto Lex(llvm::StringRef text) -> LexedStringLiteral {
    llvm::Optional<LexedStringLiteral> result = LexedStringLiteral::Lex(text);
    if (!result) {
      std::fprintf(stderr, "Failed to lex `%.*s` as a string literal\n",
                   static_cast<int>(text.size()), text.data());
      std::abort();
    }
    EXPECT_EQ(result->Text(), text);
    return *result;
  }

  // Lexes `text` via `Lex` and returns the computed value of the literal.
  // Diagnostics produced while computing the value are emitted to
  // `error_tracker`.
  auto Parse(llvm::StringRef text) -> std::string {
    LexedStringLiteral token = Lex(text);
    Testing::SingleTokenDiagnosticTranslator translator(text);
    DiagnosticEmitter<const char*> emitter(translator, error_tracker);
    return token.ComputeValue(emitter);
  }
};

// Checks where the lexer decides a string literal begins and ends: every
// accepted input must lex to a token spanning the whole input, and every
// rejected input must fail to lex at all.
TEST_F(StringLiteralTest, StringLiteralBounds) {
  llvm::StringLiteral accepted[] = {
      R"("")",
      R"("""
      """)",
      R"("""
      "foo"
      """)",

      // An escaped terminator does not close the string.
      R"("\"")",
      R"("\\")",
      R"("\\\"")",
      R"("""
      \"""
      """)",
      R"("""
      "\""
      """)",
      R"("""
      ""\"
      """)",
      R"("""
      ""\
      """)",
      R"(#"""
      """\#n
      """#)",

      // The closing delimiter needs the same number of '#'s as the opener.
      R"(#""#)",
      R"(#"xyz"foo"#)",
      R"(##"xyz"#foo"##)",
      R"(#"\""#)",

      // An escape sequence needs that same number of '#'s as well.
      R"(#"\#"#"#)",
      R"(#"\"#)",
      R"(#"""
      \#"""#
      """#)",

      // #"""# is not the start of a multiline string literal.
      R"(#"""#)",
      R"(##"""##)",
  };

  for (const llvm::StringLiteral& literal : accepted) {
    llvm::Optional<LexedStringLiteral> lexed = LexedStringLiteral::Lex(literal);
    EXPECT_TRUE(lexed.hasValue()) << literal;
    if (!lexed) {
      // Nothing further to check when lexing failed.
      continue;
    }
    EXPECT_EQ(lexed->Text(), literal);
  }

  llvm::StringLiteral rejected[] = {
      R"(")",
      R"("""
      "")",
      R"("\)",  //
      R"("\")",
      R"("\\)",  //
      R"("\\\")",
      R"("""
      )",
      R"(#"""
      """)",
      R"(" \
      ")",
  };

  for (const llvm::StringLiteral& literal : rejected) {
    EXPECT_FALSE(LexedStringLiteral::Lex(literal).hasValue())
        << "`" << literal << "`";
  }
}

// Checks the value computed for well-formed string literals. Each input is
// trimmed of surrounding whitespace before parsing and must produce no error
// diagnostic.
TEST_F(StringLiteralTest, StringLiteralContents) {
  // Expected values that contain embedded nul characters are built with
  // `llvm::StringLiteral::withInnerNUL` below.
  std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
      // Empty strings.
      {R"("")", ""},

      {R"(
"""
"""
       )",
       ""},

      // Nearly-empty strings.
      {R"(
"""

"""
       )",
       "\n"},

      // Lines containing only whitespace are treated as empty even if they
      // contain tabs.
      {"\"\"\"\n\t  \t\n\"\"\"", "\n"},

      // Indent removal.
      {R"(
       """file type indicator
          indented contents \
         """
       )",
       " indented contents "},

      // Removal of tabs in indent and suffix.
      {"\"\"\"\n \t  hello \t \n \t \"\"\"", " hello\n"},

      {R"(
    """
   hello
  world

   end of test
  """
       )",
       " hello\nworld\n\n end of test\n"},

      // Escape sequences.
      {R"(
       "\x14,\u{1234},\u{00000010},\n,\r,\t,\0,\",\',\\"
       )",
       llvm::StringLiteral::withInnerNUL(
           "\x14,\xE1\x88\xB4,\x10,\x0A,\x0D,\x09,\x00,\x22,\x27,\x5C")},

      {R"(
       "\0A\x1234"
       )",
       llvm::StringLiteral::withInnerNUL("\0A\x12"
                                         "34")},

      {R"(
       "\u{D7FF},\u{E000},\u{10FFFF}"
       )",
       "\xED\x9F\xBF,\xEE\x80\x80,\xF4\x8F\xBF\xBF"},

      // Escape sequences in 'raw' strings.
      {R"(
       #"\#x00,\#xFF,\#u{56789},\#u{ABCD},\#u{00000000000000000EF}"#
       )",
       llvm::StringLiteral::withInnerNUL(
           "\x00,\xFF,\xF1\x96\x9E\x89,\xEA\xAF\x8D,\xC3\xAF")},

      {R"(
       ##"\n,\#n,\##n,\##\##n,\##\###n"##
       )",
       "\\n,\\#n,\n,\\##n,\\###n"},

      // Trailing whitespace handling.
      {"\"\"\"\n  Hello \\\n  World \t \n  Bye!  \\\n  \"\"\"",
       "Hello World\nBye!  "},
  };

  for (auto [test, contents] : testcases) {
    error_tracker.Reset();
    auto value = Parse(test.trim());
    EXPECT_FALSE(error_tracker.SeenError()) << "`" << test << "`";
    // Include the input so a value mismatch identifies the failing case.
    EXPECT_EQ(value, contents) << "`" << test << "`";
  }
}

// Checks block string literals whose indentation is malformed: each must
// report an error and still yield the listed value.
TEST_F(StringLiteralTest, StringLiteralBadIndent) {
  using TestCase = std::pair<llvm::StringLiteral, llvm::StringLiteral>;

  const TestCase cases[] = {
      // A content line's indent differs from that of the final line.
      {"\"\"\"\n \tx\n  \"\"\"", "x\n"},
      {"\"\"\"\n x\n  \"\"\"", "x\n"},
      {"\"\"\"\n  x\n\t\"\"\"", "x\n"},
      {"\"\"\"\n  ok\n bad\n  \"\"\"", "ok\nbad\n"},
      {"\"\"\"\n bad\n  ok\n  \"\"\"", "bad\nok\n"},
      {"\"\"\"\n  escaped,\\\n bad\n  \"\"\"", "escaped,bad\n"},

      // The final line has text between its indent and the terminator.
      {"\"\"\"\n  x\n  x\"\"\"", "x\nx"},
      {"\"\"\"\n   x\n  x\"\"\"", " x\nx"},
      {"\"\"\"\n x\n  x\"\"\"", "x\nx"},
  };

  for (const auto& [literal, expected] : cases) {
    error_tracker.Reset();
    const std::string parsed = Parse(literal);
    EXPECT_TRUE(error_tracker.SeenError()) << "`" << literal << "`";
    EXPECT_EQ(parsed, expected);
  }
}

// Checks that each malformed escape sequence below reports an error when the
// literal's value is computed.
TEST_F(StringLiteralTest, StringLiteralBadEscapeSequence) {
  const llvm::StringLiteral bad_escapes[] = {
      R"("\a")",
      R"("\b")",
      R"("\e")",
      R"("\f")",
      R"("\v")",
      R"("\?")",
      R"("\1")",
      R"("\9")",

      // A decimal digit may not follow \0.
      R"("\01")",
      R"("\09")",

      // \x must be followed by exactly two uppercase hexadecimal digits.
      R"("\x")",
      R"("\x0")",
      R"("\x0G")",
      R"("\xab")",
      R"("\x\n")",
      R"("\x\"")",

      // \u must be followed by a braced, non-empty run of hexadecimal digits.
      R"("\u")",
      R"("\u?")",
      R"("\u\"")",
      R"("\u{")",
      R"("\u{}")",
      R"("\u{A")",
      R"("\u{G}")",
      R"("\u{0000012323127z}")",
      R"("\u{-3}")",

      // \u must name an in-range, non-surrogate code point.
      R"("\u{110000}")",
      R"("\u{000000000000000000000000000000000110000}")",
      R"("\u{D800}")",
      R"("\u{DFFF}")",
  };

  for (const llvm::StringLiteral& literal : bad_escapes) {
    error_tracker.Reset();
    // Only the diagnostic is checked here; the recovered value is discarded.
    // TODO: Test value produced by error recovery.
    Parse(literal);
    EXPECT_TRUE(error_tracker.SeenError()) << "`" << literal << "`";
  }
}

// A literal tab inside a simple string reports an error, and the tab is still
// present in the computed value.
TEST_F(StringLiteralTest, TabInString) {
  const std::string parsed = Parse("\"x\ty\"");
  EXPECT_TRUE(error_tracker.SeenError());
  EXPECT_EQ(parsed, "x\ty");
}

// A simple string made up only of literal tabs reports an error, and all of
// the tabs are still present in the computed value.
TEST_F(StringLiteralTest, TabAtEndOfString) {
  const std::string parsed = Parse("\"\t\t\t\"");
  EXPECT_TRUE(error_tracker.SeenError());
  EXPECT_EQ(parsed, "\t\t\t");
}

// A literal tab in the middle of a block string line reports an error, and the
// tab is still present in the computed value.
TEST_F(StringLiteralTest, TabInBlockString) {
  const std::string parsed = Parse("\"\"\"\nx\ty\n\"\"\"");
  EXPECT_TRUE(error_tracker.SeenError());
  EXPECT_EQ(parsed, "x\ty\n");
}

}  // namespace
}  // namespace Carbon