string_literal_test.cpp 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/string_literal.h"
  5. #include <gmock/gmock.h>
  6. #include <gtest/gtest.h>
  7. #include <optional>
  8. #include <string>
  9. #include <utility>
  10. #include "common/check.h"
  11. #include "toolchain/diagnostics/emitter.h"
  12. #include "toolchain/lex/test_helpers.h"
  13. namespace Carbon::Lex {
  14. namespace {
  15. class StringLiteralTest : public ::testing::Test {
  16. public:
  17. StringLiteralTest() : error_tracker_(Diagnostics::ConsoleConsumer()) {}
  18. auto Lex(llvm::StringRef text) -> StringLiteral {
  19. std::optional<StringLiteral> result = StringLiteral::Lex(text);
  20. CARBON_CHECK(result);
  21. EXPECT_EQ(result->text(), text);
  22. return *result;
  23. }
  24. auto Parse(llvm::StringRef text) -> llvm::StringRef {
  25. StringLiteral token = Lex(text);
  26. Testing::SingleTokenDiagnosticEmitter emitter(&error_tracker_, text);
  27. return token.ComputeStringValue(allocator_, emitter);
  28. }
  29. llvm::BumpPtrAllocator allocator_;
  30. Diagnostics::ErrorTrackingConsumer error_tracker_;
  31. };
  // Checks where `StringLiteral::Lex` considers a literal to end. Every entry
  // of `valid` must lex to a literal whose text is the entire input; every
  // entry of `invalid` must still lex to a literal, but one that reports
  // `is_terminated()` as false.
  32. TEST_F(StringLiteralTest, StringLiteralBounds) {
  33. llvm::StringLiteral valid[] = {
  34. R"("")",
  35. R"('''
  36. ''')",
  37. R"('''
  38. "foo"
  39. ''')",
  40. // Lex """-delimited block string literals for error recovery.
  41. R"("""
  42. """)",
  43. R"("""
  44. "foo"
  45. """)",
  46. // Escaped terminators don't end the string.
  47. R"("\"")",
  48. R"("\\")",
  49. R"("\\\"")",
  50. R"('''
  51. \'''
  52. ''')",
  53. R"('''
  54. '\''
  55. ''')",
  56. R"('''
  57. ''\'
  58. ''')",
  59. R"('''
  60. ''\
  61. ''')",
  62. R"(#'''
  63. '''\#n
  64. '''#)",
  65. // Only a matching number of '#'s terminates the string.
  66. R"(#""#)",
  67. R"(#"xyz"foo"#)",
  68. R"(##"xyz"#foo"##)",
  69. R"(#"\""#)",
  70. // Escape sequences likewise require a matching number of '#'s.
  71. R"(#"\#"#"#)",
  72. R"(#"\"#)",
  73. R"(#'''
  74. \#'''#
  75. '''#)",
  76. // #"""# does not start a multiline string literal.
  77. R"(#"""#)",
  78. R"(##"""##)",
  79. };
  // Each valid input must lex, and the literal's text must equal the whole
  // input. SCOPED_TRACE attaches the input to any failure reported below.
  80. for (llvm::StringLiteral test : valid) {
  81. SCOPED_TRACE(test);
  82. std::optional<StringLiteral> result = StringLiteral::Lex(test);
  83. EXPECT_TRUE(result.has_value());
  // Guard the dereference: EXPECT_TRUE above does not stop the test.
  84. if (result) {
  85. EXPECT_EQ(result->text(), test);
  86. }
  87. }
  // Inputs that begin a string literal but never close it.
  88. llvm::StringLiteral invalid[] = {
  89. // clang-format off
  90. R"(")",
  91. R"("\)",
  92. R"("\")",
  93. R"("\\)",
  94. R"("\\\")",
  95. "'''\n",
  96. "'''\n'",
  97. "'''\n''",
  98. "#'''\n'''",
  99. R"(" \
  100. ")",
  101. // clang-format on
  102. };
  // An unterminated input is still expected to produce a literal; only its
  // `is_terminated()` flag distinguishes it from the valid cases.
  103. for (llvm::StringLiteral test : invalid) {
  104. SCOPED_TRACE(test);
  105. std::optional<StringLiteral> result = StringLiteral::Lex(test);
  106. EXPECT_TRUE(result.has_value());
  107. if (result) {
  108. EXPECT_FALSE(result->is_terminated());
  109. }
  110. }
  111. }
  // Checks the value computed for well-formed literals. Each case pairs a
  // source literal with the value it must evaluate to; evaluating it must not
  // report any error.
  112. TEST_F(StringLiteralTest, StringLiteralContents) {
  113. std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
  114. // Empty strings.
  115. {R"("")", ""},
  116. {R"(
  117. '''
  118. '''
  119. )",
  120. ""},
  121. // Nearly-empty strings.
  // NOTE(review): as written here this input is identical to the empty block
  // string case above, yet it expects "\n". A blank line inside the literal
  // may have been lost, and runs of spaces elsewhere in this table may have
  // been collapsed -- confirm against the canonical copy of this file.
  122. {R"(
  123. '''
  124. '''
  125. )",
  126. "\n"},
  127. // Lines containing only whitespace are treated as empty even if they
  128. // contain tabs.
  129. {"'''\n\t \t\n'''", "\n"},
  130. // Indent removal.
  131. {R"(
  132. '''file type indicator
  133. indented contents \
  134. '''
  135. )",
  136. " indented contents "},
  137. // Removal of tabs in indent and suffix.
  138. {"'''\n \t hello \t \n \t '''", " hello\n"},
  139. {R"(
  140. '''
  141. hello
  142. world
  143. end of test
  144. '''
  145. )",
  146. " hello\nworld\n\n end of test\n"},
  147. // Escape sequences.
  148. {R"(
  149. "\x14,\u{1234},\u{00000010},\n,\r,\t,\0,\",\',\\"
  150. )",
  151. llvm::StringLiteral::withInnerNUL(
  152. "\x14,\xE1\x88\xB4,\x10,\x0A,\x0D,\x09,\x00,\x22,\x27,\x5C")},
  153. {R"(
  154. "\0A\x1234"
  155. )",
  156. llvm::StringLiteral::withInnerNUL("\0A\x12"
  157. "34")},
  158. {R"(
  159. "\u{D7FF},\u{E000},\u{10FFFF}"
  160. )",
  161. "\xED\x9F\xBF,\xEE\x80\x80,\xF4\x8F\xBF\xBF"},
  162. // Escape sequences in 'raw' strings.
  163. {R"(
  164. #"\#x00,\#xFF,\#u{56789},\#u{ABCD},\#u{00000000000000000EF}"#
  165. )",
  166. llvm::StringLiteral::withInnerNUL(
  167. "\x00,\xFF,\xF1\x96\x9E\x89,\xEA\xAF\x8D,\xC3\xAF")},
  168. {R"(
  169. ##"\n,\#n,\##n,\##\##n,\##\###n"##
  170. )",
  171. "\\n,\\#n,\n,\\##n,\\###n"},
  172. // Trailing whitespace handling.
  173. {"'''\n Hello \\\n World \t \n Bye! \\\n '''",
  174. "Hello World\nBye! "},
  175. {"'''\n\\t\n'''", "\t\n"},
  176. {"'''\n\\t \n'''", "\t\n"},
  177. };
  178. for (auto [test, expected] : testcases) {
  // Reset the tracker before each case so the `seen_error()` check below is
  // made per case.
  179. error_tracker_.Reset();
  // The raw-string cases wrap the literal in surrounding newlines, so the
  // input is trimmed before parsing.
  180. auto value = Parse(test.trim());
  // On failure, stream the untrimmed input so the offending case is visible.
  181. EXPECT_FALSE(error_tracker_.seen_error()) << "`" << test << "`";
  182. EXPECT_EQ(value, expected);
  183. }
  184. }
  // Checks """-delimited block literals: each case must report an error, and
  // the value computed despite the error must equal the paired contents.
  185. TEST_F(StringLiteralTest, DoubleQuotedMultiLineLiteral) {
  186. // For error recovery, """-delimited literals are lexed, but rejected.
  187. std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
  188. {R"(
  189. """
  190. '''
  191. """
  192. )",
  193. "'''\n"},
  194. {R"(
  195. #"""
  196. \#tx
  197. """#
  198. )",
  199. "\tx\n"},
  200. {R"(
  201. """abcxyz
  202. hello\
  203. """
  204. )",
  205. "hello"},
  206. };
  207. for (auto [test, contents] : testcases) {
  // Reset the tracker before each case so the `seen_error()` check below is
  // made per case.
  208. error_tracker_.Reset();
  // The raw strings wrap the literal in surrounding newlines; trim them off.
  209. auto value = Parse(test.trim());
  // An error is required here, unlike in the well-formed contents test.
  210. EXPECT_TRUE(error_tracker_.seen_error()) << "`" << test << "`";
  211. EXPECT_EQ(value, contents);
  212. }
  213. }
  // Checks block literals whose indentation is inconsistent: each case must
  // report an error, and the value computed despite the error must equal the
  // paired contents.
  214. TEST_F(StringLiteralTest, StringLiteralBadIndent) {
  215. std::pair<llvm::StringLiteral, llvm::StringLiteral> testcases[] = {
  216. // Indent doesn't match the last line.
  217. {"'''\n \tx\n '''", "x\n"},
  218. {"'''\n x\n '''", "x\n"},
  219. {"'''\n x\n\t'''", "x\n"},
  220. {"'''\n ok\n bad\n '''", "ok\nbad\n"},
  221. {"'''\n bad\n ok\n '''", "bad\nok\n"},
  222. {"'''\n escaped,\\\n bad\n '''", "escaped,bad\n"},
  223. // Indent on last line is followed by text.
  // NOTE(review): the three inputs below are identical as written here, yet
  // the second expects a different value. These cases depend on the exact
  // number of spaces per line, so runs of spaces may have been collapsed --
  // confirm against the canonical copy of this file.
  224. {"'''\n x\n x'''", "x\nx"},
  225. {"'''\n x\n x'''", " x\nx"},
  226. {"'''\n x\n x'''", "x\nx"},
  227. };
  228. for (auto [test, contents] : testcases) {
  // Reset the tracker before each case so the `seen_error()` check below is
  // made per case.
  229. error_tracker_.Reset();
  // Inputs are passed through untrimmed here.
  230. auto value = Parse(test);
  231. EXPECT_TRUE(error_tracker_.seen_error()) << "`" << test << "`";
  232. EXPECT_EQ(value, contents);
  233. }
  234. }
  235. TEST_F(StringLiteralTest, StringLiteralBadEscapeSequence) {
  236. llvm::StringLiteral testcases[] = {
  237. R"("\a")",
  238. R"("\b")",
  239. R"("\e")",
  240. R"("\f")",
  241. R"("\v")",
  242. R"("\?")",
  243. R"("\1")",
  244. R"("\9")",
  245. // \0 can't be followed by a decimal digit.
  246. R"("\01")",
  247. R"("\09")",
  248. // \x requires two (uppercase) hexadecimal digits.
  249. R"("\x")",
  250. R"("\x0")",
  251. R"("\x0G")",
  252. R"("\xab")",
  253. R"("\x\n")",
  254. R"("\x\"")",
  255. // \u requires a braced list of one or more hexadecimal digits.
  256. R"("\u")",
  257. R"("\u?")",
  258. R"("\u\"")",
  259. R"("\u{")",
  260. R"("\u{}")",
  261. R"("\u{A")",
  262. R"("\u{G}")",
  263. R"("\u{0000012323127z}")",
  264. R"("\u{-3}")",
  265. // \u must specify a non-surrogate code point.
  266. R"("\u{110000}")",
  267. R"("\u{000000000000000000000000000000000110000}")",
  268. R"("\u{D800}")",
  269. R"("\u{DFFF}")",
  270. };
  271. for (llvm::StringLiteral test : testcases) {
  272. error_tracker_.Reset();
  273. Parse(test);
  274. EXPECT_TRUE(error_tracker_.seen_error()) << "`" << test << "`";
  275. // TODO: Test value produced by error recovery.
  276. }
  277. }
  278. TEST_F(StringLiteralTest, TabInString) {
  279. auto value = Parse("\"x\ty\"");
  280. EXPECT_TRUE(error_tracker_.seen_error());
  281. EXPECT_EQ(value, "x\ty");
  282. }
  283. TEST_F(StringLiteralTest, TabAtEndOfString) {
  284. auto value = Parse("\"\t\t\t\"");
  285. EXPECT_TRUE(error_tracker_.seen_error());
  286. EXPECT_EQ(value, "\t\t\t");
  287. }
  288. TEST_F(StringLiteralTest, TabInBlockString) {
  289. auto value = Parse("'''\nx\ty\n'''");
  290. EXPECT_TRUE(error_tracker_.seen_error());
  291. EXPECT_EQ(value, "x\ty\n");
  292. }
  293. TEST_F(StringLiteralTest, UnicodeTooManyDigits) {
  294. std::string text = "u{";
  295. text.append(10000, '9');
  296. text.append("}");
  297. auto value = Parse("\"\\" + text + "\"");
  298. EXPECT_TRUE(error_tracker_.seen_error());
  299. EXPECT_EQ(value, text);
  300. }
  301. } // namespace
  302. } // namespace Carbon::Lex