string_literal.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/string_literal.h"
  5. #include "common/check.h"
  6. #include "llvm/ADT/SmallString.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "llvm/Support/ConvertUTF.h"
  9. #include "llvm/Support/ErrorHandling.h"
  10. #include "llvm/Support/FormatVariadic.h"
  11. #include "toolchain/lexer/character_set.h"
  12. #include "toolchain/lexer/lex_helpers.h"
  13. namespace Carbon {
  // Diagnostic emitter type used throughout this file. Locations are passed as
  // `const char*`; the code in this file always hands it pointers taken from
  // the string literal's text (for example `content.begin()`).
  using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
  15. struct ContentBeforeStringTerminator
  16. : DiagnosticBase<ContentBeforeStringTerminator> {
  17. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  18. static constexpr llvm::StringLiteral Message =
  19. "Only whitespace is permitted before the closing `\"\"\"` of a "
  20. "multi-line string.";
  21. };
  22. struct UnicodeEscapeTooLarge : DiagnosticBase<UnicodeEscapeTooLarge> {
  23. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  24. static constexpr llvm::StringLiteral Message =
  25. "Code point specified by `\\u{...}` escape is greater than 0x10FFFF.";
  26. };
  27. struct UnicodeEscapeSurrogate : DiagnosticBase<UnicodeEscapeSurrogate> {
  28. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  29. static constexpr llvm::StringLiteral Message =
  30. "Code point specified by `\\u{...}` escape is a surrogate character.";
  31. };
  32. struct UnicodeEscapeMissingBracedDigits
  33. : DiagnosticBase<UnicodeEscapeMissingBracedDigits> {
  34. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  35. static constexpr llvm::StringLiteral Message =
  36. "Escape sequence `\\u` must be followed by a braced sequence of "
  37. "uppercase hexadecimal digits, for example `\\u{70AD}`.";
  38. };
  39. struct HexadecimalEscapeMissingDigits
  40. : DiagnosticBase<HexadecimalEscapeMissingDigits> {
  41. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  42. static constexpr llvm::StringLiteral Message =
  43. "Escape sequence `\\x` must be followed by two "
  44. "uppercase hexadecimal digits, for example `\\x0F`.";
  45. };
  46. struct DecimalEscapeSequence : DiagnosticBase<DecimalEscapeSequence> {
  47. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  48. static constexpr llvm::StringLiteral Message =
  49. "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead of "
  50. "`\\0` if the next character is a digit.";
  51. };
  52. struct UnknownEscapeSequence : DiagnosticBase<UnknownEscapeSequence> {
  53. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  54. static constexpr const char* Message = "Unrecognized escape sequence `{0}`.";
  55. auto Format() -> std::string { return llvm::formatv(Message, first).str(); }
  56. char first;
  57. };
  58. struct MismatchedIndentInString : DiagnosticBase<MismatchedIndentInString> {
  59. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  60. static constexpr llvm::StringLiteral Message =
  61. "Indentation does not match that of the closing \"\"\" in multi-line "
  62. "string literal.";
  63. };
  64. struct InvalidHorizontalWhitespaceInString
  65. : DiagnosticBase<InvalidHorizontalWhitespaceInString> {
  66. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  67. static constexpr llvm::StringLiteral Message =
  68. "Whitespace other than plain space must be expressed with an escape "
  69. "sequence in a string literal.";
  70. };
  // The three double quotes that open and close a multi-line string literal.
  static constexpr char MultiLineIndicator[] = "\"\"\"";
  72. // Return the number of opening characters of a multi-line string literal,
  73. // after any '#'s, including the file type indicator and following newline.
  74. static auto GetMultiLineStringLiteralPrefixSize(llvm::StringRef source_text)
  75. -> int {
  76. if (!source_text.startswith(MultiLineIndicator)) {
  77. return 0;
  78. }
  79. // The rest of the line must be a valid file type indicator: a sequence of
  80. // characters containing neither '#' nor '"' followed by a newline.
  81. auto prefix_end =
  82. source_text.find_first_of("#\n\"", strlen(MultiLineIndicator));
  83. if (prefix_end == llvm::StringRef::npos || source_text[prefix_end] != '\n') {
  84. return 0;
  85. }
  86. // Include the newline on return.
  87. return prefix_end + 1;
  88. }
  89. auto LexedStringLiteral::Lex(llvm::StringRef source_text)
  90. -> llvm::Optional<LexedStringLiteral> {
  91. int64_t cursor = 0;
  92. const int64_t source_text_size = source_text.size();
  93. // Determine the number of hashes prefixing.
  94. while (cursor < source_text_size && source_text[cursor] == '#') {
  95. ++cursor;
  96. }
  97. const int hash_level = cursor;
  98. llvm::SmallString<16> terminator("\"");
  99. llvm::SmallString<16> escape("\\");
  100. const int multi_line_prefix_size =
  101. GetMultiLineStringLiteralPrefixSize(source_text.substr(hash_level));
  102. const bool multi_line = multi_line_prefix_size > 0;
  103. if (multi_line) {
  104. cursor += multi_line_prefix_size;
  105. terminator = MultiLineIndicator;
  106. } else if (cursor < source_text_size && source_text[cursor] == '"') {
  107. ++cursor;
  108. } else {
  109. return llvm::None;
  110. }
  111. const int prefix_len = cursor;
  112. // The terminator and escape sequence marker require a number of '#'s
  113. // matching the leading sequence of '#'s.
  114. terminator.resize(terminator.size() + hash_level, '#');
  115. escape.resize(escape.size() + hash_level, '#');
  116. // TODO: Detect indent / dedent for multi-line string literals in order to
  117. // stop parsing on dedent before a terminator is found.
  118. for (; cursor < source_text_size; ++cursor) {
  119. // This switch and loop structure relies on multi-character terminators and
  120. // escape sequences starting with a predictable character and not containing
  121. // embedded and unescaped terminators or newlines.
  122. switch (source_text[cursor]) {
  123. case '\\':
  124. if (escape.size() == 1 ||
  125. source_text.substr(cursor).startswith(escape)) {
  126. cursor += escape.size();
  127. // If there's either not a character following the escape, or it's a
  128. // single-line string and the escaped character is a newline, we
  129. // should stop here.
  130. if (cursor >= source_text_size ||
  131. (!multi_line && source_text[cursor] == '\n')) {
  132. llvm::StringRef text = source_text.take_front(cursor);
  133. return LexedStringLiteral(text, text.drop_front(prefix_len),
  134. hash_level, multi_line,
  135. /*is_terminated=*/false);
  136. }
  137. }
  138. break;
  139. case '\n':
  140. if (!multi_line) {
  141. llvm::StringRef text = source_text.take_front(cursor);
  142. return LexedStringLiteral(text, text.drop_front(prefix_len),
  143. hash_level, multi_line,
  144. /*is_terminated=*/false);
  145. }
  146. break;
  147. case '\"': {
  148. if (terminator.size() == 1 ||
  149. source_text.substr(cursor).startswith(terminator)) {
  150. llvm::StringRef text =
  151. source_text.substr(0, cursor + terminator.size());
  152. llvm::StringRef content =
  153. source_text.substr(prefix_len, cursor - prefix_len);
  154. return LexedStringLiteral(text, content, hash_level, multi_line,
  155. /*is_terminated=*/true);
  156. }
  157. break;
  158. }
  159. }
  160. }
  161. // No terminator was found.
  162. return LexedStringLiteral(source_text, source_text.drop_front(prefix_len),
  163. hash_level, multi_line,
  164. /*is_terminated=*/false);
  165. }
  166. // Given a string that contains at least one newline, find the indent (the
  167. // leading sequence of horizontal whitespace) of its final line.
  168. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  169. int indent_end = text.size();
  170. for (int i = indent_end - 1; i >= 0; --i) {
  171. if (text[i] == '\n') {
  172. int indent_start = i + 1;
  173. return text.substr(indent_start, indent_end - indent_start);
  174. }
  175. if (!IsSpace(text[i])) {
  176. indent_end = i;
  177. }
  178. }
  179. llvm_unreachable("Given text is required to contain a newline.");
  180. }
  181. // Check the literal is indented properly, if it's a multi-line litera.
  182. // Find the leading whitespace that should be removed from each line of a
  183. // multi-line string literal.
  184. static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
  185. llvm::StringRef content) -> llvm::StringRef {
  186. // Find the leading horizontal whitespace on the final line of this literal.
  187. // Note that for an empty literal, this might not be inside the content.
  188. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  189. // The last line is not permitted to contain any content after its
  190. // indentation.
  191. if (indent.end() != content.end()) {
  192. emitter.EmitError<ContentBeforeStringTerminator>(indent.end());
  193. }
  194. return indent;
  195. }
  196. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  197. static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
  198. llvm::StringRef digits,
  199. std::string& result) -> bool {
  200. unsigned code_point;
  201. if (!CanLexInteger(emitter, digits)) {
  202. return false;
  203. }
  204. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  205. emitter.EmitError<UnicodeEscapeTooLarge>(digits.begin());
  206. return false;
  207. }
  208. if (code_point >= 0xD800 && code_point < 0xE000) {
  209. emitter.EmitError<UnicodeEscapeSurrogate>(digits.begin());
  210. return false;
  211. }
  212. // Convert the code point to a sequence of UTF-8 code units.
  213. // Every code point fits in 6 UTF-8 code units.
  214. const llvm::UTF32 utf32_code_units[1] = {code_point};
  215. llvm::UTF8 utf8_code_units[6];
  216. const llvm::UTF32* src_pos = utf32_code_units;
  217. llvm::UTF8* dest_pos = utf8_code_units;
  218. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  219. &src_pos, src_pos + 1, &dest_pos, dest_pos + 6, llvm::strictConversion);
  220. if (conv_result != llvm::conversionOK) {
  221. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  222. }
  223. result.insert(result.end(), reinterpret_cast<char*>(utf8_code_units),
  224. reinterpret_cast<char*>(dest_pos));
  225. return true;
  226. }
  227. // Expand an escape sequence, appending the expanded value to the given
  228. // `result` string. `content` is the string content, starting from the first
  229. // character after the escape sequence introducer (for example, the `n` in
  230. // `\n`), and will be updated to remove the leading escape sequence.
  231. static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
  232. llvm::StringRef& content,
  233. std::string& result) -> void {
  234. CHECK(!content.empty()) << "should have escaped closing delimiter";
  235. char first = content.front();
  236. content = content.drop_front(1);
  237. switch (first) {
  238. case 't':
  239. result += '\t';
  240. return;
  241. case 'n':
  242. result += '\n';
  243. return;
  244. case 'r':
  245. result += '\r';
  246. return;
  247. case '"':
  248. result += '"';
  249. return;
  250. case '\'':
  251. result += '\'';
  252. return;
  253. case '\\':
  254. result += '\\';
  255. return;
  256. case '0':
  257. result += '\0';
  258. if (!content.empty() && IsDecimalDigit(content.front())) {
  259. emitter.EmitError<DecimalEscapeSequence>(content.begin());
  260. return;
  261. }
  262. return;
  263. case 'x':
  264. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  265. IsUpperHexDigit(content[1])) {
  266. result +=
  267. static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
  268. content = content.drop_front(2);
  269. return;
  270. }
  271. emitter.EmitError<HexadecimalEscapeMissingDigits>(content.begin());
  272. break;
  273. case 'u': {
  274. llvm::StringRef remaining = content;
  275. if (remaining.consume_front("{")) {
  276. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  277. remaining = remaining.drop_front(digits.size());
  278. if (!digits.empty() && remaining.consume_front("}")) {
  279. if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
  280. break;
  281. }
  282. content = remaining;
  283. return;
  284. }
  285. }
  286. emitter.EmitError<UnicodeEscapeMissingBracedDigits>(content.begin());
  287. break;
  288. }
  289. default:
  290. emitter.EmitError<UnknownEscapeSequence>(content.begin() - 1,
  291. {.first = first});
  292. break;
  293. }
  294. // If we get here, we didn't recognize this escape sequence and have already
  295. // issued a diagnostic. For error recovery purposes, expand this escape
  296. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  297. result += first;
  298. }
  299. // Expand any escape sequences in the given string literal.
  300. static auto ExpandEscapeSequencesAndRemoveIndent(
  301. LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
  302. llvm::StringRef indent) -> std::string {
  303. std::string result;
  304. result.reserve(contents.size());
  305. llvm::SmallString<16> escape("\\");
  306. escape.resize(1 + hash_level, '#');
  307. // Process each line of the string literal.
  308. while (true) {
  309. // Every non-empty line (that contains anything other than horizontal
  310. // whitespace) is required to start with the string's indent. For error
  311. // recovery, remove all leading whitespace if the indent doesn't match.
  312. if (!contents.consume_front(indent)) {
  313. const char* line_start = contents.begin();
  314. contents = contents.drop_while(IsHorizontalWhitespace);
  315. if (!contents.startswith("\n")) {
  316. emitter.EmitError<MismatchedIndentInString>(line_start);
  317. }
  318. }
  319. // Process the contents of the line.
  320. while (true) {
  321. auto end_of_regular_text = contents.find_if([](char c) {
  322. return c == '\n' || c == '\\' ||
  323. (IsHorizontalWhitespace(c) && c != ' ');
  324. });
  325. result += contents.substr(0, end_of_regular_text);
  326. contents = contents.substr(end_of_regular_text);
  327. if (contents.empty()) {
  328. return result;
  329. }
  330. if (contents.consume_front("\n")) {
  331. // Trailing whitespace before a newline doesn't contribute to the string
  332. // literal value.
  333. while (!result.empty() && result.back() != '\n' &&
  334. IsSpace(result.back())) {
  335. result.pop_back();
  336. }
  337. result += '\n';
  338. // Move onto to the next line.
  339. break;
  340. }
  341. if (IsHorizontalWhitespace(contents.front())) {
  342. // Horizontal whitespace other than ` ` is valid only at the end of a
  343. // line.
  344. CHECK(contents.front() != ' ')
  345. << "should not have stopped at a plain space";
  346. auto after_space = contents.find_if_not(IsHorizontalWhitespace);
  347. if (after_space == llvm::StringRef::npos ||
  348. contents[after_space] != '\n') {
  349. // TODO: Include the source range of the whitespace up to
  350. // `contents.begin() + after_space` in the diagnostic.
  351. emitter.EmitError<InvalidHorizontalWhitespaceInString>(
  352. contents.begin());
  353. // Include the whitespace in the string contents for error recovery.
  354. result += contents.substr(0, after_space);
  355. }
  356. contents = contents.substr(after_space);
  357. continue;
  358. }
  359. if (!contents.consume_front(escape)) {
  360. // This is not an escape sequence, just a raw `\`.
  361. result += contents.front();
  362. contents = contents.drop_front(1);
  363. continue;
  364. }
  365. if (contents.consume_front("\n")) {
  366. // An escaped newline ends the line without producing any content and
  367. // without trimming trailing whitespace.
  368. break;
  369. }
  370. // Handle this escape sequence.
  371. ExpandAndConsumeEscapeSequence(emitter, contents, result);
  372. }
  373. }
  374. }
  375. auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
  376. -> std::string {
  377. if (!is_terminated_) {
  378. return "";
  379. }
  380. llvm::StringRef indent =
  381. multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
  382. return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,
  383. indent);
  384. }
  385. } // namespace Carbon