string_literal.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "lexer/string_literal.h"
  5. #include "llvm/ADT/SmallString.h"
  6. #include "llvm/ADT/StringExtras.h"
  7. #include "llvm/Support/ConvertUTF.h"
  8. #include "llvm/Support/ErrorHandling.h"
  9. #include "llvm/Support/FormatVariadic.h"
  10. namespace Carbon {
  11. struct ContentBeforeStringTerminator
  12. : SimpleDiagnostic<ContentBeforeStringTerminator> {
  13. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  14. static constexpr llvm::StringLiteral Message =
  15. "Only whitespace is permitted before the closing `\"\"\"` of a "
  16. "multi-line string.";
  17. };
  18. struct UnicodeEscapeTooLarge : SimpleDiagnostic<UnicodeEscapeTooLarge> {
  19. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  20. static constexpr llvm::StringLiteral Message =
  21. "Code point specified by `\\u{...}` escape is greater than 0x10FFFF.";
  22. };
  23. struct UnicodeEscapeSurrogate : SimpleDiagnostic<UnicodeEscapeSurrogate> {
  24. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  25. static constexpr llvm::StringLiteral Message =
  26. "Code point specified by `\\u{...}` escape is a surrogate character.";
  27. };
  28. struct UnicodeEscapeMissingBracedDigits
  29. : SimpleDiagnostic<UnicodeEscapeMissingBracedDigits> {
  30. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  31. static constexpr llvm::StringLiteral Message =
  32. "Escape sequence `\\u` must be followed by a braced sequence of "
  33. "uppercase hexadecimal digits, for example `\\u{70AD}`.";
  34. };
  35. struct HexadecimalEscapeMissingDigits
  36. : SimpleDiagnostic<HexadecimalEscapeMissingDigits> {
  37. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  38. static constexpr llvm::StringLiteral Message =
  39. "Escape sequence `\\x` must be followed by two "
  40. "uppercase hexadecimal digits, for example `\\x0F`.";
  41. };
  42. struct DecimalEscapeSequence : SimpleDiagnostic<DecimalEscapeSequence> {
  43. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  44. static constexpr llvm::StringLiteral Message =
  45. "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead of "
  46. "`\\0` if the next character is a digit.";
  47. };
  48. struct UnknownEscapeSequence {
  49. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  50. static constexpr const char* Message = "Unrecognized escape sequence `{0}`.";
  51. char first;
  52. auto Format() -> std::string { return llvm::formatv(Message, first).str(); }
  53. };
  54. struct MismatchedIndentInString : SimpleDiagnostic<MismatchedIndentInString> {
  55. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
  56. static constexpr llvm::StringLiteral Message =
  57. "Indentation does not match that of the closing \"\"\" in multi-line "
  58. "string literal.";
  59. };
  60. // TODO(zygoloid): Update this to match whatever we decide qualifies as
  61. // acceptable whitespace.
  62. static bool isSpace(char c) { return c == ' ' || c == '\n' || c == '\t'; }
  63. static constexpr llvm::StringLiteral HorizontalWhitespace = " \t";
  64. static bool isUpperHexDigit(char c) {
  65. return ('0' <= c && c <= '9') || ('A' <= c && c <= 'F');
  66. }
  67. // Find and return the opening characters of a multi-line string literal,
  68. // after any '#'s, including the file type indicator and following newline.
  69. static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
  70. -> llvm::StringRef {
  71. llvm::StringRef remaining = source_text;
  72. if (!remaining.consume_front("\"\"\"")) {
  73. return llvm::StringRef();
  74. }
  75. // The rest of the line must be a valid file type indicator: a sequence of
  76. // characters containing neither '#' nor '"' followed by a newline.
  77. remaining = remaining.drop_until(
  78. [](char c) { return c == '"' || c == '#' || c == '\n'; });
  79. if (!remaining.consume_front("\n")) {
  80. return llvm::StringRef();
  81. }
  82. return source_text.take_front(remaining.begin() - source_text.begin());
  83. }
  84. // If source_text begins with a string literal token, extract and return
  85. // information on that token.
  86. auto StringLiteralToken::Lex(llvm::StringRef source_text)
  87. -> llvm::Optional<StringLiteralToken> {
  88. const char* begin = source_text.begin();
  89. int hash_level = 0;
  90. while (source_text.consume_front("#")) {
  91. ++hash_level;
  92. }
  93. llvm::SmallString<16> terminator("\"");
  94. llvm::SmallString<16> escape("\\");
  95. llvm::StringRef multi_line_prefix =
  96. TakeMultiLineStringLiteralPrefix(source_text);
  97. bool multi_line = !multi_line_prefix.empty();
  98. if (multi_line) {
  99. source_text = source_text.drop_front(multi_line_prefix.size());
  100. terminator = "\"\"\"";
  101. } else if (!source_text.consume_front("\"")) {
  102. return llvm::None;
  103. }
  104. // The terminator and escape sequence marker require a number of '#'s
  105. // matching the leading sequence of '#'s.
  106. terminator.resize(terminator.size() + hash_level, '#');
  107. escape.resize(escape.size() + hash_level, '#');
  108. const char* content_begin = source_text.begin();
  109. const char* content_end = content_begin;
  110. while (!source_text.consume_front(terminator)) {
  111. // Let LexError figure out how to recover from an unterminated string
  112. // literal.
  113. if (source_text.empty()) {
  114. return llvm::None;
  115. }
  116. if (!multi_line && source_text.startswith("\n")) {
  117. return llvm::None;
  118. }
  119. // Consume an escape sequence marker if present.
  120. (void)source_text.consume_front(escape);
  121. // Then consume one more character, either of the content or of an
  122. // escape sequence. This relies on multi-character escape sequences
  123. // not containing an embedded and unescaped terminator or newline.
  124. source_text = source_text.substr(1);
  125. content_end = source_text.begin();
  126. }
  127. return StringLiteralToken(
  128. llvm::StringRef(begin, source_text.begin() - begin),
  129. llvm::StringRef(content_begin, content_end - content_begin), hash_level,
  130. multi_line);
  131. }
  132. // Given a string that contains at least one newline, find the indent (the
  133. // leading sequence of horizontal whitespace) of its final line.
  134. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  135. int indent_end = text.size();
  136. for (int i = indent_end - 1; i >= 0; --i) {
  137. if (text[i] == '\n') {
  138. int indent_start = i + 1;
  139. return text.substr(indent_start, indent_end - indent_start);
  140. }
  141. if (!isSpace(text[i])) {
  142. indent_end = i;
  143. }
  144. }
  145. llvm_unreachable("Given text is required to contain a newline.");
  146. }
  147. namespace {
  148. // The leading whitespace in a multi-line string literal.
  149. struct Indent {
  150. llvm::StringRef indent;
  151. bool has_errors;
  152. };
  153. } // namespace
  154. // Check the literal is indented properly, if it's a multi-line litera.
  155. // Find the leading whitespace that should be removed from each line of a
  156. // multi-line string literal.
  157. static auto CheckIndent(DiagnosticEmitter& emitter, llvm::StringRef text,
  158. llvm::StringRef content) -> Indent {
  159. // Find the leading horizontal whitespace on the final line of this literal.
  160. // Note that for an empty literal, this might not be inside the content.
  161. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  162. bool has_errors = false;
  163. // The last line is not permitted to contain any content after its
  164. // indentation.
  165. if (indent.end() != content.end()) {
  166. emitter.EmitError<ContentBeforeStringTerminator>();
  167. has_errors = true;
  168. }
  169. return {.indent = indent, .has_errors = has_errors};
  170. }
  171. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  172. static auto ExpandUnicodeEscapeSequence(DiagnosticEmitter& emitter,
  173. llvm::StringRef digits,
  174. std::string& result) -> bool {
  175. unsigned code_point;
  176. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  177. emitter.EmitError<UnicodeEscapeTooLarge>();
  178. return false;
  179. }
  180. if (code_point >= 0xD800 && code_point < 0xE000) {
  181. emitter.EmitError<UnicodeEscapeSurrogate>();
  182. return false;
  183. }
  184. // Convert the code point to a sequence of UTF-8 code units.
  185. // Every code point fits in 6 UTF-8 code units.
  186. const llvm::UTF32 utf32_code_units[1] = {code_point};
  187. llvm::UTF8 utf8_code_units[6];
  188. const llvm::UTF32* src_pos = utf32_code_units;
  189. llvm::UTF8* dest_pos = utf8_code_units;
  190. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  191. &src_pos, src_pos + 1, &dest_pos, dest_pos + 6, llvm::strictConversion);
  192. if (conv_result != llvm::conversionOK) {
  193. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  194. }
  195. result.insert(result.end(), reinterpret_cast<char*>(utf8_code_units),
  196. reinterpret_cast<char*>(dest_pos));
  197. return true;
  198. }
  199. // Expand an escape sequence, appending the expanded value to the given
  200. // `result` string. `content` is the string content, starting from the first
  201. // character after the escape sequence introducer (for example, the `n` in
  202. // `\n`), and will be updated to remove the leading escape sequence.
  203. static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
  204. llvm::StringRef& content,
  205. std::string& result) -> bool {
  206. assert(!content.empty() && "should have escaped closing delimiter");
  207. char first = content.front();
  208. content = content.drop_front(1);
  209. switch (first) {
  210. case 't':
  211. result += '\t';
  212. return true;
  213. case 'n':
  214. result += '\n';
  215. return true;
  216. case 'r':
  217. result += '\r';
  218. return true;
  219. case '"':
  220. result += '"';
  221. return true;
  222. case '\'':
  223. result += '\'';
  224. return true;
  225. case '\\':
  226. result += '\\';
  227. return true;
  228. case '0':
  229. result += '\0';
  230. if (!content.empty() && llvm::isDigit(content.front())) {
  231. emitter.EmitError<DecimalEscapeSequence>();
  232. return false;
  233. }
  234. return true;
  235. case 'x':
  236. if (content.size() >= 2 && isUpperHexDigit(content[0]) &&
  237. isUpperHexDigit(content[1])) {
  238. result +=
  239. static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
  240. content = content.drop_front(2);
  241. return true;
  242. }
  243. emitter.EmitError<HexadecimalEscapeMissingDigits>();
  244. break;
  245. case 'u': {
  246. llvm::StringRef remaining = content;
  247. if (remaining.consume_front("{")) {
  248. llvm::StringRef digits = remaining.take_while(isUpperHexDigit);
  249. remaining = remaining.drop_front(digits.size());
  250. if (!digits.empty() && remaining.consume_front("}")) {
  251. if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
  252. break;
  253. }
  254. content = remaining;
  255. return true;
  256. }
  257. }
  258. emitter.EmitError<UnicodeEscapeMissingBracedDigits>();
  259. break;
  260. }
  261. default:
  262. emitter.EmitError<UnknownEscapeSequence>({.first = first});
  263. break;
  264. }
  265. // If we get here, we didn't recognize this escape sequence and have already
  266. // issued a diagnostic. For error recovery purposes, expand this escape
  267. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  268. result += first;
  269. return false;
  270. }
  271. // Expand any escape sequences in the given string literal.
  272. static auto ExpandEscapeSequencesAndRemoveIndent(DiagnosticEmitter& emitter,
  273. llvm::StringRef contents,
  274. int hash_level,
  275. llvm::StringRef indent)
  276. -> StringLiteralToken::ExpandedValue {
  277. std::string result;
  278. result.reserve(contents.size());
  279. bool has_errors = false;
  280. llvm::SmallString<16> escape("\\");
  281. escape.resize(1 + hash_level, '#');
  282. // Process each line of the string literal.
  283. while (true) {
  284. // Every non-empty line (that contains anything other than horizontal
  285. // whitespace) is required to start with the string's indent. For error
  286. // recovery, remove all leading whitespace if the indent doesn't match.
  287. if (!contents.consume_front(indent)) {
  288. contents = contents.ltrim(HorizontalWhitespace);
  289. if (!contents.startswith("\n")) {
  290. emitter.EmitError<MismatchedIndentInString>();
  291. has_errors = true;
  292. }
  293. }
  294. // Process the contents of the line.
  295. while (true) {
  296. auto end_of_regular_text = contents.find_first_of("\n\\");
  297. result += contents.substr(0, end_of_regular_text);
  298. contents = contents.substr(end_of_regular_text);
  299. if (contents.empty()) {
  300. return {.result = result, .has_errors = has_errors};
  301. }
  302. if (contents.consume_front("\n")) {
  303. // Trailing whitespace before a newline doesn't contribute to the string
  304. // literal value.
  305. while (!result.empty() && result.back() != '\n' &&
  306. isSpace(result.back())) {
  307. result.pop_back();
  308. }
  309. result += '\n';
  310. // Move onto to the next line.
  311. break;
  312. }
  313. if (!contents.consume_front(escape)) {
  314. // This is not an escape sequence, just a raw `\`.
  315. result += contents.front();
  316. contents = contents.drop_front(1);
  317. continue;
  318. }
  319. if (contents.consume_front("\n")) {
  320. // An escaped ends the line without producing any content and without
  321. // trimming trailing whitespace.
  322. break;
  323. }
  324. // Handle this escape sequence.
  325. if (!ExpandAndConsumeEscapeSequence(emitter, contents, result)) {
  326. has_errors = true;
  327. }
  328. }
  329. }
  330. }
  331. auto StringLiteralToken::ComputeValue(DiagnosticEmitter& emitter) const
  332. -> ExpandedValue {
  333. auto indent = multi_line ? CheckIndent(emitter, text, content) : Indent();
  334. auto result = ExpandEscapeSequencesAndRemoveIndent(emitter, content,
  335. hash_level, indent.indent);
  336. result.has_errors |= indent.has_errors;
  337. return result;
  338. }
  339. } // namespace Carbon