// string_literal.cpp
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/string_literal.h"
  5. #include "common/check.h"
  6. #include "llvm/ADT/SmallString.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "llvm/Support/ConvertUTF.h"
  9. #include "llvm/Support/ErrorHandling.h"
  10. #include "llvm/Support/FormatVariadic.h"
  11. #include "toolchain/lexer/character_set.h"
  12. #include "toolchain/lexer/lex_helpers.h"
  13. namespace Carbon {
  14. using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
  15. static constexpr char MultiLineIndicator[] = R"(''')";
  16. static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
  17. struct LexedStringLiteral::Introducer {
  18. // The kind of string being introduced.
  19. MultiLineKind kind;
  20. // The terminator for the string, without any '#' suffixes.
  21. llvm::StringRef terminator;
  22. // The length of the introducer, including the file type indicator and
  23. // newline for a multi-line string literal.
  24. int prefix_size;
  25. // Lex the introducer for a string literal, after any '#'s.
  26. static auto Lex(llvm::StringRef source_text) -> std::optional<Introducer>;
  27. };
  28. // Lex the introducer for a string literal, after any '#'s.
  29. //
  30. // We lex multi-line literals when spelled with either ''' or """ for error
  31. // recovery purposes, and reject """ literals after lexing.
  32. auto LexedStringLiteral::Introducer::Lex(llvm::StringRef source_text)
  33. -> std::optional<Introducer> {
  34. MultiLineKind kind = NotMultiLine;
  35. llvm::StringRef indicator;
  36. if (source_text.startswith(MultiLineIndicator)) {
  37. kind = MultiLine;
  38. indicator = llvm::StringRef(MultiLineIndicator);
  39. } else if (source_text.startswith(DoubleQuotedMultiLineIndicator)) {
  40. kind = MultiLineWithDoubleQuotes;
  41. indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
  42. }
  43. if (kind != NotMultiLine) {
  44. // The rest of the line must be a valid file type indicator: a sequence of
  45. // characters containing neither '#' nor '"' followed by a newline.
  46. auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
  47. if (prefix_end != llvm::StringRef::npos &&
  48. source_text[prefix_end] == '\n') {
  49. // Include the newline in the prefix size.
  50. return Introducer{.kind = kind,
  51. .terminator = indicator,
  52. .prefix_size = static_cast<int>(prefix_end + 1)};
  53. }
  54. }
  55. if (!source_text.empty() && source_text[0] == '"') {
  56. return Introducer{
  57. .kind = NotMultiLine, .terminator = "\"", .prefix_size = 1};
  58. }
  59. return std::nullopt;
  60. }
  61. namespace {
  62. // A set of 'char' values.
  63. struct alignas(8) CharSet {
  64. bool Elements[UCHAR_MAX + 1];
  65. constexpr CharSet(std::initializer_list<char> chars) : Elements() {
  66. for (char c : chars) {
  67. Elements[static_cast<unsigned char>(c)] = true;
  68. }
  69. }
  70. constexpr auto operator[](char c) const -> bool {
  71. return Elements[static_cast<unsigned char>(c)];
  72. }
  73. };
  74. } // namespace
  75. auto LexedStringLiteral::Lex(llvm::StringRef source_text)
  76. -> std::optional<LexedStringLiteral> {
  77. int64_t cursor = 0;
  78. const int64_t source_text_size = source_text.size();
  79. // Determine the number of hashes prefixing.
  80. while (cursor < source_text_size && source_text[cursor] == '#') {
  81. ++cursor;
  82. }
  83. const int hash_level = cursor;
  84. const std::optional<Introducer> introducer =
  85. Introducer::Lex(source_text.substr(hash_level));
  86. if (!introducer) {
  87. return std::nullopt;
  88. }
  89. cursor += introducer->prefix_size;
  90. const int prefix_len = cursor;
  91. llvm::SmallString<16> terminator(introducer->terminator);
  92. llvm::SmallString<16> escape("\\");
  93. // The terminator and escape sequence marker require a number of '#'s
  94. // matching the leading sequence of '#'s.
  95. terminator.resize(terminator.size() + hash_level, '#');
  96. escape.resize(escape.size() + hash_level, '#');
  97. // TODO: Detect indent / dedent for multi-line string literals in order to
  98. // stop parsing on dedent before a terminator is found.
  99. for (; cursor < source_text_size; ++cursor) {
  100. // Use a lookup table to allow us to quickly skip uninteresting characters.
  101. static constexpr CharSet InterestingChars = {'\\', '\n', '"', '\''};
  102. if (!InterestingChars[source_text[cursor]]) {
  103. continue;
  104. }
  105. // This switch and loop structure relies on multi-character terminators and
  106. // escape sequences starting with a predictable character and not containing
  107. // embedded and unescaped terminators or newlines.
  108. switch (source_text[cursor]) {
  109. case '\\':
  110. if (escape.size() == 1 ||
  111. source_text.substr(cursor + 1).startswith(escape.substr(1))) {
  112. cursor += escape.size();
  113. // If there's either not a character following the escape, or it's a
  114. // single-line string and the escaped character is a newline, we
  115. // should stop here.
  116. if (cursor >= source_text_size || (introducer->kind == NotMultiLine &&
  117. source_text[cursor] == '\n')) {
  118. llvm::StringRef text = source_text.take_front(cursor);
  119. return LexedStringLiteral(text, text.drop_front(prefix_len),
  120. hash_level, introducer->kind,
  121. /*is_terminated=*/false);
  122. }
  123. }
  124. break;
  125. case '\n':
  126. if (introducer->kind == NotMultiLine) {
  127. llvm::StringRef text = source_text.take_front(cursor);
  128. return LexedStringLiteral(text, text.drop_front(prefix_len),
  129. hash_level, introducer->kind,
  130. /*is_terminated=*/false);
  131. }
  132. break;
  133. case '"':
  134. case '\'':
  135. if (source_text.substr(cursor).startswith(terminator)) {
  136. llvm::StringRef text =
  137. source_text.substr(0, cursor + terminator.size());
  138. llvm::StringRef content =
  139. source_text.substr(prefix_len, cursor - prefix_len);
  140. return LexedStringLiteral(text, content, hash_level, introducer->kind,
  141. /*is_terminated=*/true);
  142. }
  143. break;
  144. }
  145. }
  146. // No terminator was found.
  147. return LexedStringLiteral(source_text, source_text.drop_front(prefix_len),
  148. hash_level, introducer->kind,
  149. /*is_terminated=*/false);
  150. }
  151. // Given a string that contains at least one newline, find the indent (the
  152. // leading sequence of horizontal whitespace) of its final line.
  153. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  154. int indent_end = text.size();
  155. for (int i = indent_end - 1; i >= 0; --i) {
  156. if (text[i] == '\n') {
  157. int indent_start = i + 1;
  158. return text.substr(indent_start, indent_end - indent_start);
  159. }
  160. if (!IsSpace(text[i])) {
  161. indent_end = i;
  162. }
  163. }
  164. llvm_unreachable("Given text is required to contain a newline.");
  165. }
  166. // Check the literal is indented properly, if it's a multi-line litera.
  167. // Find the leading whitespace that should be removed from each line of a
  168. // multi-line string literal.
  169. static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
  170. llvm::StringRef content) -> llvm::StringRef {
  171. // Find the leading horizontal whitespace on the final line of this literal.
  172. // Note that for an empty literal, this might not be inside the content.
  173. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  174. // The last line is not permitted to contain any content after its
  175. // indentation.
  176. if (indent.end() != content.end()) {
  177. CARBON_DIAGNOSTIC(
  178. ContentBeforeStringTerminator, Error,
  179. "Only whitespace is permitted before the closing `'''` of a "
  180. "multi-line string.");
  181. emitter.Emit(indent.end(), ContentBeforeStringTerminator);
  182. }
  183. return indent;
  184. }
  185. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  186. static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
  187. llvm::StringRef digits,
  188. std::string& result) -> bool {
  189. unsigned code_point;
  190. if (!CanLexInteger(emitter, digits)) {
  191. return false;
  192. }
  193. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  194. CARBON_DIAGNOSTIC(UnicodeEscapeTooLarge, Error,
  195. "Code point specified by `\\u{{...}}` escape is greater "
  196. "than 0x10FFFF.");
  197. emitter.Emit(digits.begin(), UnicodeEscapeTooLarge);
  198. return false;
  199. }
  200. if (code_point >= 0xD800 && code_point < 0xE000) {
  201. CARBON_DIAGNOSTIC(UnicodeEscapeSurrogate, Error,
  202. "Code point specified by `\\u{{...}}` escape is a "
  203. "surrogate character.");
  204. emitter.Emit(digits.begin(), UnicodeEscapeSurrogate);
  205. return false;
  206. }
  207. // Convert the code point to a sequence of UTF-8 code units.
  208. // Every code point fits in 6 UTF-8 code units.
  209. const llvm::UTF32 utf32_code_units[1] = {code_point};
  210. llvm::UTF8 utf8_code_units[6];
  211. const llvm::UTF32* src_pos = utf32_code_units;
  212. llvm::UTF8* dest_pos = utf8_code_units;
  213. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  214. &src_pos, src_pos + 1, &dest_pos, dest_pos + 6, llvm::strictConversion);
  215. if (conv_result != llvm::conversionOK) {
  216. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  217. }
  218. result.insert(result.end(), reinterpret_cast<char*>(utf8_code_units),
  219. reinterpret_cast<char*>(dest_pos));
  220. return true;
  221. }
  222. // Expand an escape sequence, appending the expanded value to the given
  223. // `result` string. `content` is the string content, starting from the first
  224. // character after the escape sequence introducer (for example, the `n` in
  225. // `\n`), and will be updated to remove the leading escape sequence.
  226. static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
  227. llvm::StringRef& content,
  228. std::string& result) -> void {
  229. CARBON_CHECK(!content.empty()) << "should have escaped closing delimiter";
  230. char first = content.front();
  231. content = content.drop_front(1);
  232. switch (first) {
  233. case 't':
  234. result += '\t';
  235. return;
  236. case 'n':
  237. result += '\n';
  238. return;
  239. case 'r':
  240. result += '\r';
  241. return;
  242. case '"':
  243. result += '"';
  244. return;
  245. case '\'':
  246. result += '\'';
  247. return;
  248. case '\\':
  249. result += '\\';
  250. return;
  251. case '0':
  252. result += '\0';
  253. if (!content.empty() && IsDecimalDigit(content.front())) {
  254. CARBON_DIAGNOSTIC(
  255. DecimalEscapeSequence, Error,
  256. "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead "
  257. "of `\\0` if the next character is a digit.");
  258. emitter.Emit(content.begin(), DecimalEscapeSequence);
  259. return;
  260. }
  261. return;
  262. case 'x':
  263. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  264. IsUpperHexDigit(content[1])) {
  265. result +=
  266. static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
  267. content = content.drop_front(2);
  268. return;
  269. }
  270. CARBON_DIAGNOSTIC(HexadecimalEscapeMissingDigits, Error,
  271. "Escape sequence `\\x` must be followed by two "
  272. "uppercase hexadecimal digits, for example `\\x0F`.");
  273. emitter.Emit(content.begin(), HexadecimalEscapeMissingDigits);
  274. break;
  275. case 'u': {
  276. llvm::StringRef remaining = content;
  277. if (remaining.consume_front("{")) {
  278. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  279. remaining = remaining.drop_front(digits.size());
  280. if (!digits.empty() && remaining.consume_front("}")) {
  281. if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
  282. break;
  283. }
  284. content = remaining;
  285. return;
  286. }
  287. }
  288. CARBON_DIAGNOSTIC(
  289. UnicodeEscapeMissingBracedDigits, Error,
  290. "Escape sequence `\\u` must be followed by a braced sequence of "
  291. "uppercase hexadecimal digits, for example `\\u{{70AD}}`.");
  292. emitter.Emit(content.begin(), UnicodeEscapeMissingBracedDigits);
  293. break;
  294. }
  295. default:
  296. CARBON_DIAGNOSTIC(UnknownEscapeSequence, Error,
  297. "Unrecognized escape sequence `{0}`.", char);
  298. emitter.Emit(content.begin() - 1, UnknownEscapeSequence, first);
  299. break;
  300. }
  301. // If we get here, we didn't recognize this escape sequence and have already
  302. // issued a diagnostic. For error recovery purposes, expand this escape
  303. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  304. result += first;
  305. }
  306. // Expand any escape sequences in the given string literal.
  307. static auto ExpandEscapeSequencesAndRemoveIndent(
  308. LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
  309. llvm::StringRef indent) -> std::string {
  310. std::string result;
  311. result.reserve(contents.size());
  312. llvm::SmallString<16> escape("\\");
  313. escape.resize(1 + hash_level, '#');
  314. // Process each line of the string literal.
  315. while (true) {
  316. // Every non-empty line (that contains anything other than horizontal
  317. // whitespace) is required to start with the string's indent. For error
  318. // recovery, remove all leading whitespace if the indent doesn't match.
  319. if (!contents.consume_front(indent)) {
  320. const char* line_start = contents.begin();
  321. contents = contents.drop_while(IsHorizontalWhitespace);
  322. if (!contents.startswith("\n")) {
  323. CARBON_DIAGNOSTIC(
  324. MismatchedIndentInString, Error,
  325. "Indentation does not match that of the closing `'''` in "
  326. "multi-line string literal.");
  327. emitter.Emit(line_start, MismatchedIndentInString);
  328. }
  329. }
  330. // Tracks the length of the result at the last time we expanded an escape
  331. // to ensure we don't misinterpret it as unescaped when backtracking.
  332. size_t last_escape_length = 0;
  333. // Process the contents of the line.
  334. while (true) {
  335. // Append the next segment of plain text.
  336. auto end_of_regular_text = contents.find_if([](char c) {
  337. return c == '\n' || c == '\\' ||
  338. (IsHorizontalWhitespace(c) && c != ' ');
  339. });
  340. result += contents.substr(0, end_of_regular_text);
  341. contents = contents.substr(end_of_regular_text);
  342. if (contents.empty()) {
  343. return result;
  344. }
  345. if (contents.consume_front("\n")) {
  346. // Trailing whitespace in the source before a newline doesn't contribute
  347. // to the string literal value. However, escaped whitespace (like `\t`)
  348. // and any whitespace just before that does contribute.
  349. while (!result.empty() && result.back() != '\n' &&
  350. IsSpace(result.back()) && result.length() > last_escape_length) {
  351. result.pop_back();
  352. }
  353. result += '\n';
  354. // Move onto to the next line.
  355. break;
  356. }
  357. if (IsHorizontalWhitespace(contents.front())) {
  358. // Horizontal whitespace other than ` ` is valid only at the end of a
  359. // line.
  360. CARBON_CHECK(contents.front() != ' ')
  361. << "should not have stopped at a plain space";
  362. auto after_space = contents.find_if_not(IsHorizontalWhitespace);
  363. if (after_space == llvm::StringRef::npos ||
  364. contents[after_space] != '\n') {
  365. // TODO: Include the source range of the whitespace up to
  366. // `contents.begin() + after_space` in the diagnostic.
  367. CARBON_DIAGNOSTIC(
  368. InvalidHorizontalWhitespaceInString, Error,
  369. "Whitespace other than plain space must be expressed with an "
  370. "escape sequence in a string literal.");
  371. emitter.Emit(contents.begin(), InvalidHorizontalWhitespaceInString);
  372. // Include the whitespace in the string contents for error recovery.
  373. result += contents.substr(0, after_space);
  374. }
  375. contents = contents.substr(after_space);
  376. continue;
  377. }
  378. if (!contents.consume_front(escape)) {
  379. // This is not an escape sequence, just a raw `\`.
  380. result += contents.front();
  381. contents = contents.drop_front(1);
  382. continue;
  383. }
  384. if (contents.consume_front("\n")) {
  385. // An escaped newline ends the line without producing any content and
  386. // without trimming trailing whitespace.
  387. break;
  388. }
  389. // Handle this escape sequence.
  390. ExpandAndConsumeEscapeSequence(emitter, contents, result);
  391. last_escape_length = result.length();
  392. }
  393. }
  394. }
  395. auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
  396. -> std::string {
  397. if (!is_terminated_) {
  398. return "";
  399. }
  400. if (multi_line_ == MultiLineWithDoubleQuotes) {
  401. CARBON_DIAGNOSTIC(
  402. MultiLineStringWithDoubleQuotes, Error,
  403. "Use `'''` delimiters for a multi-line string literal, not `\"\"\"`.");
  404. emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
  405. }
  406. llvm::StringRef indent =
  407. multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
  408. return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,
  409. indent);
  410. }
  411. } // namespace Carbon