string_literal.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/string_literal.h"
  5. #include "common/check.h"
  6. #include "llvm/ADT/SmallString.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "llvm/Support/ConvertUTF.h"
  9. #include "llvm/Support/ErrorHandling.h"
  10. #include "toolchain/lexer/character_set.h"
  11. #include "toolchain/lexer/lex_helpers.h"
  12. namespace Carbon {
  13. using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
  14. static constexpr char MultiLineIndicator[] = R"(''')";
  15. static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
  16. struct LexedStringLiteral::Introducer {
  17. // The kind of string being introduced.
  18. MultiLineKind kind;
  19. // The terminator for the string, without any '#' suffixes.
  20. llvm::StringRef terminator;
  21. // The length of the introducer, including the file type indicator and
  22. // newline for a multi-line string literal.
  23. int prefix_size;
  24. // Lex the introducer for a string literal, after any '#'s.
  25. static auto Lex(llvm::StringRef source_text) -> std::optional<Introducer>;
  26. };
  27. // Lex the introducer for a string literal, after any '#'s.
  28. //
  29. // We lex multi-line literals when spelled with either ''' or """ for error
  30. // recovery purposes, and reject """ literals after lexing.
  31. auto LexedStringLiteral::Introducer::Lex(llvm::StringRef source_text)
  32. -> std::optional<Introducer> {
  33. MultiLineKind kind = NotMultiLine;
  34. llvm::StringRef indicator;
  35. if (source_text.startswith(MultiLineIndicator)) {
  36. kind = MultiLine;
  37. indicator = llvm::StringRef(MultiLineIndicator);
  38. } else if (source_text.startswith(DoubleQuotedMultiLineIndicator)) {
  39. kind = MultiLineWithDoubleQuotes;
  40. indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
  41. }
  42. if (kind != NotMultiLine) {
  43. // The rest of the line must be a valid file type indicator: a sequence of
  44. // characters containing neither '#' nor '"' followed by a newline.
  45. auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
  46. if (prefix_end != llvm::StringRef::npos &&
  47. source_text[prefix_end] == '\n') {
  48. // Include the newline in the prefix size.
  49. return Introducer{.kind = kind,
  50. .terminator = indicator,
  51. .prefix_size = static_cast<int>(prefix_end + 1)};
  52. }
  53. }
  54. if (!source_text.empty() && source_text[0] == '"') {
  55. return Introducer{
  56. .kind = NotMultiLine, .terminator = "\"", .prefix_size = 1};
  57. }
  58. return std::nullopt;
  59. }
  60. namespace {
  61. // A set of 'char' values.
  62. struct alignas(8) CharSet {
  63. bool Elements[UCHAR_MAX + 1];
  64. constexpr CharSet(std::initializer_list<char> chars) : Elements() {
  65. for (char c : chars) {
  66. Elements[static_cast<unsigned char>(c)] = true;
  67. }
  68. }
  69. constexpr auto operator[](char c) const -> bool {
  70. return Elements[static_cast<unsigned char>(c)];
  71. }
  72. };
  73. } // namespace
  74. auto LexedStringLiteral::Lex(llvm::StringRef source_text)
  75. -> std::optional<LexedStringLiteral> {
  76. int64_t cursor = 0;
  77. const int64_t source_text_size = source_text.size();
  78. // Determine the number of hashes prefixing.
  79. while (cursor < source_text_size && source_text[cursor] == '#') {
  80. ++cursor;
  81. }
  82. const int hash_level = cursor;
  83. const std::optional<Introducer> introducer =
  84. Introducer::Lex(source_text.substr(hash_level));
  85. if (!introducer) {
  86. return std::nullopt;
  87. }
  88. cursor += introducer->prefix_size;
  89. const int prefix_len = cursor;
  90. llvm::SmallString<16> terminator(introducer->terminator);
  91. llvm::SmallString<16> escape("\\");
  92. // The terminator and escape sequence marker require a number of '#'s
  93. // matching the leading sequence of '#'s.
  94. terminator.resize(terminator.size() + hash_level, '#');
  95. escape.resize(escape.size() + hash_level, '#');
  96. // TODO: Detect indent / dedent for multi-line string literals in order to
  97. // stop parsing on dedent before a terminator is found.
  98. for (; cursor < source_text_size; ++cursor) {
  99. // Use a lookup table to allow us to quickly skip uninteresting characters.
  100. static constexpr CharSet InterestingChars = {'\\', '\n', '"', '\''};
  101. if (!InterestingChars[source_text[cursor]]) {
  102. continue;
  103. }
  104. // This switch and loop structure relies on multi-character terminators and
  105. // escape sequences starting with a predictable character and not containing
  106. // embedded and unescaped terminators or newlines.
  107. switch (source_text[cursor]) {
  108. case '\\':
  109. if (escape.size() == 1 ||
  110. source_text.substr(cursor + 1).startswith(escape.substr(1))) {
  111. cursor += escape.size();
  112. // If there's either not a character following the escape, or it's a
  113. // single-line string and the escaped character is a newline, we
  114. // should stop here.
  115. if (cursor >= source_text_size || (introducer->kind == NotMultiLine &&
  116. source_text[cursor] == '\n')) {
  117. llvm::StringRef text = source_text.take_front(cursor);
  118. return LexedStringLiteral(text, text.drop_front(prefix_len),
  119. hash_level, introducer->kind,
  120. /*is_terminated=*/false);
  121. }
  122. }
  123. break;
  124. case '\n':
  125. if (introducer->kind == NotMultiLine) {
  126. llvm::StringRef text = source_text.take_front(cursor);
  127. return LexedStringLiteral(text, text.drop_front(prefix_len),
  128. hash_level, introducer->kind,
  129. /*is_terminated=*/false);
  130. }
  131. break;
  132. case '"':
  133. case '\'':
  134. if (source_text.substr(cursor).startswith(terminator)) {
  135. llvm::StringRef text =
  136. source_text.substr(0, cursor + terminator.size());
  137. llvm::StringRef content =
  138. source_text.substr(prefix_len, cursor - prefix_len);
  139. return LexedStringLiteral(text, content, hash_level, introducer->kind,
  140. /*is_terminated=*/true);
  141. }
  142. break;
  143. default:
  144. // No action for non-terminators.
  145. break;
  146. }
  147. }
  148. // No terminator was found.
  149. return LexedStringLiteral(source_text, source_text.drop_front(prefix_len),
  150. hash_level, introducer->kind,
  151. /*is_terminated=*/false);
  152. }
  153. // Given a string that contains at least one newline, find the indent (the
  154. // leading sequence of horizontal whitespace) of its final line.
  155. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  156. int indent_end = text.size();
  157. for (int i = indent_end - 1; i >= 0; --i) {
  158. if (text[i] == '\n') {
  159. int indent_start = i + 1;
  160. return text.substr(indent_start, indent_end - indent_start);
  161. }
  162. if (!IsSpace(text[i])) {
  163. indent_end = i;
  164. }
  165. }
  166. llvm_unreachable("Given text is required to contain a newline.");
  167. }
  168. // Check the literal is indented properly, if it's a multi-line litera.
  169. // Find the leading whitespace that should be removed from each line of a
  170. // multi-line string literal.
  171. static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
  172. llvm::StringRef content) -> llvm::StringRef {
  173. // Find the leading horizontal whitespace on the final line of this literal.
  174. // Note that for an empty literal, this might not be inside the content.
  175. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  176. // The last line is not permitted to contain any content after its
  177. // indentation.
  178. if (indent.end() != content.end()) {
  179. CARBON_DIAGNOSTIC(
  180. ContentBeforeStringTerminator, Error,
  181. "Only whitespace is permitted before the closing `'''` of a "
  182. "multi-line string.");
  183. emitter.Emit(indent.end(), ContentBeforeStringTerminator);
  184. }
  185. return indent;
  186. }
  187. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  188. static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
  189. llvm::StringRef digits,
  190. std::string& result) -> bool {
  191. unsigned code_point;
  192. if (!CanLexInteger(emitter, digits)) {
  193. return false;
  194. }
  195. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  196. CARBON_DIAGNOSTIC(UnicodeEscapeTooLarge, Error,
  197. "Code point specified by `\\u{{...}}` escape is greater "
  198. "than 0x10FFFF.");
  199. emitter.Emit(digits.begin(), UnicodeEscapeTooLarge);
  200. return false;
  201. }
  202. if (code_point >= 0xD800 && code_point < 0xE000) {
  203. CARBON_DIAGNOSTIC(UnicodeEscapeSurrogate, Error,
  204. "Code point specified by `\\u{{...}}` escape is a "
  205. "surrogate character.");
  206. emitter.Emit(digits.begin(), UnicodeEscapeSurrogate);
  207. return false;
  208. }
  209. // Convert the code point to a sequence of UTF-8 code units.
  210. // Every code point fits in 6 UTF-8 code units.
  211. const llvm::UTF32 utf32_code_units[1] = {code_point};
  212. llvm::UTF8 utf8_code_units[6];
  213. const llvm::UTF32* src_pos = utf32_code_units;
  214. llvm::UTF8* dest_pos = utf8_code_units;
  215. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  216. &src_pos, src_pos + 1, &dest_pos, dest_pos + 6, llvm::strictConversion);
  217. if (conv_result != llvm::conversionOK) {
  218. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  219. }
  220. result.insert(result.end(), reinterpret_cast<char*>(utf8_code_units),
  221. reinterpret_cast<char*>(dest_pos));
  222. return true;
  223. }
  224. // Expand an escape sequence, appending the expanded value to the given
  225. // `result` string. `content` is the string content, starting from the first
  226. // character after the escape sequence introducer (for example, the `n` in
  227. // `\n`), and will be updated to remove the leading escape sequence.
  228. static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
  229. llvm::StringRef& content,
  230. std::string& result) -> void {
  231. CARBON_CHECK(!content.empty()) << "should have escaped closing delimiter";
  232. char first = content.front();
  233. content = content.drop_front(1);
  234. switch (first) {
  235. case 't':
  236. result += '\t';
  237. return;
  238. case 'n':
  239. result += '\n';
  240. return;
  241. case 'r':
  242. result += '\r';
  243. return;
  244. case '"':
  245. result += '"';
  246. return;
  247. case '\'':
  248. result += '\'';
  249. return;
  250. case '\\':
  251. result += '\\';
  252. return;
  253. case '0':
  254. result += '\0';
  255. if (!content.empty() && IsDecimalDigit(content.front())) {
  256. CARBON_DIAGNOSTIC(
  257. DecimalEscapeSequence, Error,
  258. "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead "
  259. "of `\\0` if the next character is a digit.");
  260. emitter.Emit(content.begin(), DecimalEscapeSequence);
  261. return;
  262. }
  263. return;
  264. case 'x':
  265. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  266. IsUpperHexDigit(content[1])) {
  267. result +=
  268. static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
  269. content = content.drop_front(2);
  270. return;
  271. }
  272. CARBON_DIAGNOSTIC(HexadecimalEscapeMissingDigits, Error,
  273. "Escape sequence `\\x` must be followed by two "
  274. "uppercase hexadecimal digits, for example `\\x0F`.");
  275. emitter.Emit(content.begin(), HexadecimalEscapeMissingDigits);
  276. break;
  277. case 'u': {
  278. llvm::StringRef remaining = content;
  279. if (remaining.consume_front("{")) {
  280. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  281. remaining = remaining.drop_front(digits.size());
  282. if (!digits.empty() && remaining.consume_front("}")) {
  283. if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
  284. break;
  285. }
  286. content = remaining;
  287. return;
  288. }
  289. }
  290. CARBON_DIAGNOSTIC(
  291. UnicodeEscapeMissingBracedDigits, Error,
  292. "Escape sequence `\\u` must be followed by a braced sequence of "
  293. "uppercase hexadecimal digits, for example `\\u{{70AD}}`.");
  294. emitter.Emit(content.begin(), UnicodeEscapeMissingBracedDigits);
  295. break;
  296. }
  297. default:
  298. CARBON_DIAGNOSTIC(UnknownEscapeSequence, Error,
  299. "Unrecognized escape sequence `{0}`.", char);
  300. emitter.Emit(content.begin() - 1, UnknownEscapeSequence, first);
  301. break;
  302. }
  303. // If we get here, we didn't recognize this escape sequence and have already
  304. // issued a diagnostic. For error recovery purposes, expand this escape
  305. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  306. result += first;
  307. }
  308. // Expand any escape sequences in the given string literal.
  309. static auto ExpandEscapeSequencesAndRemoveIndent(
  310. LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
  311. llvm::StringRef indent) -> std::string {
  312. std::string result;
  313. result.reserve(contents.size());
  314. llvm::SmallString<16> escape("\\");
  315. escape.resize(1 + hash_level, '#');
  316. // Process each line of the string literal.
  317. while (true) {
  318. // Every non-empty line (that contains anything other than horizontal
  319. // whitespace) is required to start with the string's indent. For error
  320. // recovery, remove all leading whitespace if the indent doesn't match.
  321. if (!contents.consume_front(indent)) {
  322. const char* line_start = contents.begin();
  323. contents = contents.drop_while(IsHorizontalWhitespace);
  324. if (!contents.startswith("\n")) {
  325. CARBON_DIAGNOSTIC(
  326. MismatchedIndentInString, Error,
  327. "Indentation does not match that of the closing `'''` in "
  328. "multi-line string literal.");
  329. emitter.Emit(line_start, MismatchedIndentInString);
  330. }
  331. }
  332. // Tracks the length of the result at the last time we expanded an escape
  333. // to ensure we don't misinterpret it as unescaped when backtracking.
  334. size_t last_escape_length = 0;
  335. // Process the contents of the line.
  336. while (true) {
  337. // Append the next segment of plain text.
  338. auto end_of_regular_text = contents.find_if([](char c) {
  339. return c == '\n' || c == '\\' ||
  340. (IsHorizontalWhitespace(c) && c != ' ');
  341. });
  342. result += contents.substr(0, end_of_regular_text);
  343. contents = contents.substr(end_of_regular_text);
  344. if (contents.empty()) {
  345. return result;
  346. }
  347. if (contents.consume_front("\n")) {
  348. // Trailing whitespace in the source before a newline doesn't contribute
  349. // to the string literal value. However, escaped whitespace (like `\t`)
  350. // and any whitespace just before that does contribute.
  351. while (!result.empty() && result.back() != '\n' &&
  352. IsSpace(result.back()) && result.length() > last_escape_length) {
  353. result.pop_back();
  354. }
  355. result += '\n';
  356. // Move onto to the next line.
  357. break;
  358. }
  359. if (IsHorizontalWhitespace(contents.front())) {
  360. // Horizontal whitespace other than ` ` is valid only at the end of a
  361. // line.
  362. CARBON_CHECK(contents.front() != ' ')
  363. << "should not have stopped at a plain space";
  364. auto after_space = contents.find_if_not(IsHorizontalWhitespace);
  365. if (after_space == llvm::StringRef::npos ||
  366. contents[after_space] != '\n') {
  367. // TODO: Include the source range of the whitespace up to
  368. // `contents.begin() + after_space` in the diagnostic.
  369. CARBON_DIAGNOSTIC(
  370. InvalidHorizontalWhitespaceInString, Error,
  371. "Whitespace other than plain space must be expressed with an "
  372. "escape sequence in a string literal.");
  373. emitter.Emit(contents.begin(), InvalidHorizontalWhitespaceInString);
  374. // Include the whitespace in the string contents for error recovery.
  375. result += contents.substr(0, after_space);
  376. }
  377. contents = contents.substr(after_space);
  378. continue;
  379. }
  380. if (!contents.consume_front(escape)) {
  381. // This is not an escape sequence, just a raw `\`.
  382. result += contents.front();
  383. contents = contents.drop_front(1);
  384. continue;
  385. }
  386. if (contents.consume_front("\n")) {
  387. // An escaped newline ends the line without producing any content and
  388. // without trimming trailing whitespace.
  389. break;
  390. }
  391. // Handle this escape sequence.
  392. ExpandAndConsumeEscapeSequence(emitter, contents, result);
  393. last_escape_length = result.length();
  394. }
  395. }
  396. }
  397. auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
  398. -> std::string {
  399. if (!is_terminated_) {
  400. return "";
  401. }
  402. if (multi_line_ == MultiLineWithDoubleQuotes) {
  403. CARBON_DIAGNOSTIC(
  404. MultiLineStringWithDoubleQuotes, Error,
  405. "Use `'''` delimiters for a multi-line string literal, not `\"\"\"`.");
  406. emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
  407. }
  408. llvm::StringRef indent =
  409. multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
  410. return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,
  411. indent);
  412. }
  413. } // namespace Carbon