string_literal.cpp 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/string_literal.h"
  5. #include <initializer_list>
  6. #include <optional>
  7. #include "common/check.h"
  8. #include "llvm/ADT/SmallString.h"
  9. #include "llvm/ADT/StringExtras.h"
  10. #include "llvm/Support/ConvertUTF.h"
  11. #include "llvm/Support/ErrorHandling.h"
  12. #include "toolchain/lex/character_set.h"
  13. #include "toolchain/lex/helpers.h"
  14. namespace Carbon::Lex {
  15. using DiagnosticEmitter = Diagnostics::Emitter<const char*>;
  16. static constexpr char MultiLineIndicator[] = R"(''')";
  17. static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
  18. struct StringLiteral::Introducer {
  19. // The kind of string being introduced.
  20. Kind kind;
  21. // The terminator for the string, without any '#' suffixes.
  22. llvm::StringRef terminator;
  23. // The length of the introducer, including the file type indicator and
  24. // newline for a multi-line string literal.
  25. int prefix_size;
  26. // Lex the introducer for a string literal, after any '#'s.
  27. static auto Lex(llvm::StringRef source_text) -> std::optional<Introducer>;
  28. };
  29. // Lex the introducer for a string literal, after any '#'s.
  30. //
  31. // We lex multi-line literals when spelled with either ''' or """ for error
  32. // recovery purposes, and reject """ literals after lexing.
  33. auto StringLiteral::Introducer::Lex(llvm::StringRef source_text)
  34. -> std::optional<Introducer> {
  35. Kind kind = Kind::SingleLine;
  36. llvm::StringRef indicator;
  37. if (source_text.starts_with(MultiLineIndicator)) {
  38. kind = Kind::MultiLine;
  39. indicator = llvm::StringRef(MultiLineIndicator);
  40. } else if (source_text.starts_with(DoubleQuotedMultiLineIndicator)) {
  41. kind = Kind::MultiLineWithDoubleQuotes;
  42. indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
  43. }
  44. if (kind != Kind::SingleLine) {
  45. // The rest of the line must be a valid file type indicator: a sequence of
  46. // characters containing neither '#' nor '"' followed by a newline.
  47. auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
  48. if (prefix_end != llvm::StringRef::npos &&
  49. source_text[prefix_end] == '\n') {
  50. // Include the newline in the prefix size.
  51. return Introducer{.kind = kind,
  52. .terminator = indicator,
  53. .prefix_size = static_cast<int>(prefix_end + 1)};
  54. }
  55. }
  56. if (source_text.starts_with('"')) {
  57. return Introducer{
  58. .kind = Kind::SingleLine, .terminator = "\"", .prefix_size = 1};
  59. }
  60. if (source_text.starts_with('\'')) {
  61. return Introducer{.kind = Kind::Char, .terminator = "'", .prefix_size = 1};
  62. }
  63. return std::nullopt;
  64. }
  namespace {
  // A compile-time set of `char` values, stored as one flag per possible
  // `unsigned char` value so that membership is a single array lookup.
  struct alignas(8) CharSet {
    // `Elements[i]` is true when the character whose `unsigned char` value is
    // `i` belongs to the set.
    bool Elements[UCHAR_MAX + 1] = {};

    // Builds the set containing exactly the listed characters.
    constexpr CharSet(std::initializer_list<char> chars) {
      for (const char* it = chars.begin(); it != chars.end(); ++it) {
        Elements[static_cast<unsigned char>(*it)] = true;
      }
    }

    // Returns whether `c` is a member of the set.
    constexpr auto operator[](char c) const -> bool {
      const auto index = static_cast<unsigned char>(c);
      return Elements[index];
    }
  };
  }  // namespace
  79. // Determine whether this is a multi-line string literal.
  80. static auto IsMultiLine(StringLiteral::Kind kind) -> bool {
  81. return kind == StringLiteral::Kind::MultiLine ||
  82. kind == StringLiteral::Kind::MultiLineWithDoubleQuotes;
  83. }
  84. auto StringLiteral::Lex(llvm::StringRef source_text)
  85. -> std::optional<StringLiteral> {
  86. int64_t cursor = 0;
  87. const int64_t source_text_size = source_text.size();
  88. // Determine the number of hashes prefixing.
  89. while (cursor < source_text_size && source_text[cursor] == '#') {
  90. ++cursor;
  91. }
  92. const int hash_level = cursor;
  93. const std::optional<Introducer> introducer =
  94. Introducer::Lex(source_text.substr(hash_level));
  95. if (!introducer) {
  96. return std::nullopt;
  97. }
  98. cursor += introducer->prefix_size;
  99. const int prefix_len = cursor;
  100. llvm::SmallString<16> terminator(introducer->terminator);
  101. llvm::SmallString<16> escape("\\");
  102. // The terminator and escape sequence marker require a number of '#'s
  103. // matching the leading sequence of '#'s.
  104. terminator.resize(terminator.size() + hash_level, '#');
  105. escape.resize(escape.size() + hash_level, '#');
  106. bool content_needs_validation = false;
  107. // TODO: Detect indent / dedent for multi-line string literals in order to
  108. // stop parsing on dedent before a terminator is found.
  109. for (; cursor < source_text_size; ++cursor) {
  110. // Use a lookup table to allow us to quickly skip uninteresting characters.
  111. static constexpr CharSet InterestingChars = {'\\', '\n', '"', '\'', '\t'};
  112. if (!InterestingChars[source_text[cursor]]) {
  113. continue;
  114. }
  115. // This switch and loop structure relies on multi-character terminators and
  116. // escape sequences starting with a predictable character and not containing
  117. // embedded and unescaped terminators or newlines.
  118. switch (source_text[cursor]) {
  119. case '\t':
  120. // Tabs have extra validation.
  121. content_needs_validation = true;
  122. break;
  123. case '\\':
  124. if (escape.size() == 1 ||
  125. source_text.substr(cursor + 1).starts_with(escape.substr(1))) {
  126. content_needs_validation = true;
  127. cursor += escape.size();
  128. // If there's either not a character following the escape, or it's a
  129. // single-line string and the escaped character is a newline, we
  130. // should stop here.
  131. if (cursor >= source_text_size ||
  132. (!IsMultiLine(introducer->kind) && source_text[cursor] == '\n')) {
  133. llvm::StringRef text = source_text.take_front(cursor);
  134. return StringLiteral(text, text.drop_front(prefix_len),
  135. content_needs_validation, hash_level,
  136. introducer->kind,
  137. /*is_terminated=*/false);
  138. }
  139. }
  140. break;
  141. case '\n':
  142. if (!IsMultiLine(introducer->kind)) {
  143. llvm::StringRef text = source_text.take_front(cursor);
  144. return StringLiteral(text, text.drop_front(prefix_len),
  145. content_needs_validation, hash_level,
  146. introducer->kind,
  147. /*is_terminated=*/false);
  148. }
  149. break;
  150. case '"':
  151. case '\'':
  152. if (source_text.substr(cursor).starts_with(terminator)) {
  153. llvm::StringRef text =
  154. source_text.substr(0, cursor + terminator.size());
  155. llvm::StringRef content =
  156. source_text.substr(prefix_len, cursor - prefix_len);
  157. return StringLiteral(text, content, content_needs_validation,
  158. hash_level, introducer->kind,
  159. /*is_terminated=*/true);
  160. }
  161. break;
  162. default:
  163. // No action for non-terminators.
  164. break;
  165. }
  166. }
  167. // No terminator was found.
  168. return StringLiteral(source_text, source_text.drop_front(prefix_len),
  169. content_needs_validation, hash_level, introducer->kind,
  170. /*is_terminated=*/false);
  171. }
  172. // Given a string that contains at least one newline, find the indent (the
  173. // leading sequence of horizontal whitespace) of its final line.
  174. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  175. int indent_end = text.size();
  176. for (int i = indent_end - 1; i >= 0; --i) {
  177. if (text[i] == '\n') {
  178. int indent_start = i + 1;
  179. return text.substr(indent_start, indent_end - indent_start);
  180. }
  181. if (!IsSpace(text[i])) {
  182. indent_end = i;
  183. }
  184. }
  185. llvm_unreachable("Given text is required to contain a newline.");
  186. }
  187. // Check the literal is indented properly, if it's a multi-line litera.
  188. // Find the leading whitespace that should be removed from each line of a
  189. // multi-line string literal.
  190. static auto CheckIndent(DiagnosticEmitter& emitter, llvm::StringRef text,
  191. llvm::StringRef content) -> llvm::StringRef {
  192. // Find the leading horizontal whitespace on the final line of this literal.
  193. // Note that for an empty literal, this might not be inside the content.
  194. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  195. // The last line is not permitted to contain any content after its
  196. // indentation.
  197. if (indent.end() != content.end()) {
  198. CARBON_DIAGNOSTIC(
  199. ContentBeforeStringTerminator, Error,
  200. "only whitespace is permitted before the closing `'''` of a "
  201. "multi-line string");
  202. emitter.Emit(indent.end(), ContentBeforeStringTerminator);
  203. }
  204. return indent;
  205. }
  206. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  207. static auto ExpandUnicodeEscapeSequence(DiagnosticEmitter& emitter,
  208. llvm::StringRef digits,
  209. char*& buffer_cursor) -> bool {
  210. unsigned code_point;
  211. if (!CanLexInt(emitter, digits)) {
  212. return false;
  213. }
  214. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  215. CARBON_DIAGNOSTIC(UnicodeEscapeTooLarge, Error,
  216. "code point specified by `\\u{{...}}` escape is greater "
  217. "than 0x10FFFF");
  218. emitter.Emit(digits.begin(), UnicodeEscapeTooLarge);
  219. return false;
  220. }
  221. if (code_point >= 0xD800 && code_point < 0xE000) {
  222. CARBON_DIAGNOSTIC(UnicodeEscapeSurrogate, Error,
  223. "code point specified by `\\u{{...}}` escape is a "
  224. "surrogate character");
  225. emitter.Emit(digits.begin(), UnicodeEscapeSurrogate);
  226. return false;
  227. }
  228. // Convert the code point to a sequence of UTF-8 code units.
  229. // Every code point fits in 6 UTF-8 code units.
  230. const llvm::UTF32 utf32_code_units[1] = {code_point};
  231. const llvm::UTF32* src_pos = utf32_code_units;
  232. auto*& buffer_cursor_as_utf8 = reinterpret_cast<llvm::UTF8*&>(buffer_cursor);
  233. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  234. &src_pos, src_pos + 1, &buffer_cursor_as_utf8, buffer_cursor_as_utf8 + 6,
  235. llvm::strictConversion);
  236. if (conv_result != llvm::conversionOK) {
  237. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  238. }
  239. return true;
  240. }
  // Writes `append_char` at `buffer_cursor` and moves the cursor one character
  // forward.
  static auto AppendChar(char*& buffer_cursor, char append_char) -> void {
    *buffer_cursor++ = append_char;
  }
  246. // Appends the front of contents to the buffer and advances the cursor.
  247. static auto AppendFrontOfContents(char*& buffer_cursor,
  248. llvm::StringRef contents, size_t len_or_npos)
  249. -> void {
  250. auto len =
  251. len_or_npos == llvm::StringRef::npos ? contents.size() : len_or_npos;
  252. memcpy(buffer_cursor, contents.data(), len);
  253. buffer_cursor += len;
  254. }
  255. // Expand an escape sequence, appending the expanded value to the given
  256. // `result` string. `content` is the string content, starting from the first
  257. // character after the escape sequence introducer (for example, the `n` in
  258. // `\n`), and will be updated to remove the leading escape sequence.
  259. static auto ExpandAndConsumeEscapeSequence(DiagnosticEmitter& emitter,
  260. llvm::StringRef& content,
  261. char*& buffer_cursor) -> void {
  262. CARBON_CHECK(!content.empty(), "should have escaped closing delimiter");
  263. char first = content.front();
  264. content = content.drop_front(1);
  265. switch (first) {
  266. case 't':
  267. AppendChar(buffer_cursor, '\t');
  268. return;
  269. case 'n':
  270. AppendChar(buffer_cursor, '\n');
  271. return;
  272. case 'r':
  273. AppendChar(buffer_cursor, '\r');
  274. return;
  275. case '"':
  276. AppendChar(buffer_cursor, '"');
  277. return;
  278. case '\'':
  279. AppendChar(buffer_cursor, '\'');
  280. return;
  281. case '\\':
  282. AppendChar(buffer_cursor, '\\');
  283. return;
  284. case '0':
  285. AppendChar(buffer_cursor, '\0');
  286. if (!content.empty() && IsDecimalDigit(content.front())) {
  287. CARBON_DIAGNOSTIC(
  288. DecimalEscapeSequence, Error,
  289. "decimal digit follows `\\0` escape sequence. Use `\\x00` instead "
  290. "of `\\0` if the next character is a digit");
  291. emitter.Emit(content.begin(), DecimalEscapeSequence);
  292. return;
  293. }
  294. return;
  295. case 'x':
  296. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  297. IsUpperHexDigit(content[1])) {
  298. AppendChar(buffer_cursor, static_cast<char>(llvm::hexFromNibbles(
  299. content[0], content[1])));
  300. content = content.drop_front(2);
  301. return;
  302. }
  303. CARBON_DIAGNOSTIC(HexadecimalEscapeMissingDigits, Error,
  304. "escape sequence `\\x` must be followed by two "
  305. "uppercase hexadecimal digits, for example `\\x0F`");
  306. emitter.Emit(content.begin(), HexadecimalEscapeMissingDigits);
  307. break;
  308. case 'u': {
  309. llvm::StringRef remaining = content;
  310. if (remaining.consume_front("{")) {
  311. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  312. remaining = remaining.drop_front(digits.size());
  313. if (!digits.empty() && remaining.consume_front("}")) {
  314. if (!ExpandUnicodeEscapeSequence(emitter, digits, buffer_cursor)) {
  315. break;
  316. }
  317. content = remaining;
  318. return;
  319. }
  320. }
  321. CARBON_DIAGNOSTIC(
  322. UnicodeEscapeMissingBracedDigits, Error,
  323. "escape sequence `\\u` must be followed by a braced sequence of "
  324. "uppercase hexadecimal digits, for example `\\u{{70AD}}`");
  325. emitter.Emit(content.begin(), UnicodeEscapeMissingBracedDigits);
  326. break;
  327. }
  328. default:
  329. CARBON_DIAGNOSTIC(UnknownEscapeSequence, Error,
  330. "unrecognized escape sequence `{0}`", char);
  331. emitter.Emit(content.begin() - 1, UnknownEscapeSequence, first);
  332. break;
  333. }
  334. // If we get here, we didn't recognize this escape sequence and have already
  335. // issued a diagnostic. For error recovery purposes, expand this escape
  336. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  337. AppendChar(buffer_cursor, first);
  338. }
  339. // Expand any escape sequences in the given string literal.
  340. static auto ExpandEscapeSequencesAndRemoveIndent(
  341. DiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
  342. llvm::StringRef indent, char* buffer) -> llvm::StringRef {
  343. char* buffer_cursor = buffer;
  344. llvm::SmallString<16> escape("\\");
  345. escape.resize(1 + hash_level, '#');
  346. // Process each line of the string literal.
  347. while (true) {
  348. // Every non-empty line (that contains anything other than horizontal
  349. // whitespace) is required to start with the string's indent. For error
  350. // recovery, remove all leading whitespace if the indent doesn't match.
  351. if (!contents.consume_front(indent)) {
  352. const char* line_start = contents.begin();
  353. contents = contents.drop_while(IsHorizontalWhitespace);
  354. if (!contents.starts_with("\n")) {
  355. CARBON_DIAGNOSTIC(
  356. MismatchedIndentInString, Error,
  357. "indentation does not match that of the closing `'''` in "
  358. "multi-line string literal");
  359. emitter.Emit(line_start, MismatchedIndentInString);
  360. }
  361. }
  362. // Tracks the position at the last time we expanded an escape to ensure we
  363. // don't misinterpret it as unescaped when backtracking.
  364. char* buffer_last_escape = buffer_cursor;
  365. // Process the contents of the line.
  366. while (true) {
  367. // Append the next segment of plain text.
  368. auto end_of_regular_text = contents.find_if([](char c) {
  369. return c == '\n' || c == '\\' ||
  370. (IsHorizontalWhitespace(c) && c != ' ');
  371. });
  372. AppendFrontOfContents(buffer_cursor, contents, end_of_regular_text);
  373. if (end_of_regular_text == llvm::StringRef::npos) {
  374. return llvm::StringRef(buffer, buffer_cursor - buffer);
  375. }
  376. contents = contents.drop_front(end_of_regular_text);
  377. if (contents.consume_front("\n")) {
  378. // Trailing whitespace in the source before a newline doesn't contribute
  379. // to the string literal value. However, escaped whitespace (like `\t`)
  380. // and any whitespace just before that does contribute.
  381. while (buffer_cursor > buffer_last_escape) {
  382. char back = *(buffer_cursor - 1);
  383. if (back == '\n' || !IsSpace(back)) {
  384. break;
  385. }
  386. --buffer_cursor;
  387. }
  388. AppendChar(buffer_cursor, '\n');
  389. // Move onto to the next line.
  390. break;
  391. }
  392. // TODO: Also reject vertical whitespace other than \n, but ignore a \r
  393. // before a \n.
  394. if (IsHorizontalWhitespace(contents.front())) {
  395. // Horizontal whitespace other than ` ` is valid only at the end of a
  396. // line.
  397. CARBON_CHECK(contents.front() != ' ',
  398. "should not have stopped at a plain space");
  399. auto after_space = contents.find_if_not(IsHorizontalWhitespace);
  400. if (after_space == llvm::StringRef::npos ||
  401. contents[after_space] != '\n') {
  402. // TODO: Include the source range of the whitespace up to
  403. // `contents.begin() + after_space` in the diagnostic.
  404. CARBON_DIAGNOSTIC(
  405. InvalidHorizontalWhitespaceInString, Error,
  406. "whitespace other than plain space must be expressed with an "
  407. "escape sequence in a string literal");
  408. emitter.Emit(contents.begin(), InvalidHorizontalWhitespaceInString);
  409. // Include the whitespace in the string contents for error recovery.
  410. AppendFrontOfContents(buffer_cursor, contents, after_space);
  411. }
  412. contents = contents.substr(after_space);
  413. continue;
  414. }
  415. if (!contents.consume_front(escape)) {
  416. // This is not an escape sequence, just a raw `\`.
  417. AppendChar(buffer_cursor, contents.front());
  418. contents = contents.drop_front(1);
  419. continue;
  420. }
  421. if (contents.consume_front("\n")) {
  422. // An escaped newline ends the line without producing any content and
  423. // without trimming trailing whitespace.
  424. break;
  425. }
  426. // Handle this escape sequence.
  427. ExpandAndConsumeEscapeSequence(emitter, contents, buffer_cursor);
  428. buffer_last_escape = buffer_cursor;
  429. }
  430. }
  431. }
  432. // Returns whether the given character is a control character.
  433. static auto IsControlCharacter(llvm::UTF32 c) -> bool {
  434. return (c >= 0 && c <= 0x1F) || (c >= 0x7F && c <= 0x9F);
  435. }
  436. auto StringLiteral::ComputeCharLiteralValue(
  437. Diagnostics::Emitter<const char*>& emitter) const
  438. -> std::optional<CharLiteralValue> {
  439. CARBON_DCHECK(kind_ == Kind::Char);
  440. CARBON_DCHECK(is_terminated_);
  441. if (hash_level_ != 0) {
  442. CARBON_DIAGNOSTIC(CharLiteralRaw, Error,
  443. "unexpected `#` before character literal");
  444. emitter.Emit(text_.begin(), CharLiteralRaw);
  445. }
  446. // Allocate a buffer sized to the content. Note it's possible this could be
  447. // more efficient/faster with a `ExpandEscapeSequencesAndRemoveIndent`
  448. // implementation aware of the buffer size, but this is trying to share logic
  449. // with string expansion.
  450. llvm::SmallVector<char> buffer;
  451. buffer.resize_for_overwrite(content_.size());
  452. auto result = ExpandEscapeSequencesAndRemoveIndent(
  453. emitter, content_, hash_level_, /*indent=*/llvm::StringRef(),
  454. buffer.data());
  455. CARBON_CHECK(result.size() <= content_.size(),
  456. "Content grew from {0} to {1}: `{2}`", content_.size(),
  457. result.size(), content_);
  458. llvm::UTF32 target[1];
  459. const auto* source_cursor =
  460. reinterpret_cast<const llvm::UTF8*>(result.begin());
  461. llvm::UTF32* target_cursor = target;
  462. llvm::ConversionResult conv_result = llvm::ConvertUTF8toUTF32(
  463. &source_cursor, reinterpret_cast<const llvm::UTF8*>(result.end()),
  464. &target_cursor, std::end(target), llvm::strictConversion);
  465. switch (conv_result) {
  466. case llvm::conversionOK: {
  467. if (target_cursor == target) {
  468. CARBON_DIAGNOSTIC(CharLiteralEmpty, Error, "empty character literal");
  469. emitter.Emit(text_.begin(), CharLiteralEmpty);
  470. return std::nullopt;
  471. }
  472. auto result = target[0];
  473. // Check for a control character that's not written as an escape sequence.
  474. // Also don't diagnose horizontal whitespace, because that was already
  475. // done by ExpandEscapeSequencesAndRemoveIndent.
  476. if (IsControlCharacter(result) && content_.front() != '\\' &&
  477. !IsHorizontalWhitespace(content_.front())) {
  478. // TODO: Suggest \0 instead of \u{00} for a NUL character.
  479. CARBON_DIAGNOSTIC(CharLiteralControlCharacter, Error,
  480. "control character in character literal; specify as "
  481. "escape sequence `\\u{{{0:X-2}}`",
  482. llvm::UTF32);
  483. emitter.Emit(text_.begin(), CharLiteralControlCharacter, result);
  484. return std::nullopt;
  485. }
  486. if (content_.starts_with("\\x")) {
  487. CARBON_DIAGNOSTIC(CharLiteralHexEscape, Error,
  488. "escape sequence `\\x` in character literal; specify "
  489. "as escape sequence `\\u{{{0:X-2}}`",
  490. llvm::UTF32);
  491. emitter.Emit(text_.begin(), CharLiteralHexEscape, result);
  492. return std::nullopt;
  493. }
  494. return CharLiteralValue{.value = static_cast<int32_t>(result)};
  495. }
  496. case llvm::sourceExhausted: {
  497. CARBON_DIAGNOSTIC(CharLiteralUnderflow, Error, "incomplete UTF-8");
  498. emitter.Emit(text_.begin(), CharLiteralUnderflow);
  499. return std::nullopt;
  500. }
  501. case llvm::targetExhausted: {
  502. CARBON_DIAGNOSTIC(CharLiteralOverflow, Error, "too many characters");
  503. emitter.Emit(text_.begin(), CharLiteralOverflow);
  504. return std::nullopt;
  505. }
  506. case llvm::sourceIllegal: {
  507. CARBON_DIAGNOSTIC(CharLiteralInvalidUTF8, Error,
  508. "invalid UTF-8 character");
  509. emitter.Emit(text_.begin(), CharLiteralInvalidUTF8);
  510. return std::nullopt;
  511. }
  512. }
  513. }
  514. auto StringLiteral::ComputeStringValue(llvm::BumpPtrAllocator& allocator,
  515. DiagnosticEmitter& emitter) const
  516. -> llvm::StringRef {
  517. CARBON_DCHECK(kind_ != Kind::Char);
  518. CARBON_DCHECK(is_terminated_);
  519. if (kind_ == Kind::MultiLineWithDoubleQuotes) {
  520. CARBON_DIAGNOSTIC(
  521. MultiLineStringWithDoubleQuotes, Error,
  522. "use `'''` delimiters for a multi-line string literal, not `\"\"\"`");
  523. emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
  524. }
  525. llvm::StringRef indent = IsMultiLine(kind_)
  526. ? CheckIndent(emitter, text_, content_)
  527. : llvm::StringRef();
  528. if (!content_needs_validation_ && (!IsMultiLine(kind_) || indent.empty())) {
  529. return content_;
  530. }
  531. // "Expanding" escape sequences should only ever shorten content. As a
  532. // consequence, the output string should allows fit within this allocation.
  533. // Although this may waste some space, it avoids a reallocation.
  534. auto result = ExpandEscapeSequencesAndRemoveIndent(
  535. emitter, content_, hash_level_, indent,
  536. allocator.Allocate<char>(content_.size()));
  537. CARBON_CHECK(result.size() <= content_.size(),
  538. "Content grew from {0} to {1}: `{2}`", content_.size(),
  539. result.size(), content_);
  540. return result;
  541. }
  542. } // namespace Carbon::Lex