string_literal.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/string_literal.h"
  5. #include "common/check.h"
  6. #include "llvm/ADT/SmallString.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "llvm/Support/ConvertUTF.h"
  9. #include "llvm/Support/ErrorHandling.h"
  10. #include "toolchain/lex/character_set.h"
  11. #include "toolchain/lex/helpers.h"
  12. namespace Carbon::Lex {
  // Emitter type used for every diagnostic issued in this file. Locations are
  // `const char*` values; the callers below pass pointers taken from the string
  // literal's text (for example `indent.end()` or `digits.begin()`).
  13. using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
  // Spelling of the delimiter that introduces and terminates a multi-line
  // string literal.
  static constexpr char MultiLineIndicator[] = R"(''')";

  // Double-quoted spelling of the multi-line delimiter. It is recognized while
  // lexing so that the literal can be recovered, and is diagnosed when the
  // literal's value is computed.
  static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
  16. struct StringLiteral::Introducer {
  17. // The kind of string being introduced.
  18. MultiLineKind kind;
  19. // The terminator for the string, without any '#' suffixes.
  20. llvm::StringRef terminator;
  21. // The length of the introducer, including the file type indicator and
  22. // newline for a multi-line string literal.
  23. int prefix_size;
  24. // Lex the introducer for a string literal, after any '#'s.
  25. static auto Lex(llvm::StringRef source_text) -> std::optional<Introducer>;
  26. };
  27. // Lex the introducer for a string literal, after any '#'s.
  28. //
  29. // We lex multi-line literals when spelled with either ''' or """ for error
  30. // recovery purposes, and reject """ literals after lexing.
  31. auto StringLiteral::Introducer::Lex(llvm::StringRef source_text)
  32. -> std::optional<Introducer> {
  33. MultiLineKind kind = NotMultiLine;
  34. llvm::StringRef indicator;
  35. if (source_text.startswith(MultiLineIndicator)) {
  36. kind = MultiLine;
  37. indicator = llvm::StringRef(MultiLineIndicator);
  38. } else if (source_text.startswith(DoubleQuotedMultiLineIndicator)) {
  39. kind = MultiLineWithDoubleQuotes;
  40. indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
  41. }
  42. if (kind != NotMultiLine) {
  43. // The rest of the line must be a valid file type indicator: a sequence of
  44. // characters containing neither '#' nor '"' followed by a newline.
  45. auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
  46. if (prefix_end != llvm::StringRef::npos &&
  47. source_text[prefix_end] == '\n') {
  48. // Include the newline in the prefix size.
  49. return Introducer{.kind = kind,
  50. .terminator = indicator,
  51. .prefix_size = static_cast<int>(prefix_end + 1)};
  52. }
  53. }
  54. if (!source_text.empty() && source_text[0] == '"') {
  55. return Introducer{
  56. .kind = NotMultiLine, .terminator = "\"", .prefix_size = 1};
  57. }
  58. return std::nullopt;
  59. }
  60. namespace {
  // A membership table over every possible `char` value. Lookup is a single
  // array index, and the table can be built in a constant expression.
  struct alignas(8) CharSet {
    // One flag per `unsigned char` value; `true` means the value is in the set.
    bool Elements[UCHAR_MAX + 1];

    // Builds a set containing exactly the listed characters.
    constexpr CharSet(std::initializer_list<char> chars) : Elements() {
      for (const char* it = chars.begin(); it != chars.end(); ++it) {
        // Go through `unsigned char` so negative `char` values index correctly.
        Elements[static_cast<unsigned char>(*it)] = true;
      }
    }

    // Returns whether `c` is a member of the set.
    constexpr auto operator[](char c) const -> bool {
      const auto index = static_cast<unsigned char>(c);
      return Elements[index];
    }
  };
  73. } // namespace
  74. auto StringLiteral::Lex(llvm::StringRef source_text)
  75. -> std::optional<StringLiteral> {
  76. int64_t cursor = 0;
  77. const int64_t source_text_size = source_text.size();
  78. // Determine the number of hashes prefixing.
  79. while (cursor < source_text_size && source_text[cursor] == '#') {
  80. ++cursor;
  81. }
  82. const int hash_level = cursor;
  83. const std::optional<Introducer> introducer =
  84. Introducer::Lex(source_text.substr(hash_level));
  85. if (!introducer) {
  86. return std::nullopt;
  87. }
  88. cursor += introducer->prefix_size;
  89. const int prefix_len = cursor;
  90. llvm::SmallString<16> terminator(introducer->terminator);
  91. llvm::SmallString<16> escape("\\");
  92. // The terminator and escape sequence marker require a number of '#'s
  93. // matching the leading sequence of '#'s.
  94. terminator.resize(terminator.size() + hash_level, '#');
  95. escape.resize(escape.size() + hash_level, '#');
  96. bool content_needs_validation = false;
  97. // TODO: Detect indent / dedent for multi-line string literals in order to
  98. // stop parsing on dedent before a terminator is found.
  99. for (; cursor < source_text_size; ++cursor) {
  100. // Use a lookup table to allow us to quickly skip uninteresting characters.
  101. static constexpr CharSet InterestingChars = {'\\', '\n', '"', '\'', '\t'};
  102. if (!InterestingChars[source_text[cursor]]) {
  103. continue;
  104. }
  105. // This switch and loop structure relies on multi-character terminators and
  106. // escape sequences starting with a predictable character and not containing
  107. // embedded and unescaped terminators or newlines.
  108. switch (source_text[cursor]) {
  109. case '\t':
  110. // Tabs have extra validation.
  111. content_needs_validation = true;
  112. break;
  113. case '\\':
  114. if (escape.size() == 1 ||
  115. source_text.substr(cursor + 1).startswith(escape.substr(1))) {
  116. content_needs_validation = true;
  117. cursor += escape.size();
  118. // If there's either not a character following the escape, or it's a
  119. // single-line string and the escaped character is a newline, we
  120. // should stop here.
  121. if (cursor >= source_text_size || (introducer->kind == NotMultiLine &&
  122. source_text[cursor] == '\n')) {
  123. llvm::StringRef text = source_text.take_front(cursor);
  124. return StringLiteral(text, text.drop_front(prefix_len),
  125. content_needs_validation, hash_level,
  126. introducer->kind,
  127. /*is_terminated=*/false);
  128. }
  129. }
  130. break;
  131. case '\n':
  132. if (introducer->kind == NotMultiLine) {
  133. llvm::StringRef text = source_text.take_front(cursor);
  134. return StringLiteral(text, text.drop_front(prefix_len),
  135. content_needs_validation, hash_level,
  136. introducer->kind,
  137. /*is_terminated=*/false);
  138. }
  139. break;
  140. case '"':
  141. case '\'':
  142. if (source_text.substr(cursor).startswith(terminator)) {
  143. llvm::StringRef text =
  144. source_text.substr(0, cursor + terminator.size());
  145. llvm::StringRef content =
  146. source_text.substr(prefix_len, cursor - prefix_len);
  147. return StringLiteral(text, content, content_needs_validation,
  148. hash_level, introducer->kind,
  149. /*is_terminated=*/true);
  150. }
  151. break;
  152. default:
  153. // No action for non-terminators.
  154. break;
  155. }
  156. }
  157. // No terminator was found.
  158. return StringLiteral(source_text, source_text.drop_front(prefix_len),
  159. content_needs_validation, hash_level, introducer->kind,
  160. /*is_terminated=*/false);
  161. }
  162. // Given a string that contains at least one newline, find the indent (the
  163. // leading sequence of horizontal whitespace) of its final line.
  164. static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
  165. int indent_end = text.size();
  166. for (int i = indent_end - 1; i >= 0; --i) {
  167. if (text[i] == '\n') {
  168. int indent_start = i + 1;
  169. return text.substr(indent_start, indent_end - indent_start);
  170. }
  171. if (!IsSpace(text[i])) {
  172. indent_end = i;
  173. }
  174. }
  175. llvm_unreachable("Given text is required to contain a newline.");
  176. }
  177. // Check the literal is indented properly, if it's a multi-line litera.
  178. // Find the leading whitespace that should be removed from each line of a
  179. // multi-line string literal.
  180. static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
  181. llvm::StringRef content) -> llvm::StringRef {
  182. // Find the leading horizontal whitespace on the final line of this literal.
  183. // Note that for an empty literal, this might not be inside the content.
  184. llvm::StringRef indent = ComputeIndentOfFinalLine(text);
  185. // The last line is not permitted to contain any content after its
  186. // indentation.
  187. if (indent.end() != content.end()) {
  188. CARBON_DIAGNOSTIC(
  189. ContentBeforeStringTerminator, Error,
  190. "Only whitespace is permitted before the closing `'''` of a "
  191. "multi-line string.");
  192. emitter.Emit(indent.end(), ContentBeforeStringTerminator);
  193. }
  194. return indent;
  195. }
  196. // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
  197. static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
  198. llvm::StringRef digits,
  199. char*& buffer_cursor) -> bool {
  200. unsigned code_point;
  201. if (!CanLexInteger(emitter, digits)) {
  202. return false;
  203. }
  204. if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
  205. CARBON_DIAGNOSTIC(UnicodeEscapeTooLarge, Error,
  206. "Code point specified by `\\u{{...}}` escape is greater "
  207. "than 0x10FFFF.");
  208. emitter.Emit(digits.begin(), UnicodeEscapeTooLarge);
  209. return false;
  210. }
  211. if (code_point >= 0xD800 && code_point < 0xE000) {
  212. CARBON_DIAGNOSTIC(UnicodeEscapeSurrogate, Error,
  213. "Code point specified by `\\u{{...}}` escape is a "
  214. "surrogate character.");
  215. emitter.Emit(digits.begin(), UnicodeEscapeSurrogate);
  216. return false;
  217. }
  218. // Convert the code point to a sequence of UTF-8 code units.
  219. // Every code point fits in 6 UTF-8 code units.
  220. const llvm::UTF32 utf32_code_units[1] = {code_point};
  221. const llvm::UTF32* src_pos = utf32_code_units;
  222. auto*& buffer_cursor_as_utf8 = reinterpret_cast<llvm::UTF8*&>(buffer_cursor);
  223. llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
  224. &src_pos, src_pos + 1, &buffer_cursor_as_utf8, buffer_cursor_as_utf8 + 6,
  225. llvm::strictConversion);
  226. if (conv_result != llvm::conversionOK) {
  227. llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
  228. }
  229. return true;
  230. }
  // Writes `append_char` at the position `buffer_cursor` points to, then moves
  // the cursor forward by one. The caller must ensure the buffer has room.
  static auto AppendChar(char*& buffer_cursor, char append_char) -> void {
    *buffer_cursor++ = append_char;
  }
  236. // Appends the front of contents to the buffer and advances the cursor.
  237. static auto AppendFrontOfContents(char*& buffer_cursor,
  238. llvm::StringRef contents, size_t len_or_npos)
  239. -> void {
  240. auto len =
  241. len_or_npos == llvm::StringRef::npos ? contents.size() : len_or_npos;
  242. memcpy(buffer_cursor, contents.data(), len);
  243. buffer_cursor += len;
  244. }
  245. // Expand an escape sequence, appending the expanded value to the given
  246. // `result` string. `content` is the string content, starting from the first
  247. // character after the escape sequence introducer (for example, the `n` in
  248. // `\n`), and will be updated to remove the leading escape sequence.
  249. static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
  250. llvm::StringRef& content,
  251. char*& buffer_cursor) -> void {
  252. CARBON_CHECK(!content.empty()) << "should have escaped closing delimiter";
  253. char first = content.front();
  254. content = content.drop_front(1);
  255. switch (first) {
  256. case 't':
  257. AppendChar(buffer_cursor, '\t');
  258. return;
  259. case 'n':
  260. AppendChar(buffer_cursor, '\n');
  261. return;
  262. case 'r':
  263. AppendChar(buffer_cursor, '\r');
  264. return;
  265. case '"':
  266. AppendChar(buffer_cursor, '"');
  267. return;
  268. case '\'':
  269. AppendChar(buffer_cursor, '\'');
  270. return;
  271. case '\\':
  272. AppendChar(buffer_cursor, '\\');
  273. return;
  274. case '0':
  275. AppendChar(buffer_cursor, '\0');
  276. if (!content.empty() && IsDecimalDigit(content.front())) {
  277. CARBON_DIAGNOSTIC(
  278. DecimalEscapeSequence, Error,
  279. "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead "
  280. "of `\\0` if the next character is a digit.");
  281. emitter.Emit(content.begin(), DecimalEscapeSequence);
  282. return;
  283. }
  284. return;
  285. case 'x':
  286. if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
  287. IsUpperHexDigit(content[1])) {
  288. AppendChar(buffer_cursor, static_cast<char>(llvm::hexFromNibbles(
  289. content[0], content[1])));
  290. content = content.drop_front(2);
  291. return;
  292. }
  293. CARBON_DIAGNOSTIC(HexadecimalEscapeMissingDigits, Error,
  294. "Escape sequence `\\x` must be followed by two "
  295. "uppercase hexadecimal digits, for example `\\x0F`.");
  296. emitter.Emit(content.begin(), HexadecimalEscapeMissingDigits);
  297. break;
  298. case 'u': {
  299. llvm::StringRef remaining = content;
  300. if (remaining.consume_front("{")) {
  301. llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
  302. remaining = remaining.drop_front(digits.size());
  303. if (!digits.empty() && remaining.consume_front("}")) {
  304. if (!ExpandUnicodeEscapeSequence(emitter, digits, buffer_cursor)) {
  305. break;
  306. }
  307. content = remaining;
  308. return;
  309. }
  310. }
  311. CARBON_DIAGNOSTIC(
  312. UnicodeEscapeMissingBracedDigits, Error,
  313. "Escape sequence `\\u` must be followed by a braced sequence of "
  314. "uppercase hexadecimal digits, for example `\\u{{70AD}}`.");
  315. emitter.Emit(content.begin(), UnicodeEscapeMissingBracedDigits);
  316. break;
  317. }
  318. default:
  319. CARBON_DIAGNOSTIC(UnknownEscapeSequence, Error,
  320. "Unrecognized escape sequence `{0}`.", char);
  321. emitter.Emit(content.begin() - 1, UnknownEscapeSequence, first);
  322. break;
  323. }
  324. // If we get here, we didn't recognize this escape sequence and have already
  325. // issued a diagnostic. For error recovery purposes, expand this escape
  326. // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
  327. AppendChar(buffer_cursor, first);
  328. }
  329. // Expand any escape sequences in the given string literal.
  330. static auto ExpandEscapeSequencesAndRemoveIndent(
  331. LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
  332. llvm::StringRef indent, char* buffer) -> llvm::StringRef {
  333. char* buffer_cursor = buffer;
  334. llvm::SmallString<16> escape("\\");
  335. escape.resize(1 + hash_level, '#');
  336. // Process each line of the string literal.
  337. while (true) {
  338. // Every non-empty line (that contains anything other than horizontal
  339. // whitespace) is required to start with the string's indent. For error
  340. // recovery, remove all leading whitespace if the indent doesn't match.
  341. if (!contents.consume_front(indent)) {
  342. const char* line_start = contents.begin();
  343. contents = contents.drop_while(IsHorizontalWhitespace);
  344. if (!contents.startswith("\n")) {
  345. CARBON_DIAGNOSTIC(
  346. MismatchedIndentInString, Error,
  347. "Indentation does not match that of the closing `'''` in "
  348. "multi-line string literal.");
  349. emitter.Emit(line_start, MismatchedIndentInString);
  350. }
  351. }
  352. // Tracks the position at the last time we expanded an escape to ensure we
  353. // don't misinterpret it as unescaped when backtracking.
  354. char* buffer_last_escape = buffer_cursor;
  355. // Process the contents of the line.
  356. while (true) {
  357. // Append the next segment of plain text.
  358. auto end_of_regular_text = contents.find_if([](char c) {
  359. return c == '\n' || c == '\\' ||
  360. (IsHorizontalWhitespace(c) && c != ' ');
  361. });
  362. AppendFrontOfContents(buffer_cursor, contents, end_of_regular_text);
  363. if (end_of_regular_text == llvm::StringRef::npos) {
  364. return llvm::StringRef(buffer, buffer_cursor - buffer);
  365. }
  366. contents = contents.drop_front(end_of_regular_text);
  367. if (contents.consume_front("\n")) {
  368. // Trailing whitespace in the source before a newline doesn't contribute
  369. // to the string literal value. However, escaped whitespace (like `\t`)
  370. // and any whitespace just before that does contribute.
  371. while (buffer_cursor > buffer_last_escape) {
  372. char back = *(buffer_cursor - 1);
  373. if (back == '\n' || !IsSpace(back)) {
  374. break;
  375. }
  376. --buffer_cursor;
  377. }
  378. AppendChar(buffer_cursor, '\n');
  379. // Move onto to the next line.
  380. break;
  381. }
  382. if (IsHorizontalWhitespace(contents.front())) {
  383. // Horizontal whitespace other than ` ` is valid only at the end of a
  384. // line.
  385. CARBON_CHECK(contents.front() != ' ')
  386. << "should not have stopped at a plain space";
  387. auto after_space = contents.find_if_not(IsHorizontalWhitespace);
  388. if (after_space == llvm::StringRef::npos ||
  389. contents[after_space] != '\n') {
  390. // TODO: Include the source range of the whitespace up to
  391. // `contents.begin() + after_space` in the diagnostic.
  392. CARBON_DIAGNOSTIC(
  393. InvalidHorizontalWhitespaceInString, Error,
  394. "Whitespace other than plain space must be expressed with an "
  395. "escape sequence in a string literal.");
  396. emitter.Emit(contents.begin(), InvalidHorizontalWhitespaceInString);
  397. // Include the whitespace in the string contents for error recovery.
  398. AppendFrontOfContents(buffer_cursor, contents, after_space);
  399. }
  400. contents = contents.substr(after_space);
  401. continue;
  402. }
  403. if (!contents.consume_front(escape)) {
  404. // This is not an escape sequence, just a raw `\`.
  405. AppendChar(buffer_cursor, contents.front());
  406. contents = contents.drop_front(1);
  407. continue;
  408. }
  409. if (contents.consume_front("\n")) {
  410. // An escaped newline ends the line without producing any content and
  411. // without trimming trailing whitespace.
  412. break;
  413. }
  414. // Handle this escape sequence.
  415. ExpandAndConsumeEscapeSequence(emitter, contents, buffer_cursor);
  416. buffer_last_escape = buffer_cursor;
  417. }
  418. }
  419. }
  420. auto StringLiteral::ComputeValue(llvm::BumpPtrAllocator& allocator,
  421. LexerDiagnosticEmitter& emitter) const
  422. -> llvm::StringRef {
  423. if (!is_terminated_) {
  424. return "";
  425. }
  426. if (multi_line_ == MultiLineWithDoubleQuotes) {
  427. CARBON_DIAGNOSTIC(
  428. MultiLineStringWithDoubleQuotes, Error,
  429. "Use `'''` delimiters for a multi-line string literal, not `\"\"\"`.");
  430. emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
  431. }
  432. llvm::StringRef indent =
  433. multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
  434. if (!content_needs_validation_ && (!multi_line_ || indent.empty())) {
  435. return content_;
  436. }
  437. // "Expanding" escape sequences should only ever shorten content. As a
  438. // consequence, the output string should allows fit within this allocation.
  439. // Although this may waste some space, it avoids a reallocation.
  440. auto result = ExpandEscapeSequencesAndRemoveIndent(
  441. emitter, content_, hash_level_, indent,
  442. allocator.Allocate<char>(content_.size()));
  443. CARBON_CHECK(result.size() <= content_.size())
  444. << "Content grew from " << content_.size() << " to " << result.size()
  445. << ": `" << content_ << "`";
  446. return result;
  447. }
  448. } // namespace Carbon::Lex