| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496 |
- // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
- // Exceptions. See /LICENSE for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- #include "toolchain/lex/string_literal.h"
- #include "common/check.h"
- #include "llvm/ADT/SmallString.h"
- #include "llvm/ADT/StringExtras.h"
- #include "llvm/Support/ConvertUTF.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "toolchain/lex/character_set.h"
- #include "toolchain/lex/helpers.h"
- namespace Carbon::Lex {
- using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;
- static constexpr char MultiLineIndicator[] = R"(''')";
- static constexpr char DoubleQuotedMultiLineIndicator[] = R"(""")";
- struct StringLiteral::Introducer {
- // The kind of string being introduced.
- MultiLineKind kind;
- // The terminator for the string, without any '#' suffixes.
- llvm::StringRef terminator;
- // The length of the introducer, including the file type indicator and
- // newline for a multi-line string literal.
- int prefix_size;
- // Lex the introducer for a string literal, after any '#'s.
- static auto Lex(llvm::StringRef source_text) -> std::optional<Introducer>;
- };
- // Lex the introducer for a string literal, after any '#'s.
- //
- // We lex multi-line literals when spelled with either ''' or """ for error
- // recovery purposes, and reject """ literals after lexing.
- auto StringLiteral::Introducer::Lex(llvm::StringRef source_text)
- -> std::optional<Introducer> {
- MultiLineKind kind = NotMultiLine;
- llvm::StringRef indicator;
- if (source_text.startswith(MultiLineIndicator)) {
- kind = MultiLine;
- indicator = llvm::StringRef(MultiLineIndicator);
- } else if (source_text.startswith(DoubleQuotedMultiLineIndicator)) {
- kind = MultiLineWithDoubleQuotes;
- indicator = llvm::StringRef(DoubleQuotedMultiLineIndicator);
- }
- if (kind != NotMultiLine) {
- // The rest of the line must be a valid file type indicator: a sequence of
- // characters containing neither '#' nor '"' followed by a newline.
- auto prefix_end = source_text.find_first_of("#\n\"", indicator.size());
- if (prefix_end != llvm::StringRef::npos &&
- source_text[prefix_end] == '\n') {
- // Include the newline in the prefix size.
- return Introducer{.kind = kind,
- .terminator = indicator,
- .prefix_size = static_cast<int>(prefix_end + 1)};
- }
- }
- if (!source_text.empty() && source_text[0] == '"') {
- return Introducer{
- .kind = NotMultiLine, .terminator = "\"", .prefix_size = 1};
- }
- return std::nullopt;
- }
namespace {
// A membership table with one flag for every possible `char` value. Lookups
// and insertions index by the character's `unsigned char` representation.
struct alignas(8) CharSet {
  bool Elements[UCHAR_MAX + 1];

  // Builds a set holding exactly the characters listed in `chars`.
  constexpr CharSet(std::initializer_list<char> chars) : Elements() {
    for (const char* it = chars.begin(); it != chars.end(); ++it) {
      Elements[static_cast<unsigned char>(*it)] = true;
    }
  }

  // Returns true if `c` is a member of the set.
  constexpr auto operator[](char c) const -> bool {
    const auto index = static_cast<unsigned char>(c);
    return Elements[index];
  }
};
}  // namespace
- auto StringLiteral::Lex(llvm::StringRef source_text)
- -> std::optional<StringLiteral> {
- int64_t cursor = 0;
- const int64_t source_text_size = source_text.size();
- // Determine the number of hashes prefixing.
- while (cursor < source_text_size && source_text[cursor] == '#') {
- ++cursor;
- }
- const int hash_level = cursor;
- const std::optional<Introducer> introducer =
- Introducer::Lex(source_text.substr(hash_level));
- if (!introducer) {
- return std::nullopt;
- }
- cursor += introducer->prefix_size;
- const int prefix_len = cursor;
- llvm::SmallString<16> terminator(introducer->terminator);
- llvm::SmallString<16> escape("\\");
- // The terminator and escape sequence marker require a number of '#'s
- // matching the leading sequence of '#'s.
- terminator.resize(terminator.size() + hash_level, '#');
- escape.resize(escape.size() + hash_level, '#');
- bool content_needs_validation = false;
- // TODO: Detect indent / dedent for multi-line string literals in order to
- // stop parsing on dedent before a terminator is found.
- for (; cursor < source_text_size; ++cursor) {
- // Use a lookup table to allow us to quickly skip uninteresting characters.
- static constexpr CharSet InterestingChars = {'\\', '\n', '"', '\'', '\t'};
- if (!InterestingChars[source_text[cursor]]) {
- continue;
- }
- // This switch and loop structure relies on multi-character terminators and
- // escape sequences starting with a predictable character and not containing
- // embedded and unescaped terminators or newlines.
- switch (source_text[cursor]) {
- case '\t':
- // Tabs have extra validation.
- content_needs_validation = true;
- break;
- case '\\':
- if (escape.size() == 1 ||
- source_text.substr(cursor + 1).startswith(escape.substr(1))) {
- content_needs_validation = true;
- cursor += escape.size();
- // If there's either not a character following the escape, or it's a
- // single-line string and the escaped character is a newline, we
- // should stop here.
- if (cursor >= source_text_size || (introducer->kind == NotMultiLine &&
- source_text[cursor] == '\n')) {
- llvm::StringRef text = source_text.take_front(cursor);
- return StringLiteral(text, text.drop_front(prefix_len),
- content_needs_validation, hash_level,
- introducer->kind,
- /*is_terminated=*/false);
- }
- }
- break;
- case '\n':
- if (introducer->kind == NotMultiLine) {
- llvm::StringRef text = source_text.take_front(cursor);
- return StringLiteral(text, text.drop_front(prefix_len),
- content_needs_validation, hash_level,
- introducer->kind,
- /*is_terminated=*/false);
- }
- break;
- case '"':
- case '\'':
- if (source_text.substr(cursor).startswith(terminator)) {
- llvm::StringRef text =
- source_text.substr(0, cursor + terminator.size());
- llvm::StringRef content =
- source_text.substr(prefix_len, cursor - prefix_len);
- return StringLiteral(text, content, content_needs_validation,
- hash_level, introducer->kind,
- /*is_terminated=*/true);
- }
- break;
- default:
- // No action for non-terminators.
- break;
- }
- }
- // No terminator was found.
- return StringLiteral(source_text, source_text.drop_front(prefix_len),
- content_needs_validation, hash_level, introducer->kind,
- /*is_terminated=*/false);
- }
- // Given a string that contains at least one newline, find the indent (the
- // leading sequence of horizontal whitespace) of its final line.
- static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
- int indent_end = text.size();
- for (int i = indent_end - 1; i >= 0; --i) {
- if (text[i] == '\n') {
- int indent_start = i + 1;
- return text.substr(indent_start, indent_end - indent_start);
- }
- if (!IsSpace(text[i])) {
- indent_end = i;
- }
- }
- llvm_unreachable("Given text is required to contain a newline.");
- }
- // Check the literal is indented properly, if it's a multi-line litera.
- // Find the leading whitespace that should be removed from each line of a
- // multi-line string literal.
- static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
- llvm::StringRef content) -> llvm::StringRef {
- // Find the leading horizontal whitespace on the final line of this literal.
- // Note that for an empty literal, this might not be inside the content.
- llvm::StringRef indent = ComputeIndentOfFinalLine(text);
- // The last line is not permitted to contain any content after its
- // indentation.
- if (indent.end() != content.end()) {
- CARBON_DIAGNOSTIC(
- ContentBeforeStringTerminator, Error,
- "Only whitespace is permitted before the closing `'''` of a "
- "multi-line string.");
- emitter.Emit(indent.end(), ContentBeforeStringTerminator);
- }
- return indent;
- }
- // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
- static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
- llvm::StringRef digits,
- char*& buffer_cursor) -> bool {
- unsigned code_point;
- if (!CanLexInteger(emitter, digits)) {
- return false;
- }
- if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
- CARBON_DIAGNOSTIC(UnicodeEscapeTooLarge, Error,
- "Code point specified by `\\u{{...}}` escape is greater "
- "than 0x10FFFF.");
- emitter.Emit(digits.begin(), UnicodeEscapeTooLarge);
- return false;
- }
- if (code_point >= 0xD800 && code_point < 0xE000) {
- CARBON_DIAGNOSTIC(UnicodeEscapeSurrogate, Error,
- "Code point specified by `\\u{{...}}` escape is a "
- "surrogate character.");
- emitter.Emit(digits.begin(), UnicodeEscapeSurrogate);
- return false;
- }
- // Convert the code point to a sequence of UTF-8 code units.
- // Every code point fits in 6 UTF-8 code units.
- const llvm::UTF32 utf32_code_units[1] = {code_point};
- const llvm::UTF32* src_pos = utf32_code_units;
- auto*& buffer_cursor_as_utf8 = reinterpret_cast<llvm::UTF8*&>(buffer_cursor);
- llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
- &src_pos, src_pos + 1, &buffer_cursor_as_utf8, buffer_cursor_as_utf8 + 6,
- llvm::strictConversion);
- if (conv_result != llvm::conversionOK) {
- llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
- }
- return true;
- }
// Writes `append_char` at `buffer_cursor`, then moves the cursor one position
// forward.
static auto AppendChar(char*& buffer_cursor, char append_char) -> void {
  *buffer_cursor++ = append_char;
}
- // Appends the front of contents to the buffer and advances the cursor.
- static auto AppendFrontOfContents(char*& buffer_cursor,
- llvm::StringRef contents, size_t len_or_npos)
- -> void {
- auto len =
- len_or_npos == llvm::StringRef::npos ? contents.size() : len_or_npos;
- memcpy(buffer_cursor, contents.data(), len);
- buffer_cursor += len;
- }
- // Expand an escape sequence, appending the expanded value to the given
- // `result` string. `content` is the string content, starting from the first
- // character after the escape sequence introducer (for example, the `n` in
- // `\n`), and will be updated to remove the leading escape sequence.
- static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
- llvm::StringRef& content,
- char*& buffer_cursor) -> void {
- CARBON_CHECK(!content.empty()) << "should have escaped closing delimiter";
- char first = content.front();
- content = content.drop_front(1);
- switch (first) {
- case 't':
- AppendChar(buffer_cursor, '\t');
- return;
- case 'n':
- AppendChar(buffer_cursor, '\n');
- return;
- case 'r':
- AppendChar(buffer_cursor, '\r');
- return;
- case '"':
- AppendChar(buffer_cursor, '"');
- return;
- case '\'':
- AppendChar(buffer_cursor, '\'');
- return;
- case '\\':
- AppendChar(buffer_cursor, '\\');
- return;
- case '0':
- AppendChar(buffer_cursor, '\0');
- if (!content.empty() && IsDecimalDigit(content.front())) {
- CARBON_DIAGNOSTIC(
- DecimalEscapeSequence, Error,
- "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead "
- "of `\\0` if the next character is a digit.");
- emitter.Emit(content.begin(), DecimalEscapeSequence);
- return;
- }
- return;
- case 'x':
- if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
- IsUpperHexDigit(content[1])) {
- AppendChar(buffer_cursor, static_cast<char>(llvm::hexFromNibbles(
- content[0], content[1])));
- content = content.drop_front(2);
- return;
- }
- CARBON_DIAGNOSTIC(HexadecimalEscapeMissingDigits, Error,
- "Escape sequence `\\x` must be followed by two "
- "uppercase hexadecimal digits, for example `\\x0F`.");
- emitter.Emit(content.begin(), HexadecimalEscapeMissingDigits);
- break;
- case 'u': {
- llvm::StringRef remaining = content;
- if (remaining.consume_front("{")) {
- llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
- remaining = remaining.drop_front(digits.size());
- if (!digits.empty() && remaining.consume_front("}")) {
- if (!ExpandUnicodeEscapeSequence(emitter, digits, buffer_cursor)) {
- break;
- }
- content = remaining;
- return;
- }
- }
- CARBON_DIAGNOSTIC(
- UnicodeEscapeMissingBracedDigits, Error,
- "Escape sequence `\\u` must be followed by a braced sequence of "
- "uppercase hexadecimal digits, for example `\\u{{70AD}}`.");
- emitter.Emit(content.begin(), UnicodeEscapeMissingBracedDigits);
- break;
- }
- default:
- CARBON_DIAGNOSTIC(UnknownEscapeSequence, Error,
- "Unrecognized escape sequence `{0}`.", char);
- emitter.Emit(content.begin() - 1, UnknownEscapeSequence, first);
- break;
- }
- // If we get here, we didn't recognize this escape sequence and have already
- // issued a diagnostic. For error recovery purposes, expand this escape
- // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
- AppendChar(buffer_cursor, first);
- }
- // Expand any escape sequences in the given string literal.
- static auto ExpandEscapeSequencesAndRemoveIndent(
- LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
- llvm::StringRef indent, char* buffer) -> llvm::StringRef {
- char* buffer_cursor = buffer;
- llvm::SmallString<16> escape("\\");
- escape.resize(1 + hash_level, '#');
- // Process each line of the string literal.
- while (true) {
- // Every non-empty line (that contains anything other than horizontal
- // whitespace) is required to start with the string's indent. For error
- // recovery, remove all leading whitespace if the indent doesn't match.
- if (!contents.consume_front(indent)) {
- const char* line_start = contents.begin();
- contents = contents.drop_while(IsHorizontalWhitespace);
- if (!contents.startswith("\n")) {
- CARBON_DIAGNOSTIC(
- MismatchedIndentInString, Error,
- "Indentation does not match that of the closing `'''` in "
- "multi-line string literal.");
- emitter.Emit(line_start, MismatchedIndentInString);
- }
- }
- // Tracks the position at the last time we expanded an escape to ensure we
- // don't misinterpret it as unescaped when backtracking.
- char* buffer_last_escape = buffer_cursor;
- // Process the contents of the line.
- while (true) {
- // Append the next segment of plain text.
- auto end_of_regular_text = contents.find_if([](char c) {
- return c == '\n' || c == '\\' ||
- (IsHorizontalWhitespace(c) && c != ' ');
- });
- AppendFrontOfContents(buffer_cursor, contents, end_of_regular_text);
- if (end_of_regular_text == llvm::StringRef::npos) {
- return llvm::StringRef(buffer, buffer_cursor - buffer);
- }
- contents = contents.drop_front(end_of_regular_text);
- if (contents.consume_front("\n")) {
- // Trailing whitespace in the source before a newline doesn't contribute
- // to the string literal value. However, escaped whitespace (like `\t`)
- // and any whitespace just before that does contribute.
- while (buffer_cursor > buffer_last_escape) {
- char back = *(buffer_cursor - 1);
- if (back == '\n' || !IsSpace(back)) {
- break;
- }
- --buffer_cursor;
- }
- AppendChar(buffer_cursor, '\n');
- // Move onto to the next line.
- break;
- }
- if (IsHorizontalWhitespace(contents.front())) {
- // Horizontal whitespace other than ` ` is valid only at the end of a
- // line.
- CARBON_CHECK(contents.front() != ' ')
- << "should not have stopped at a plain space";
- auto after_space = contents.find_if_not(IsHorizontalWhitespace);
- if (after_space == llvm::StringRef::npos ||
- contents[after_space] != '\n') {
- // TODO: Include the source range of the whitespace up to
- // `contents.begin() + after_space` in the diagnostic.
- CARBON_DIAGNOSTIC(
- InvalidHorizontalWhitespaceInString, Error,
- "Whitespace other than plain space must be expressed with an "
- "escape sequence in a string literal.");
- emitter.Emit(contents.begin(), InvalidHorizontalWhitespaceInString);
- // Include the whitespace in the string contents for error recovery.
- AppendFrontOfContents(buffer_cursor, contents, after_space);
- }
- contents = contents.substr(after_space);
- continue;
- }
- if (!contents.consume_front(escape)) {
- // This is not an escape sequence, just a raw `\`.
- AppendChar(buffer_cursor, contents.front());
- contents = contents.drop_front(1);
- continue;
- }
- if (contents.consume_front("\n")) {
- // An escaped newline ends the line without producing any content and
- // without trimming trailing whitespace.
- break;
- }
- // Handle this escape sequence.
- ExpandAndConsumeEscapeSequence(emitter, contents, buffer_cursor);
- buffer_last_escape = buffer_cursor;
- }
- }
- }
- auto StringLiteral::ComputeValue(llvm::BumpPtrAllocator& allocator,
- LexerDiagnosticEmitter& emitter) const
- -> llvm::StringRef {
- if (!is_terminated_) {
- return "";
- }
- if (multi_line_ == MultiLineWithDoubleQuotes) {
- CARBON_DIAGNOSTIC(
- MultiLineStringWithDoubleQuotes, Error,
- "Use `'''` delimiters for a multi-line string literal, not `\"\"\"`.");
- emitter.Emit(text_.begin(), MultiLineStringWithDoubleQuotes);
- }
- llvm::StringRef indent =
- multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
- if (!content_needs_validation_ && (!multi_line_ || indent.empty())) {
- return content_;
- }
- // "Expanding" escape sequences should only ever shorten content. As a
- // consequence, the output string should allows fit within this allocation.
- // Although this may waste some space, it avoids a reallocation.
- auto result = ExpandEscapeSequencesAndRemoveIndent(
- emitter, content_, hash_level_, indent,
- allocator.Allocate<char>(content_.size()));
- CARBON_CHECK(result.size() <= content_.size())
- << "Content grew from " << content_.size() << " to " << result.size()
- << ": `" << content_ << "`";
- return result;
- }
- } // namespace Carbon::Lex
|