| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416 |
- // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
- // Exceptions. See /LICENSE for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- #include "toolchain/lexer/string_literal.h"
- #include "common/check.h"
- #include "llvm/ADT/SmallString.h"
- #include "llvm/ADT/StringExtras.h"
- #include "llvm/Support/ConvertUTF.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "llvm/Support/FormatVariadic.h"
- #include "toolchain/lexer/character_set.h"
- namespace Carbon {
- using LexerDiagnosticEmitter = DiagnosticEmitter<const char*>;  // Diagnostics in this file are located by a `const char*` pointing into the text being lexed.
- struct ContentBeforeStringTerminator
- : DiagnosticBase<ContentBeforeStringTerminator> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Only whitespace is permitted before the closing `\"\"\"` of a "
- "multi-line string.";
- };
- struct UnicodeEscapeTooLarge : DiagnosticBase<UnicodeEscapeTooLarge> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Code point specified by `\\u{...}` escape is greater than 0x10FFFF.";
- };
- struct UnicodeEscapeSurrogate : DiagnosticBase<UnicodeEscapeSurrogate> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Code point specified by `\\u{...}` escape is a surrogate character.";
- };
- struct UnicodeEscapeMissingBracedDigits
- : DiagnosticBase<UnicodeEscapeMissingBracedDigits> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Escape sequence `\\u` must be followed by a braced sequence of "
- "uppercase hexadecimal digits, for example `\\u{70AD}`.";
- };
- struct HexadecimalEscapeMissingDigits
- : DiagnosticBase<HexadecimalEscapeMissingDigits> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Escape sequence `\\x` must be followed by two "
- "uppercase hexadecimal digits, for example `\\x0F`.";
- };
- struct DecimalEscapeSequence : DiagnosticBase<DecimalEscapeSequence> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Decimal digit follows `\\0` escape sequence. Use `\\x00` instead of "
- "`\\0` if the next character is a digit.";
- };
- struct UnknownEscapeSequence : DiagnosticBase<UnknownEscapeSequence> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr const char* Message = "Unrecognized escape sequence `{0}`.";
- auto Format() -> std::string { return llvm::formatv(Message, first).str(); }
- char first;
- };
- struct MismatchedIndentInString : DiagnosticBase<MismatchedIndentInString> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Indentation does not match that of the closing \"\"\" in multi-line "
- "string literal.";
- };
- struct InvalidHorizontalWhitespaceInString
- : DiagnosticBase<InvalidHorizontalWhitespaceInString> {
- static constexpr llvm::StringLiteral ShortName = "syntax-invalid-string";
- static constexpr llvm::StringLiteral Message =
- "Whitespace other than plain space must be expressed with an escape "
- "sequence in a string literal.";
- };
// The three double quotes that open and close a multi-line string literal.
static constexpr char MultiLineIndicator[] = "\"\"\"";
- // Return the number of opening characters of a multi-line string literal,
- // after any '#'s, including the file type indicator and following newline.
- static auto GetMultiLineStringLiteralPrefixSize(llvm::StringRef source_text)
- -> int {
- if (!source_text.startswith(MultiLineIndicator)) {
- return 0;
- }
- // The rest of the line must be a valid file type indicator: a sequence of
- // characters containing neither '#' nor '"' followed by a newline.
- auto prefix_end =
- source_text.find_first_of("#\n\"", strlen(MultiLineIndicator));
- if (prefix_end == llvm::StringRef::npos || source_text[prefix_end] != '\n') {
- return 0;
- }
- // Include the newline on return.
- return prefix_end + 1;
- }
- auto LexedStringLiteral::Lex(llvm::StringRef source_text)
- -> llvm::Optional<LexedStringLiteral> {
- int64_t cursor = 0;
- const int64_t source_text_size = source_text.size();
- // Determine the number of hashes prefixing.
- while (cursor < source_text_size && source_text[cursor] == '#') {
- ++cursor;
- }
- const int hash_level = cursor;
- llvm::SmallString<16> terminator("\"");
- llvm::SmallString<16> escape("\\");
- const int multi_line_prefix_size =
- GetMultiLineStringLiteralPrefixSize(source_text.substr(hash_level));
- const bool multi_line = multi_line_prefix_size > 0;
- if (multi_line) {
- cursor += multi_line_prefix_size;
- terminator = MultiLineIndicator;
- } else if (cursor < source_text_size && source_text[cursor] == '"') {
- ++cursor;
- } else {
- return llvm::None;
- }
- const int prefix_len = cursor;
- // The terminator and escape sequence marker require a number of '#'s
- // matching the leading sequence of '#'s.
- terminator.resize(terminator.size() + hash_level, '#');
- escape.resize(escape.size() + hash_level, '#');
- for (; cursor < source_text_size; ++cursor) {
- // This switch and loop structure relies on multi-character terminators and
- // escape sequences starting with a predictable character and not containing
- // embedded and unescaped terminators or newlines.
- switch (source_text[cursor]) {
- case '\\':
- if (escape.size() == 1 ||
- source_text.substr(cursor).startswith(escape)) {
- cursor += escape.size();
- // If there's either not a character following the escape, or it's a
- // single-line string and the escaped character is a newline, we
- // should stop here.
- if (cursor >= source_text_size ||
- (!multi_line && source_text[cursor] == '\n')) {
- return llvm::None;
- }
- }
- break;
- case '\n':
- if (!multi_line) {
- return llvm::None;
- }
- break;
- case '\"': {
- if (terminator.size() == 1 ||
- source_text.substr(cursor).startswith(terminator)) {
- llvm::StringRef text =
- source_text.substr(0, cursor + terminator.size());
- llvm::StringRef content =
- source_text.substr(prefix_len, cursor - prefix_len);
- return LexedStringLiteral(text, content, hash_level, multi_line);
- }
- break;
- }
- }
- }
- // Let LexError figure out how to recover from an unterminated string
- // literal.
- return llvm::None;
- }
- // Given a string that contains at least one newline, find the indent (the
- // leading sequence of horizontal whitespace) of its final line.
- static auto ComputeIndentOfFinalLine(llvm::StringRef text) -> llvm::StringRef {
- int indent_end = text.size();
- for (int i = indent_end - 1; i >= 0; --i) {
- if (text[i] == '\n') {
- int indent_start = i + 1;
- return text.substr(indent_start, indent_end - indent_start);
- }
- if (!IsSpace(text[i])) {
- indent_end = i;
- }
- }
- llvm_unreachable("Given text is required to contain a newline.");
- }
- // Check the literal is indented properly, if it's a multi-line litera.
- // Find the leading whitespace that should be removed from each line of a
- // multi-line string literal.
- static auto CheckIndent(LexerDiagnosticEmitter& emitter, llvm::StringRef text,
- llvm::StringRef content) -> llvm::StringRef {
- // Find the leading horizontal whitespace on the final line of this literal.
- // Note that for an empty literal, this might not be inside the content.
- llvm::StringRef indent = ComputeIndentOfFinalLine(text);
- // The last line is not permitted to contain any content after its
- // indentation.
- if (indent.end() != content.end()) {
- emitter.EmitError<ContentBeforeStringTerminator>(indent.end());
- }
- return indent;
- }
- // Expand a `\u{HHHHHH}` escape sequence into a sequence of UTF-8 code units.
- static auto ExpandUnicodeEscapeSequence(LexerDiagnosticEmitter& emitter,
- llvm::StringRef digits,
- std::string& result) -> bool {
- unsigned code_point;
- if (digits.getAsInteger(16, code_point) || code_point > 0x10FFFF) {
- emitter.EmitError<UnicodeEscapeTooLarge>(digits.begin());
- return false;
- }
- if (code_point >= 0xD800 && code_point < 0xE000) {
- emitter.EmitError<UnicodeEscapeSurrogate>(digits.begin());
- return false;
- }
- // Convert the code point to a sequence of UTF-8 code units.
- // Every code point fits in 6 UTF-8 code units.
- const llvm::UTF32 utf32_code_units[1] = {code_point};
- llvm::UTF8 utf8_code_units[6];
- const llvm::UTF32* src_pos = utf32_code_units;
- llvm::UTF8* dest_pos = utf8_code_units;
- llvm::ConversionResult conv_result = llvm::ConvertUTF32toUTF8(
- &src_pos, src_pos + 1, &dest_pos, dest_pos + 6, llvm::strictConversion);
- if (conv_result != llvm::conversionOK) {
- llvm_unreachable("conversion of valid code point to UTF-8 cannot fail");
- }
- result.insert(result.end(), reinterpret_cast<char*>(utf8_code_units),
- reinterpret_cast<char*>(dest_pos));
- return true;
- }
- // Expand an escape sequence, appending the expanded value to the given
- // `result` string. `content` is the string content, starting from the first
- // character after the escape sequence introducer (for example, the `n` in
- // `\n`), and will be updated to remove the leading escape sequence.
- static auto ExpandAndConsumeEscapeSequence(LexerDiagnosticEmitter& emitter,
- llvm::StringRef& content,
- std::string& result) -> void {
- CHECK(!content.empty()) << "should have escaped closing delimiter";
- char first = content.front();
- content = content.drop_front(1);
- switch (first) {
- case 't':
- result += '\t';
- return;
- case 'n':
- result += '\n';
- return;
- case 'r':
- result += '\r';
- return;
- case '"':
- result += '"';
- return;
- case '\'':
- result += '\'';
- return;
- case '\\':
- result += '\\';
- return;
- case '0':
- result += '\0';
- if (!content.empty() && IsDecimalDigit(content.front())) {
- emitter.EmitError<DecimalEscapeSequence>(content.begin());
- return;
- }
- return;
- case 'x':
- if (content.size() >= 2 && IsUpperHexDigit(content[0]) &&
- IsUpperHexDigit(content[1])) {
- result +=
- static_cast<char>(llvm::hexFromNibbles(content[0], content[1]));
- content = content.drop_front(2);
- return;
- }
- emitter.EmitError<HexadecimalEscapeMissingDigits>(content.begin());
- break;
- case 'u': {
- llvm::StringRef remaining = content;
- if (remaining.consume_front("{")) {
- llvm::StringRef digits = remaining.take_while(IsUpperHexDigit);
- remaining = remaining.drop_front(digits.size());
- if (!digits.empty() && remaining.consume_front("}")) {
- if (!ExpandUnicodeEscapeSequence(emitter, digits, result)) {
- break;
- }
- content = remaining;
- return;
- }
- }
- emitter.EmitError<UnicodeEscapeMissingBracedDigits>(content.begin());
- break;
- }
- default:
- emitter.EmitError<UnknownEscapeSequence>(content.begin() - 1,
- {.first = first});
- break;
- }
- // If we get here, we didn't recognize this escape sequence and have already
- // issued a diagnostic. For error recovery purposes, expand this escape
- // sequence to itself, dropping the introducer (for example, `\q` -> `q`).
- result += first;
- }
- // Expand any escape sequences in the given string literal.
- static auto ExpandEscapeSequencesAndRemoveIndent(
- LexerDiagnosticEmitter& emitter, llvm::StringRef contents, int hash_level,
- llvm::StringRef indent) -> std::string {
- std::string result;
- result.reserve(contents.size());
- llvm::SmallString<16> escape("\\");
- escape.resize(1 + hash_level, '#');
- // Process each line of the string literal.
- while (true) {
- // Every non-empty line (that contains anything other than horizontal
- // whitespace) is required to start with the string's indent. For error
- // recovery, remove all leading whitespace if the indent doesn't match.
- if (!contents.consume_front(indent)) {
- const char* line_start = contents.begin();
- contents = contents.drop_while(IsHorizontalWhitespace);
- if (!contents.startswith("\n")) {
- emitter.EmitError<MismatchedIndentInString>(line_start);
- }
- }
- // Process the contents of the line.
- while (true) {
- auto end_of_regular_text = contents.find_if([](char c) {
- return c == '\n' || c == '\\' ||
- (IsHorizontalWhitespace(c) && c != ' ');
- });
- result += contents.substr(0, end_of_regular_text);
- contents = contents.substr(end_of_regular_text);
- if (contents.empty()) {
- return result;
- }
- if (contents.consume_front("\n")) {
- // Trailing whitespace before a newline doesn't contribute to the string
- // literal value.
- while (!result.empty() && result.back() != '\n' &&
- IsSpace(result.back())) {
- result.pop_back();
- }
- result += '\n';
- // Move onto to the next line.
- break;
- }
- if (IsHorizontalWhitespace(contents.front())) {
- // Horizontal whitespace other than ` ` is valid only at the end of a
- // line.
- CHECK(contents.front() != ' ')
- << "should not have stopped at a plain space";
- auto after_space = contents.find_if_not(IsHorizontalWhitespace);
- if (after_space == llvm::StringRef::npos ||
- contents[after_space] != '\n') {
- // TODO: Include the source range of the whitespace up to
- // `contents.begin() + after_space` in the diagnostic.
- emitter.EmitError<InvalidHorizontalWhitespaceInString>(
- contents.begin());
- // Include the whitespace in the string contents for error recovery.
- result += contents.substr(0, after_space);
- }
- contents = contents.substr(after_space);
- continue;
- }
- if (!contents.consume_front(escape)) {
- // This is not an escape sequence, just a raw `\`.
- result += contents.front();
- contents = contents.drop_front(1);
- continue;
- }
- if (contents.consume_front("\n")) {
- // An escaped newline ends the line without producing any content and
- // without trimming trailing whitespace.
- break;
- }
- // Handle this escape sequence.
- ExpandAndConsumeEscapeSequence(emitter, contents, result);
- }
- }
- }
- auto LexedStringLiteral::ComputeValue(LexerDiagnosticEmitter& emitter) const
- -> std::string {
- llvm::StringRef indent =
- multi_line_ ? CheckIndent(emitter, text_, content_) : llvm::StringRef();
- return ExpandEscapeSequencesAndRemoveIndent(emitter, content_, hash_level_,
- indent);
- }
- } // namespace Carbon
|