Преглед на файлове

Enforce a couple of char literal restrictions from #1964: (#5960)

* `\x` escapes are not permitted in character literals
* ASCII control characters (U+0000 .. U+001F) are not permitted in
character literals unless specified with escape sequences.
Richard Smith преди 8 месеца
родител
ревизия
7727c62880

+ 4 - 0
.gitattributes

@@ -5,3 +5,7 @@
 # This tells Github to detect files having the extension `.def` as `C++` files, which
 # ensures that these files get syntax highlighted properly.
 *.def linguist-language=C++
+
+# This tells Git to treat lexer tests as text when producing diffs, even if
+# they contain non-printable characters.
+toolchain/lex/testdata/*.carbon diff

+ 0 - 18
toolchain/check/testdata/builtins/char/convert_checked.carbon

@@ -68,8 +68,6 @@ import library "builtin";
 let a: UInt(8) = ToChar('\0');
 let b: UInt(8) = ToChar('b');
 let c: UInt(8) = ToChar('\u{7F}');
-// TODO: According to #1964, \x escapes should not be permitted in character literals.
-let d: UInt(8) = ToChar('\x7F');
 //@dump-sem-ir-end
 
 // --- fail_size_small.carbon
@@ -157,19 +155,6 @@ let c: UInt(8) = ToChar('\u{1E15}');
 // CHECK:STDOUT:   %.loc8_33.1: %u8.builtin = value_of_initializer @__global_init.%ToChar.call.loc8 [concrete = constants.%int_127]
 // CHECK:STDOUT:   %.loc8_33.2: %u8.builtin = converted @__global_init.%ToChar.call.loc8, %.loc8_33.1 [concrete = constants.%int_127]
 // CHECK:STDOUT:   %c: %u8.builtin = bind_name c, %.loc8_33.2
-// CHECK:STDOUT:   name_binding_decl {
-// CHECK:STDOUT:     %d.patt: %pattern_type.456 = binding_pattern d [concrete]
-// CHECK:STDOUT:   }
-// CHECK:STDOUT:   %.loc10_14.1: type = splice_block %.loc10_14.3 [concrete = constants.%u8.builtin] {
-// CHECK:STDOUT:     %UInt.ref.loc10: %UInt.type = name_ref UInt, imports.%Main.UInt [concrete = constants.%UInt]
-// CHECK:STDOUT:     %int_8.loc10: Core.IntLiteral = int_value 8 [concrete = constants.%int_8]
-// CHECK:STDOUT:     %UInt.call.loc10: init type = call %UInt.ref.loc10(%int_8.loc10) [concrete = constants.%u8.builtin]
-// CHECK:STDOUT:     %.loc10_14.2: type = value_of_initializer %UInt.call.loc10 [concrete = constants.%u8.builtin]
-// CHECK:STDOUT:     %.loc10_14.3: type = converted %UInt.call.loc10, %.loc10_14.2 [concrete = constants.%u8.builtin]
-// CHECK:STDOUT:   }
-// CHECK:STDOUT:   %.loc10_31.1: %u8.builtin = value_of_initializer @__global_init.%ToChar.call.loc10 [concrete = constants.%int_127]
-// CHECK:STDOUT:   %.loc10_31.2: %u8.builtin = converted @__global_init.%ToChar.call.loc10, %.loc10_31.1 [concrete = constants.%int_127]
-// CHECK:STDOUT:   %d: %u8.builtin = bind_name d, %.loc10_31.2
 // CHECK:STDOUT: }
 // CHECK:STDOUT:
 // CHECK:STDOUT: fn @__global_init() {
@@ -183,9 +168,6 @@ let c: UInt(8) = ToChar('\u{1E15}');
 // CHECK:STDOUT:   %ToChar.ref.loc8: %ToChar.type = name_ref ToChar, imports.%Main.ToChar [concrete = constants.%ToChar]
 // CHECK:STDOUT:   %.loc8: Core.CharLiteral = char_value U+007F [concrete = constants.%.e28]
 // CHECK:STDOUT:   %ToChar.call.loc8: init %u8.builtin = call %ToChar.ref.loc8(%.loc8) [concrete = constants.%int_127]
-// CHECK:STDOUT:   %ToChar.ref.loc10: %ToChar.type = name_ref ToChar, imports.%Main.ToChar [concrete = constants.%ToChar]
-// CHECK:STDOUT:   %.loc10: Core.CharLiteral = char_value U+007F [concrete = constants.%.e28]
-// CHECK:STDOUT:   %ToChar.call.loc10: init %u8.builtin = call %ToChar.ref.loc10(%.loc10) [concrete = constants.%int_127]
 // CHECK:STDOUT:   <elided>
 // CHECK:STDOUT: }
 // CHECK:STDOUT:

+ 2 - 0
toolchain/diagnostics/diagnostic_kind.def

@@ -78,6 +78,8 @@ CARBON_DIAGNOSTIC_KIND(CharLiteralEmpty)
 CARBON_DIAGNOSTIC_KIND(CharLiteralInvalidUTF8)
 CARBON_DIAGNOSTIC_KIND(CharLiteralOverflow)
 CARBON_DIAGNOSTIC_KIND(CharLiteralRaw)
+CARBON_DIAGNOSTIC_KIND(CharLiteralHexEscape)
+CARBON_DIAGNOSTIC_KIND(CharLiteralControlCharacter)
 CARBON_DIAGNOSTIC_KIND(CharLiteralUnderflow)
 
 // ============================================================================

+ 34 - 1
toolchain/lex/string_literal.cpp

@@ -434,6 +434,8 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
         break;
       }
 
+      // TODO: Also reject vertical whitespace other than \n, but ignore a \r
+      // before a \n.
       if (IsHorizontalWhitespace(contents.front())) {
         // Horizontal whitespace other than ` ` is valid only at the end of a
         // line.
@@ -476,6 +478,11 @@ static auto ExpandEscapeSequencesAndRemoveIndent(
   }
 }
 
+// Returns whether `c` is a control character: a C0 control (U+0000 to
+// U+001F), DEL (U+007F), or a C1 control (U+0080 to U+009F).
+static auto IsControlCharacter(llvm::UTF32 c) -> bool {
+  // `llvm::UTF32` is an unsigned type, so no lower-bound check against 0 is
+  // needed for the first range.
+  return c <= 0x1F || (c >= 0x7F && c <= 0x9F);
+}
+
 auto StringLiteral::ComputeCharLiteralValue(
     Diagnostics::Emitter<const char*>& emitter) const
     -> std::optional<CharLiteralValue> {
@@ -516,7 +523,33 @@ auto StringLiteral::ComputeCharLiteralValue(
         emitter.Emit(text_.begin(), CharLiteralEmpty);
         return std::nullopt;
       }
-      return CharLiteralValue{.value = static_cast<int32_t>(target[0])};
+
+      auto result = target[0];
+
+      // Check for a control character that's not written as an escape sequence.
+      // Also don't diagnose horizontal whitespace, because that was already
+      // done by ExpandEscapeSequencesAndRemoveIndent.
+      if (IsControlCharacter(result) && content_.front() != '\\' &&
+          !IsHorizontalWhitespace(content_.front())) {
+        // TODO: Suggest \0 instead of \u{00} for a NUL character.
+        CARBON_DIAGNOSTIC(CharLiteralControlCharacter, Error,
+                          "control character in character literal; specify as "
+                          "escape sequence `\\u{{{0:X-2}}`",
+                          llvm::UTF32);
+        emitter.Emit(text_.begin(), CharLiteralControlCharacter, result);
+        return std::nullopt;
+      }
+
+      if (content_.starts_with("\\x")) {
+        CARBON_DIAGNOSTIC(CharLiteralHexEscape, Error,
+                          "escape sequence `\\x` in character literal; specify "
+                          "as escape sequence `\\u{{{0:X-2}}`",
+                          llvm::UTF32);
+        emitter.Emit(text_.begin(), CharLiteralHexEscape, result);
+        return std::nullopt;
+      }
+
+      return CharLiteralValue{.value = static_cast<int32_t>(result)};
     }
     case llvm::sourceExhausted: {
       CARBON_DIAGNOSTIC(CharLiteralUnderflow, Error, "incomplete UTF-8");

+ 70 - 21
toolchain/lex/testdata/char_literals.carbon

@@ -13,51 +13,100 @@
 // CHECK:STDOUT:   tokens:
 
 'a'
-// CHECK:STDOUT:   - { index: 1, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'a'", has_leading_space: true }
+// CHECK:STDOUT:   - { index:  1, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'a'", has_leading_space: true }
 '\n'
-// CHECK:STDOUT:   - { index: 2, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\n'", has_leading_space: true }
-'\x7F'
-// CHECK:STDOUT:   - { index: 3, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\x7F'", has_leading_space: true }
+// CHECK:STDOUT:   - { index:  2, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\n'", has_leading_space: true }
+'\0'
+// CHECK:STDOUT:   - { index:  3, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\0'", has_leading_space: true }
+'\u{00}'
+// CHECK:STDOUT:   - { index:  4, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\u{00}'", has_leading_space: true }
+'\u{1F}'
+// CHECK:STDOUT:   - { index:  5, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\u{1F}'", has_leading_space: true }
+'\u{20}'
+// CHECK:STDOUT:   - { index:  6, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\u{20}'", has_leading_space: true }
+'\u{7F}'
+// CHECK:STDOUT:   - { index:  7, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\u{7F}'", has_leading_space: true }
 '\u{123}'
-// CHECK:STDOUT:   - { index: 4, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\u{123}'", has_leading_space: true }
-'\xC3\xA9'
-// CHECK:STDOUT:   - { index: 5, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\xC3\\xA9'", has_leading_space: true }
+// CHECK:STDOUT:   - { index:  8, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "'\\u{123}'", has_leading_space: true }
 
 // --- fail_invalid.carbon
 // CHECK:STDOUT: - filename: fail_invalid.carbon
 // CHECK:STDOUT:   tokens:
 
-// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: empty character literal [CharLiteralEmpty]
-// CHECK:STDERR: ''
-// CHECK:STDERR: ^
-// CHECK:STDERR:
-''
-// CHECK:STDOUT:   - { index: 1, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "''", has_leading_space: true }
-
-// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: too many characters [CharLiteralOverflow]
-// CHECK:STDERR: 'abcde'
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: escape sequence `\x` in character literal; specify as escape sequence `\u{70}` [CharLiteralHexEscape]
+// CHECK:STDERR: '\x70'
 // CHECK:STDERR: ^
 // CHECK:STDERR:
-'abcde'
-// CHECK:STDOUT:   - { index: 2, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'abcde'", has_leading_space: true }
+'\x70'
+// CHECK:STDOUT:   - { index:  1, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\x70'", has_leading_space: true }
 
 // CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: incomplete UTF-8 [CharLiteralUnderflow]
 // CHECK:STDERR: '\xC3'
 // CHECK:STDERR: ^
 // CHECK:STDERR:
 '\xC3'
-// CHECK:STDOUT:   - { index: 3, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\xC3'", has_leading_space: true }
+// CHECK:STDOUT:   - { index:  2, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\xC3'", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: escape sequence `\x` in character literal; specify as escape sequence `\u{E9}` [CharLiteralHexEscape]
+// CHECK:STDERR: '\xC3\xA9'
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'\xC3\xA9'
+// CHECK:STDOUT:   - { index:  3, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\xC3\\xA9'", has_leading_space: true }
 
 // CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: invalid UTF-8 character [CharLiteralInvalidUTF8]
 // CHECK:STDERR: '\xC3\xFF'
 // CHECK:STDERR: ^
 // CHECK:STDERR:
 '\xC3\xFF'
-// CHECK:STDOUT:   - { index: 4, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\xC3\\xFF'", has_leading_space: true }
+// CHECK:STDOUT:   - { index:  4, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\xC3\\xFF'", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: empty character literal [CharLiteralEmpty]
+// CHECK:STDERR: ''
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+''
+// CHECK:STDOUT:   - { index:  5, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "''", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: too many characters [CharLiteralOverflow]
+// CHECK:STDERR: 'abcde'
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'abcde'
+// CHECK:STDOUT:   - { index:  6, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'abcde'", has_leading_space: true }
 
 // CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: unexpected `#` before character literal [CharLiteralRaw]
 // CHECK:STDERR: #'a'#
 // CHECK:STDERR: ^
 // CHECK:STDERR:
 #'a'#
-// CHECK:STDOUT:   - { index: 5, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "#'a'#", has_leading_space: true }
+// CHECK:STDOUT:   - { index:  7, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "#'a'#", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: character literal is missing a terminator [UnterminatedString]
+// CHECK:STDERR: '
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'
+// CHECK:STDOUT:   - { index:  8, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: character literal is missing a terminator [UnterminatedString]
+// CHECK:STDERR: '\'
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'\'
+// CHECK:STDOUT:   - { index:  9, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\'", has_leading_space: true }
+
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:1: error: character literal is missing a terminator [UnterminatedString]
+// CHECK:STDERR: '\
+// CHECK:STDERR: ^
+// CHECK:STDERR:
+'\
+// CHECK:STDOUT:   - { index: 10, kind:       "Error", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\\", has_leading_space: true }
+
+// This literal contains a raw tab character.
+// CHECK:STDERR: fail_invalid.carbon:[[@LINE+4]]:2: error: whitespace other than plain space must be expressed with an escape sequence in a string literal [InvalidHorizontalWhitespaceInString]
+// CHECK:STDERR: '{{\t}}'
+// CHECK:STDERR:  ^
+// CHECK:STDERR:
+'	'
+// CHECK:STDOUT:   - { index: 11, kind: "CharLiteral", line: {{ *}}[[@LINE-1]], column:   1, indent: 1, spelling: "'\t'", has_leading_space: true }

BIN
toolchain/lex/testdata/fail_char_literals_bad_encoding.carbon