浏览代码

fix crash caused by unicode chars (#3387)

Prevent a crash during lexing for unicode chars.
Jacob Schneider 2 年之前
父节点
当前提交
482d233def
共有 2 个文件被更改,包括 24 次插入和 11 次删除
  1. +15 -10
      toolchain/lex/lex.cpp
  2. +9 -1
      toolchain/lex/testdata/fail_bad_raw_identifier.carbon

+ 15 - 10
toolchain/lex/lex.cpp

@@ -503,14 +503,17 @@ CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
 CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
 
 // A custom dispatch functions that pre-select the symbol token to lex.
-#define CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexMethod)                           \
-  static auto Dispatch##LexMethod##SymbolToken(                               \
-      Lexer& lexer, llvm::StringRef source_text, ssize_t position)            \
-      ->void {                                                                \
-    Lexer::LexResult result = lexer.LexMethod##SymbolToken(                   \
-        source_text, OneCharTokenKindTable[source_text[position]], position); \
-    CARBON_CHECK(result) << "Failed to form a token!";                        \
-    [[clang::musttail]] return DispatchNext(lexer, source_text, position);    \
+#define CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexMethod)                        \
+  static auto Dispatch##LexMethod##SymbolToken(                            \
+      Lexer& lexer, llvm::StringRef source_text, ssize_t position)         \
+      ->void {                                                             \
+    Lexer::LexResult result = lexer.LexMethod##SymbolToken(                \
+        source_text,                                                       \
+        OneCharTokenKindTable[static_cast<unsigned char>(                  \
+            source_text[position])],                                       \
+        position);                                                         \
+    CARBON_CHECK(result) << "Failed to form a token!";                     \
+    [[clang::musttail]] return DispatchNext(lexer, source_text, position); \
   }
 CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOneChar)
 CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOpening)
@@ -1151,7 +1154,8 @@ auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text,
     // TODO: Need to add support for Unicode lexing.
     return LexError(source_text, position);
   }
-  CARBON_CHECK(IsIdStartByteTable[source_text[position]]);
+  CARBON_CHECK(
+      IsIdStartByteTable[static_cast<unsigned char>(source_text[position])]);
 
   int column = ComputeColumn(position);
 
@@ -1192,7 +1196,8 @@ auto Lexer::LexKeywordOrIdentifierMaybeRaw(llvm::StringRef source_text,
   // TODO: Need to add support for Unicode lexing.
   if (LLVM_LIKELY(position + 2 >= static_cast<ssize_t>(source_text.size()) ||
                   source_text[position + 1] != '#' ||
-                  !IsIdStartByteTable[source_text[position + 2]])) {
+                  !IsIdStartByteTable[static_cast<unsigned char>(
+                      source_text[position + 2])])) {
     // TODO: Should this print a different error when there is `r#`, but it
     // isn't followed by identifier text? Or is it right to put it back so
     // that the `#` could be parsed as part of a raw string literal?

+ 9 - 1
toolchain/lex/testdata/fail_bad_raw_identifier.carbon

@@ -24,5 +24,13 @@ r#3
 // CHECK:STDOUT:     { index: 4, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#' },
 // CHECK:STDOUT:     { index: 5, kind: 'IntegerLiteral', line: {{ *}}[[@LINE-3]], column:  3, indent: 1, spelling: '3', value: `3`, has_trailing_space: true },
 
-// CHECK:STDOUT:     { index: 6, kind:      'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
+// Non ascii start to identifier.
+// CHECK:STDERR: fail_bad_raw_identifier.carbon:[[@LINE+3]]:2: ERROR: Encountered unrecognized characters while parsing.
+// CHECK:STDERR: r#á
+// CHECK:STDERR:  ^
+r#á
+// CHECK:STDOUT:     { index: 6, kind:     'Identifier', line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: 'r', identifier: 0 },
+// CHECK:STDOUT:     { index: 7, kind:          'Error', line: {{ *}}[[@LINE-2]], column:  2, indent: 1, spelling: '#á', has_trailing_space: true },
+
+// CHECK:STDOUT:     { index: 8, kind:      'EndOfFile', line: {{ *}}[[@LINE+1]], column: {{ *\d+}}, indent: 1, spelling: '' },
 // CHECK:STDOUT:   ]