Browse Source

Initial support for CR+LF (DOS / Windows) line endings. (#4056)

It turns out we can make these work with very minimal complexity because
the LF is still in the right place either way. This also lets us easily
support mixtures of LF and CR+LF line endings gracefully. We create the
line structures around the LF bytes and then have the byte-dispatch loop
notice a CR followed by an LF and skip to the LF behavior.

Rather than add the remaining complexity around supporting bare CR and
LF+CR sequences (both of which are quite rare now), this just adds
diagnostics when we encounter a CR byte that won't fall out of our CR+LF
handling. This is a better experience for users than the alternative. We
still have a TODO to handle the full complexity of vertical whitespace,
but I've updated it to reflect that the common case should be handled
already.

This isn't complete though: we need to add support in string literal
lexing, and we need to teach the diagnostic rendering to handle the
error messages above better. But those will be future PRs; this is
enough to unblock folks who happen to edit a Carbon source file with
Notepad on Windows, which seems important.

---------

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Chandler Carruth 1 year ago
parent
commit
565fc5cebb

+ 2 - 0
toolchain/diagnostics/diagnostic_kind.def

@@ -48,6 +48,8 @@ CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
 CARBON_DIAGNOSTIC_KIND(UnicodeEscapeTooLarge)
 CARBON_DIAGNOSTIC_KIND(UnknownBaseSpecifier)
 CARBON_DIAGNOSTIC_KIND(UnknownEscapeSequence)
+CARBON_DIAGNOSTIC_KIND(UnsupportedCRLineEnding)
+CARBON_DIAGNOSTIC_KIND(UnsupportedLFCRLineEnding)
 CARBON_DIAGNOSTIC_KIND(UnmatchedOpening)
 CARBON_DIAGNOSTIC_KIND(UnmatchedClosing)
 CARBON_DIAGNOSTIC_KIND(UnrecognizedCharacters)

+ 44 - 5
toolchain/lex/lex.cpp

@@ -115,6 +115,8 @@ class [[clang::internal_linkage]] Lexer {
   auto LexVerticalWhitespace(llvm::StringRef source_text, ssize_t& position)
       -> void;
 
+  auto LexCR(llvm::StringRef source_text, ssize_t& position) -> void;
+
   auto LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
       -> void;
 
@@ -502,6 +504,7 @@ CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexClosing)
   }
 CARBON_DISPATCH_LEX_NON_TOKEN(LexHorizontalWhitespace)
 CARBON_DISPATCH_LEX_NON_TOKEN(LexVerticalWhitespace)
+CARBON_DISPATCH_LEX_NON_TOKEN(LexCR)
 CARBON_DISPATCH_LEX_NON_TOKEN(LexCommentOrSlash)
 
 // Build a table of function pointers that we can use to dispatch to the
@@ -598,6 +601,7 @@ static constexpr auto MakeDispatchTable() -> DispatchTableT {
   table[' '] = &DispatchLexHorizontalWhitespace;
   table['\t'] = &DispatchLexHorizontalWhitespace;
   table['\n'] = &DispatchLexVerticalWhitespace;
+  table['\r'] = &DispatchLexCR;
 
   return table;
 }
@@ -647,11 +651,19 @@ auto Lexer::MakeLines(llvm::StringRef source_text) -> void {
   // carefully selected variables and the `ssize_t` type for performance and
   // code size of this hot loop.
   //
-  // TODO: Eventually, we'll likely need to roll our own SIMD-optimized
-  // routine here in order to handle CR+LF line endings, as we'll want those
-  // to stay on the fast path. We'll also need to detect and diagnose Unicode
-  // vertical whitespace. Starting with `memchr` should give us a strong
-  // baseline performance target when adding those features.
+  // Note that the `memchr` approach here works equally well for LF and CR+LF
+  // line endings. Either way, it finds the end of the line and the start of the
+  // next line. The lexer below will find the CR byte and peek to see the
+  // following LF and jump to the next line correctly. However, this approach
+  // does *not* support plain CR or LF+CR line endings. Nor does it support
+  // vertical tab or other vertical whitespace.
+  //
+  // TODO: Eventually, we should extend this to have correct fallback support
+  // for handling CR, LF+CR, vertical tab, and other esoteric vertical
+  // whitespace as line endings. Notably, including *mixtures* of them. This
+  // will likely be somewhat tricky as even detecting their absence without
+  // performance overhead and without a custom scanner here rather than memchr
+  // is likely to be difficult.
   const char* const text = source_text.data();
   const ssize_t size = source_text.size();
   ssize_t start = 0;
@@ -708,6 +720,33 @@ auto Lexer::LexVerticalWhitespace(llvm::StringRef source_text,
   line_info->indent = position - line_start;
 }
 
+auto Lexer::LexCR(llvm::StringRef source_text, ssize_t& position) -> void {
+  if (LLVM_LIKELY((position + 1) < static_cast<ssize_t>(source_text.size())) &&
+      LLVM_LIKELY(source_text[position + 1] == '\n')) {
+    // Skip to the vertical whitespace path, it will skip over both CR and LF.
+    LexVerticalWhitespace(source_text, position);
+    return;
+  }
+
+  CARBON_DIAGNOSTIC(UnsupportedLFCRLineEnding, Error,
+                    "The LF+CR line ending is not supported, only LF and CR+LF "
+                    "are supported.");
+  CARBON_DIAGNOSTIC(UnsupportedCRLineEnding, Error,
+                    "A raw CR line ending is not supported, only LF and CR+LF "
+                    "are supported.");
+  bool is_lfcr = position > 0 && source_text[position - 1] == '\n';
+  // TODO: This diagnostic has an unfortunate snippet -- we should tweak the
+  // snippet rendering to gracefully handle CRs.
+  emitter_.Emit(source_text.begin() + position,
+                is_lfcr ? UnsupportedLFCRLineEnding : UnsupportedCRLineEnding);
+
+  // Recover by treating the CR as a horizontal whitespace. This should make our
+  // whitespace rules largely work and parse cleanly without disrupting the line
+  // tracking data structures that were pre-built.
+  NoteWhitespace();
+  ++position;
+}
+
 auto Lexer::LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
     -> void {
   CARBON_DCHECK(source_text[position] == '/');

+ 82 - 0
toolchain/lex/tokenized_buffer_test.cpp

@@ -101,6 +101,88 @@ TEST_F(LexerTest, TracksLinesAndColumns) {
       }));
 }
 
+TEST_F(LexerTest, TracksLinesAndColumnsCRLF) {
+  auto buffer =
+      Lex("\r\n  ;;\r\n   ;;;\r\n   x\"foo\" '''baz\r\n  a\r\n ''' y");
+  EXPECT_FALSE(buffer.has_errors());
+  EXPECT_THAT(
+      buffer,
+      HasTokens(llvm::ArrayRef<ExpectedToken>{
+          {.kind = TokenKind::FileStart,
+           .line = 1,
+           .column = 1,
+           .indent_column = 1},
+          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 3},
+          {.kind = TokenKind::Semi, .line = 2, .column = 4, .indent_column = 3},
+          {.kind = TokenKind::Semi, .line = 3, .column = 4, .indent_column = 4},
+          {.kind = TokenKind::Semi, .line = 3, .column = 5, .indent_column = 4},
+          {.kind = TokenKind::Semi, .line = 3, .column = 6, .indent_column = 4},
+          {.kind = TokenKind::Identifier,
+           .line = 4,
+           .column = 4,
+           .indent_column = 4,
+           .text = "x"},
+          {.kind = TokenKind::StringLiteral,
+           .line = 4,
+           .column = 5,
+           .indent_column = 4},
+          {.kind = TokenKind::StringLiteral,
+           .line = 4,
+           .column = 11,
+           .indent_column = 4},
+          {.kind = TokenKind::Identifier,
+           .line = 6,
+           .column = 6,
+           .indent_column = 11,
+           .text = "y"},
+          {.kind = TokenKind::FileEnd, .line = 6, .column = 7},
+      }));
+}
+
+TEST_F(LexerTest, InvalidCR) {
+  auto buffer = Lex("\n ;;\r ;\n   x");
+  EXPECT_TRUE(buffer.has_errors());
+  EXPECT_THAT(
+      buffer,
+      HasTokens(llvm::ArrayRef<ExpectedToken>{
+          {.kind = TokenKind::FileStart,
+           .line = 1,
+           .column = 1,
+           .indent_column = 1},
+          {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 2, .column = 6, .indent_column = 2},
+          {.kind = TokenKind::Identifier,
+           .line = 3,
+           .column = 4,
+           .indent_column = 4,
+           .text = "x"},
+          {.kind = TokenKind::FileEnd, .line = 3, .column = 5},
+      }));
+}
+
+TEST_F(LexerTest, InvalidLFCR) {
+  auto buffer = Lex("\n ;;\n\r ;\n   x");
+  EXPECT_TRUE(buffer.has_errors());
+  EXPECT_THAT(
+      buffer,
+      HasTokens(llvm::ArrayRef<ExpectedToken>{
+          {.kind = TokenKind::FileStart,
+           .line = 1,
+           .column = 1,
+           .indent_column = 1},
+          {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 3, .column = 3, .indent_column = 1},
+          {.kind = TokenKind::Identifier,
+           .line = 4,
+           .column = 4,
+           .indent_column = 4,
+           .text = "x"},
+          {.kind = TokenKind::FileEnd, .line = 4, .column = 5},
+      }));
+}
+
 TEST_F(LexerTest, HandlesNumericLiteral) {
   auto buffer = Lex("12-578\n  1  2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
   EXPECT_FALSE(buffer.has_errors());