Browse Source

Initial support for CR+LF (DOS / Windows) line endings. (#4056)

It turns out we can make these work with very minimal complexity because
the LF is still in the right place either way. This also lets us easily
support mixtures of LF and CR+LF line endings gracefully. We create the
line structures around the LF bytes and then have the byte-dispatch loop
notice a CR followed by an LF and skip to the LF behavior.

Rather than add the remaining complexity around supporting bare CR and
LF+CR sequences (both of which are quite rare now), this just adds
diagnostics when we encounter a CR byte that won't fall out of our CR+LF
handling. This is a better experience for users than the alternative. We
still have a TODO to handle the full complexity of vertical whitespace,
but I've updated it to reflect that the common case should be handled
already.

This isn't complete though: we need to add support in string literal
lexing, and we need to teach the diagnostic rendering to handle the
error messages above better. But those will be future PRs; this is
enough to unblock folks who happen to edit a Carbon source file with
Notepad on Windows, which seems important.

---------

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Chandler Carruth 1 year ago
parent
commit
565fc5cebb

+ 2 - 0
toolchain/diagnostics/diagnostic_kind.def

@@ -48,6 +48,8 @@ CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
 CARBON_DIAGNOSTIC_KIND(UnicodeEscapeTooLarge)
 CARBON_DIAGNOSTIC_KIND(UnknownBaseSpecifier)
 CARBON_DIAGNOSTIC_KIND(UnknownEscapeSequence)
+CARBON_DIAGNOSTIC_KIND(UnsupportedCRLineEnding)
+CARBON_DIAGNOSTIC_KIND(UnsupportedLFCRLineEnding)
 CARBON_DIAGNOSTIC_KIND(UnmatchedOpening)
 CARBON_DIAGNOSTIC_KIND(UnmatchedClosing)
 CARBON_DIAGNOSTIC_KIND(UnrecognizedCharacters)

+ 44 - 5
toolchain/lex/lex.cpp

@@ -115,6 +115,8 @@ class [[clang::internal_linkage]] Lexer {
   auto LexVerticalWhitespace(llvm::StringRef source_text, ssize_t& position)
       -> void;
 
+  auto LexCR(llvm::StringRef source_text, ssize_t& position) -> void;
+
   auto LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
       -> void;
 
@@ -502,6 +504,7 @@ CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexClosing)
   }
 CARBON_DISPATCH_LEX_NON_TOKEN(LexHorizontalWhitespace)
 CARBON_DISPATCH_LEX_NON_TOKEN(LexVerticalWhitespace)
+CARBON_DISPATCH_LEX_NON_TOKEN(LexCR)
 CARBON_DISPATCH_LEX_NON_TOKEN(LexCommentOrSlash)
 
 // Build a table of function pointers that we can use to dispatch to the
@@ -598,6 +601,7 @@ static constexpr auto MakeDispatchTable() -> DispatchTableT {
   table[' '] = &DispatchLexHorizontalWhitespace;
   table['\t'] = &DispatchLexHorizontalWhitespace;
   table['\n'] = &DispatchLexVerticalWhitespace;
+  table['\r'] = &DispatchLexCR;
 
   return table;
 }
@@ -647,11 +651,19 @@ auto Lexer::MakeLines(llvm::StringRef source_text) -> void {
   // carefully selected variables and the `ssize_t` type for performance and
   // code size of this hot loop.
   //
-  // TODO: Eventually, we'll likely need to roll our own SIMD-optimized
-  // routine here in order to handle CR+LF line endings, as we'll want those
-  // to stay on the fast path. We'll also need to detect and diagnose Unicode
-  // vertical whitespace. Starting with `memchr` should give us a strong
-  // baseline performance target when adding those features.
+  // Note that the `memchr` approach here works equally well for LF and CR+LF
+  // line endings. Either way, it finds the end of the line and the start of the
+  // next line. The lexer below will find the CR byte and peek to see the
+  // following LF and jump to the next line correctly. However, this approach
+  // does *not* support plain CR or LF+CR line endings. Nor does it support
+  // vertical tab or other vertical whitespace.
+  //
+  // TODO: Eventually, we should extend this to have correct fallback support
+  // for handling CR, LF+CR, vertical tab, and other esoteric vertical
+  // whitespace as line endings. Notably, including *mixtures* of them. This
+  // will likely be somewhat tricky as even detecting their absence without
+  // performance overhead and without a custom scanner here rather than memchr
+  // is likely to be difficult.
   const char* const text = source_text.data();
   const ssize_t size = source_text.size();
   ssize_t start = 0;
@@ -708,6 +720,33 @@ auto Lexer::LexVerticalWhitespace(llvm::StringRef source_text,
   line_info->indent = position - line_start;
 }
 
+auto Lexer::LexCR(llvm::StringRef source_text, ssize_t& position) -> void {
+  if (LLVM_LIKELY((position + 1) < static_cast<ssize_t>(source_text.size())) &&
+      LLVM_LIKELY(source_text[position + 1] == '\n')) {
+    // Skip to the vertical whitespace path, it will skip over both CR and LF.
+    LexVerticalWhitespace(source_text, position);
+    return;
+  }
+
+  CARBON_DIAGNOSTIC(UnsupportedLFCRLineEnding, Error,
+                    "The LF+CR line ending is not supported, only LF and CR+LF "
+                    "are supported.");
+  CARBON_DIAGNOSTIC(UnsupportedCRLineEnding, Error,
+                    "A raw CR line ending is not supported, only LF and CR+LF "
+                    "are supported.");
+  bool is_lfcr = position > 0 && source_text[position - 1] == '\n';
+  // TODO: This diagnostic has an unfortunate snippet -- we should tweak the
+  // snippet rendering to gracefully handle CRs.
+  emitter_.Emit(source_text.begin() + position,
+                is_lfcr ? UnsupportedLFCRLineEnding : UnsupportedCRLineEnding);
+
+  // Recover by treating the CR as a horizontal whitespace. This should make our
+  // whitespace rules largely work and parse cleanly without disrupting the line
+  // tracking data structures that were pre-built.
+  NoteWhitespace();
+  ++position;
+}
+
 auto Lexer::LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
     -> void {
   CARBON_DCHECK(source_text[position] == '/');

+ 82 - 0
toolchain/lex/tokenized_buffer_test.cpp

@@ -101,6 +101,88 @@ TEST_F(LexerTest, TracksLinesAndColumns) {
       }));
 }
 
+TEST_F(LexerTest, TracksLinesAndColumnsCRLF) {
+  auto buffer =
+      Lex("\r\n  ;;\r\n   ;;;\r\n   x\"foo\" '''baz\r\n  a\r\n ''' y");
+  EXPECT_FALSE(buffer.has_errors());
+  EXPECT_THAT(
+      buffer,
+      HasTokens(llvm::ArrayRef<ExpectedToken>{
+          {.kind = TokenKind::FileStart,
+           .line = 1,
+           .column = 1,
+           .indent_column = 1},
+          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 3},
+          {.kind = TokenKind::Semi, .line = 2, .column = 4, .indent_column = 3},
+          {.kind = TokenKind::Semi, .line = 3, .column = 4, .indent_column = 4},
+          {.kind = TokenKind::Semi, .line = 3, .column = 5, .indent_column = 4},
+          {.kind = TokenKind::Semi, .line = 3, .column = 6, .indent_column = 4},
+          {.kind = TokenKind::Identifier,
+           .line = 4,
+           .column = 4,
+           .indent_column = 4,
+           .text = "x"},
+          {.kind = TokenKind::StringLiteral,
+           .line = 4,
+           .column = 5,
+           .indent_column = 4},
+          {.kind = TokenKind::StringLiteral,
+           .line = 4,
+           .column = 11,
+           .indent_column = 4},
+          {.kind = TokenKind::Identifier,
+           .line = 6,
+           .column = 6,
+           .indent_column = 11,
+           .text = "y"},
+          {.kind = TokenKind::FileEnd, .line = 6, .column = 7},
+      }));
+}
+
+TEST_F(LexerTest, InvalidCR) {
+  auto buffer = Lex("\n ;;\r ;\n   x");
+  EXPECT_TRUE(buffer.has_errors());
+  EXPECT_THAT(
+      buffer,
+      HasTokens(llvm::ArrayRef<ExpectedToken>{
+          {.kind = TokenKind::FileStart,
+           .line = 1,
+           .column = 1,
+           .indent_column = 1},
+          {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 2, .column = 6, .indent_column = 2},
+          {.kind = TokenKind::Identifier,
+           .line = 3,
+           .column = 4,
+           .indent_column = 4,
+           .text = "x"},
+          {.kind = TokenKind::FileEnd, .line = 3, .column = 5},
+      }));
+}
+
+TEST_F(LexerTest, InvalidLFCR) {
+  auto buffer = Lex("\n ;;\n\r ;\n   x");
+  EXPECT_TRUE(buffer.has_errors());
+  EXPECT_THAT(
+      buffer,
+      HasTokens(llvm::ArrayRef<ExpectedToken>{
+          {.kind = TokenKind::FileStart,
+           .line = 1,
+           .column = 1,
+           .indent_column = 1},
+          {.kind = TokenKind::Semi, .line = 2, .column = 2, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 2, .column = 3, .indent_column = 2},
+          {.kind = TokenKind::Semi, .line = 3, .column = 3, .indent_column = 1},
+          {.kind = TokenKind::Identifier,
+           .line = 4,
+           .column = 4,
+           .indent_column = 4,
+           .text = "x"},
+          {.kind = TokenKind::FileEnd, .line = 4, .column = 5},
+      }));
+}
+
 TEST_F(LexerTest, HandlesNumericLiteral) {
   auto buffer = Lex("12-578\n  1  2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
   EXPECT_FALSE(buffer.has_errors());