Просмотр исходного кода

Stitch together adjacent comments using the indent. (#4397)

This is improving the comment production to produce fewer distinct
comments.

At present, comment processing uses strict prefix matching. It either
expects `// ` (with a space) for valid comments, or just `//` (without a
space) for invalid comments that lacked the space.

As a consequence, the following would be three comments:

```
// Comment 1
//
//
// Comment 4
```

This is because a 3-character prefix is used for valid comments. The
prefix switches between lines 1 and 2, and again between lines 3 and 4,
each resulting in a separate comment.

For contrast, this is one comment because only a 2-character prefix is
used:

```
//Comment 1
//
//
//Comment 4
```

That's because all lines lack a trailing space after `//`.

Additionally, with SIMD 16-byte boundaries, further splits can occur if
processing needs to transition to non-SIMD.

Here, I'm trying to just address all of this by:

1. Stitching together adjacent comments. Since a lexed comment starts at
the `//` excluding the indent, the delta from the prior comment must be
precisely the indent.
2. Adding support for switching from SIMD to non-SIMD on file
boundaries.

I considered trying to have a separate `//\n` prefix for SIMD processing
of `// `, but I wasn't sure about the tradeoff of doing both at the same
time (in particular, it'd require constructing a string for the
different prefix), thus this stitch approach. This does mean multiple
passes will be required for a typical long comment structure using blank
comment lines to separate paragraphs (for performance reasons, I will
recommend engineers not write comm... nevermind).

---------

Co-authored-by: Chandler Carruth <chandlerc@gmail.com>
Jon Ross-Perkins 1 год назад
Родитель
Commit
0db96ebc52

+ 2 - 6
toolchain/format/testdata/basics/comments.carbon

@@ -47,11 +47,7 @@ class C {
 // CHECK:STDOUT:  }
 // CHECK:STDOUT: // Another
 // CHECK:STDOUT:   // Block
-// CHECK:STDOUT:
-// CHECK:STDOUT:
-// CHECK:STDOUT: //
-// CHECK:STDOUT:
-// CHECK:STDOUT:
-// CHECK:STDOUT: // Comment
+// CHECK:STDOUT:   //
+// CHECK:STDOUT:   // Comment
 // CHECK:STDOUT:
 // CHECK:STDOUT:

+ 2 - 6
toolchain/lex/lex.cpp

@@ -875,9 +875,7 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     // whitespace, which already is designed to skip over any erroneous text at
     // the end of the line.
     LexVerticalWhitespace(source_text, position);
-    buffer_.comments_.push_back(
-        {.start = comment_start,
-         .length = static_cast<int32_t>(position) - comment_start});
+    buffer_.AddComment(line_info->indent, comment_start, position);
     return;
   }
 
@@ -981,9 +979,7 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     }
   }
 
-  buffer_.comments_.push_back(
-      {.start = comment_start,
-       .length = static_cast<int32_t>(position) - comment_start});
+  buffer_.AddComment(indent, comment_start, position);
 
   // Now compute the indent of this next line before we finish.
   ssize_t line_start = position;

+ 12 - 0
toolchain/lex/tokenized_buffer.cpp

@@ -357,6 +357,18 @@ auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
   return source_->text().substr(comment_data.start, comment_data.length);
 }
 
+auto TokenizedBuffer::AddComment(int32_t indent, int32_t start, int32_t end)
+    -> void {
+  if (!comments_.empty()) {
+    auto& comment = comments_.back();
+    if (comment.start + comment.length + indent == start) {
+      comment.length = end - comment.start;
+      return;
+    }
+  }
+  comments_.push_back({.start = start, .length = end - start});
+}
+
 auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
                                       llvm::StringRef label) const -> void {
   mem_usage.Add(MemUsage::ConcatLabel(label, "allocator_"), allocator_);

+ 6 - 0
toolchain/lex/tokenized_buffer.h

@@ -214,6 +214,8 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
                             CommentIterator(CommentIndex(comments_.size())));
   }
 
+  auto comments_size() const -> size_t { return comments_.size(); }
+
   // This is an upper bound on the number of output parse nodes in the absence
   // of errors.
   auto expected_max_parse_tree_size() const -> int {
@@ -457,6 +459,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                   PrintWidths widths) const -> void;
 
+  // Adds a comment. This uses the indent to potentially stitch together two
+  // adjacent comments.
+  auto AddComment(int32_t indent, int32_t start, int32_t end) -> void;
+
   // Used to allocate computed string literals.
   llvm::BumpPtrAllocator allocator_;
 

+ 108 - 0
toolchain/lex/tokenized_buffer_test.cpp

@@ -11,6 +11,7 @@
 #include <iterator>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "testing/base/test_raw_ostream.h"
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
@@ -1106,6 +1107,113 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
   compile_helper_.GetTokenizedBuffer("\b", &consumer);
 }
 
+// Appends comment lines to the string, to create a comment block.
+static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
+    -> void {
+  llvm::raw_string_ostream out(str);
+  for (int i : llvm::seq(count)) {
+    out << "// " << tag << i << "\n";
+  }
+}
+
+TEST_F(LexerTest, CommentBlock) {
+  for (int comments_before = 0; comments_before < 5; ++comments_before) {
+    std::string prefix;
+    AppendCommentLines(prefix, comments_before, "B");
+
+    for (int comments_after = 1; comments_after < 5; ++comments_after) {
+      std::string source = prefix;
+      if (comments_before > 0) {
+        source += "//\n";
+      }
+      AppendCommentLines(source, comments_after, "C");
+
+      SCOPED_TRACE(llvm::formatv(
+          "{0} comment lines before the empty comment line, {1} after",
+          comments_before, comments_after));
+
+      auto& buffer = compile_helper_.GetTokenizedBuffer(source);
+      ASSERT_FALSE(buffer.has_errors());
+
+      EXPECT_THAT(buffer.comments_size(), Eq(1));
+    }
+  }
+}
+
+TEST_F(LexerTest, IndentedComments) {
+  for (int indent = 0; indent < 40; ++indent) {
+    SCOPED_TRACE(llvm::formatv("Indent: {0}", indent));
+
+    std::string source;
+    llvm::raw_string_ostream source_stream(source);
+    source_stream.indent(indent);
+    source_stream << "// Comment\n";
+
+    auto& buffer = compile_helper_.GetTokenizedBuffer(source);
+    ASSERT_FALSE(buffer.has_errors());
+    EXPECT_THAT(buffer.comments_size(), Eq(1));
+
+    std::string simd_source =
+        source +
+        "\"Add a bunch of padding so that SIMD logic shouldn't hit EOF\"";
+    // Tokenize the padded variant so the SIMD path is actually exercised;
+    // previously this re-tokenized `source`, leaving `simd_source` unused.
+    auto& simd_buffer = compile_helper_.GetTokenizedBuffer(simd_source);
+    ASSERT_FALSE(simd_buffer.has_errors());
+    EXPECT_THAT(simd_buffer.comments_size(), Eq(1));
+  }
+}
+
+TEST_F(LexerTest, MultipleComments) {
+  constexpr llvm::StringLiteral Format = R"(
+{0}
+  {1}
+
+{2}
+                                                              {3}
+
+'''This is a string, not a comment. The next comment will stop SIMD due to being
+   too close to the EOF.
+   '''
+
+{4}
+x
+)";
+  constexpr llvm::StringLiteral Comments[] = {
+      // NOLINTNEXTLINE(bugprone-suspicious-missing-comma)
+      "// This comment should be possible to parse with SIMD.\n"
+      "// This one too.\n",
+      "// This one as well, though it's a different indent.\n"
+      "        // And mixes indent.\n"
+      "   // And mixes indent more.\n",
+      "// This is one comment:\n"
+      "//Invalid\n"
+      "// Valid\n"
+      "//Invalid\n"
+      "//\n"
+      "// Valid\n"
+      "//\n"
+      "// Valid\n",
+      "// This uses a high indent, which stops SIMD.\n", "//\n"};
+  std::string source = llvm::formatv(Format.data(), Comments[0], Comments[1],
+                                     Comments[2], Comments[3], Comments[4])
+                           .str();
+
+  auto& buffer = compile_helper_.GetTokenizedBuffer(source);
+  EXPECT_TRUE(buffer.has_errors());
+
+  EXPECT_THAT(buffer.comments_size(), Eq(std::size(Comments)));
+  for (int i :
+       llvm::seq(std::min<int>(buffer.comments_size(), std::size(Comments)))) {
+    EXPECT_THAT(buffer.GetCommentText(CommentIndex(i)).str(),
+                testing::StrEq(Comments[i]));
+  }
+  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
+                          {.kind = TokenKind::FileStart},
+                          {.kind = TokenKind::StringLiteral},
+                          {.kind = TokenKind::Identifier},
+                          {.kind = TokenKind::FileEnd},
+                      }));
+}
+
 TEST_F(LexerTest, PrintingOutputYaml) {
   // Test that we can parse this into YAML and verify line and indent data.
   auto& buffer =