Просмотр исходного кода

Stitch together adjacent comments using the indent. (#4397)

This is improving the comment production to produce fewer distinct
comments.

At present, comment processing uses strict prefix matching. It either
expects `// ` (with a space) for valid comments, or just `//` (without a
space) for invalid comments that lacked the space.

As a consequence, the following would be three comments:

```
// Comment 1
//
//
// Comment 4
```

This is because a 3-character prefix is used for valid comments. The
prefix switches between lines 1 and 2, and again between lines 3 and 4,
each resulting in a separate comment.

For contrast, this is one comment because only a 2-character prefix is
used:

```
//Comment 1
//
//
//Comment 4
```

That's because all lines lack a trailing space after `//`.

Additionally, with SIMD 16-byte boundaries, further splits can occur if
processing needs to transition to non-SIMD.

Here, I'm trying to just address all of this by:

1. Stitching together adjacent comments. Since a lexed comment starts at
the `//` excluding the indent, the delta from the prior comment must be
precisely the indent.
2. Adding support for switching from SIMD to non-SIMD on file
boundaries.

I considered trying to have a separate `//\n` prefix for SIMD processing
of `// `, but I wasn't sure about the tradeoff of doing both at the same
time (in particular, it'd require constructing a string for the
different prefix), thus this stitch approach. This does mean multiple
passes will be required for a typical long comment structure using blank
comment lines to separate paragraphs (for performance reasons, I will
recommend engineers not write comm... nevermind).

---------

Co-authored-by: Chandler Carruth <chandlerc@gmail.com>
Jon Ross-Perkins 1 год назад
Родитель
Commit
0db96ebc52

+ 2 - 6
toolchain/format/testdata/basics/comments.carbon

@@ -47,11 +47,7 @@ class C {
 // CHECK:STDOUT:  }
 // CHECK:STDOUT: // Another
 // CHECK:STDOUT:   // Block
-// CHECK:STDOUT:
-// CHECK:STDOUT:
-// CHECK:STDOUT: //
-// CHECK:STDOUT:
-// CHECK:STDOUT:
-// CHECK:STDOUT: // Comment
+// CHECK:STDOUT:   //
+// CHECK:STDOUT:   // Comment
 // CHECK:STDOUT:
 // CHECK:STDOUT:

+ 2 - 6
toolchain/lex/lex.cpp

@@ -875,9 +875,7 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     // whitespace, which already is designed to skip over any erroneous text at
     // the end of the line.
     LexVerticalWhitespace(source_text, position);
-    buffer_.comments_.push_back(
-        {.start = comment_start,
-         .length = static_cast<int32_t>(position) - comment_start});
+    buffer_.AddComment(line_info->indent, comment_start, position);
     return;
   }
 
@@ -981,9 +979,7 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     }
   }
 
-  buffer_.comments_.push_back(
-      {.start = comment_start,
-       .length = static_cast<int32_t>(position) - comment_start});
+  buffer_.AddComment(indent, comment_start, position);
 
   // Now compute the indent of this next line before we finish.
   ssize_t line_start = position;

+ 12 - 0
toolchain/lex/tokenized_buffer.cpp

@@ -357,6 +357,18 @@ auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
   return source_->text().substr(comment_data.start, comment_data.length);
 }
 
+auto TokenizedBuffer::AddComment(int32_t indent, int32_t start, int32_t end)
+    -> void {
+  if (!comments_.empty()) {
+    auto& comment = comments_.back();
+    if (comment.start + comment.length + indent == start) {
+      comment.length = end - comment.start;
+      return;
+    }
+  }
+  comments_.push_back({.start = start, .length = end - start});
+}
+
 auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
                                       llvm::StringRef label) const -> void {
   mem_usage.Add(MemUsage::ConcatLabel(label, "allocator_"), allocator_);

+ 6 - 0
toolchain/lex/tokenized_buffer.h

@@ -214,6 +214,8 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
                             CommentIterator(CommentIndex(comments_.size())));
   }
 
+  auto comments_size() const -> size_t { return comments_.size(); }
+
   // This is an upper bound on the number of output parse nodes in the absence
   // of errors.
   auto expected_max_parse_tree_size() const -> int {
@@ -457,6 +459,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto PrintToken(llvm::raw_ostream& output_stream, TokenIndex token,
                   PrintWidths widths) const -> void;
 
+  // Adds a comment. This uses the indent to potentially stitch together two
+  // adjacent comments.
+  auto AddComment(int32_t indent, int32_t start, int32_t end) -> void;
+
   // Used to allocate computed string literals.
   llvm::BumpPtrAllocator allocator_;
 

+ 108 - 0
toolchain/lex/tokenized_buffer_test.cpp

@@ -11,6 +11,7 @@
 #include <iterator>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "testing/base/test_raw_ostream.h"
 #include "toolchain/base/value_store.h"
 #include "toolchain/diagnostics/diagnostic_emitter.h"
@@ -1106,6 +1107,113 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
   compile_helper_.GetTokenizedBuffer("\b", &consumer);
 }
 
+// Appends comment lines to the string, to create a comment block.
+static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
+    -> void {
+  llvm::raw_string_ostream out(str);
+  for (int i : llvm::seq(count)) {
+    out << "// " << tag << i << "\n";
+  }
+}
+
+TEST_F(LexerTest, CommentBlock) {
+  for (int comments_before = 0; comments_before < 5; ++comments_before) {
+    std::string prefix;
+    AppendCommentLines(prefix, comments_before, "B");
+
+    for (int comments_after = 1; comments_after < 5; ++comments_after) {
+      std::string source = prefix;
+      if (comments_before > 0) {
+        source += "//\n";
+      }
+      AppendCommentLines(source, comments_after, "C");
+
+      SCOPED_TRACE(llvm::formatv(
+          "{0} comment lines before the empty comment line, {1} after",
+          comments_before, comments_after));
+
+      auto& buffer = compile_helper_.GetTokenizedBuffer(source);
+      ASSERT_FALSE(buffer.has_errors());
+
+      EXPECT_THAT(buffer.comments_size(), Eq(1));
+    }
+  }
+}
+
+TEST_F(LexerTest, IndentedComments) {
+  for (int indent = 0; indent < 40; ++indent) {
+    SCOPED_TRACE(llvm::formatv("Indent: {0}", indent));
+
+    std::string source;
+    llvm::raw_string_ostream source_stream(source);
+    source_stream.indent(indent);
+    source_stream << "// Comment\n";
+
+    auto& buffer = compile_helper_.GetTokenizedBuffer(source);
+    ASSERT_FALSE(buffer.has_errors());
+    EXPECT_THAT(buffer.comments_size(), Eq(1));
+
+    std::string simd_source =
+        source +
+        "\"Add a bunch of padding so that SIMD logic shouldn't hit EOF\"";
+    // Tokenize the padded variant so the SIMD path is actually exercised;
+    // previously this re-tokenized `source`, leaving `simd_source` unused.
+    auto& simd_buffer = compile_helper_.GetTokenizedBuffer(simd_source);
+    ASSERT_FALSE(simd_buffer.has_errors());
+    EXPECT_THAT(simd_buffer.comments_size(), Eq(1));
+  }
+}
+
+TEST_F(LexerTest, MultipleComments) {
+  constexpr llvm::StringLiteral Format = R"(
+{0}
+  {1}
+
+{2}
+                                                              {3}
+
+'''This is a string, not a comment. The next comment will stop SIMD due to being
+   too close to the EOF.
+   '''
+
+{4}
+x
+)";
+  constexpr llvm::StringLiteral Comments[] = {
+      // NOLINTNEXTLINE(bugprone-suspicious-missing-comma)
+      "// This comment should be possible to parse with SIMD.\n"
+      "// This one too.\n",
+      "// This one as well, though it's a different indent.\n"
+      "        // And mixes indent.\n"
+      "   // And mixes indent more.\n",
+      "// This is one comment:\n"
+      "//Invalid\n"
+      "// Valid\n"
+      "//Invalid\n"
+      "//\n"
+      "// Valid\n"
+      "//\n"
+      "// Valid\n",
+      "// This uses a high indent, which stops SIMD.\n", "//\n"};
+  std::string source = llvm::formatv(Format.data(), Comments[0], Comments[1],
+                                     Comments[2], Comments[3], Comments[4])
+                           .str();
+
+  auto& buffer = compile_helper_.GetTokenizedBuffer(source);
+  EXPECT_TRUE(buffer.has_errors());
+
+  EXPECT_THAT(buffer.comments_size(), Eq(std::size(Comments)));
+  for (int i :
+       llvm::seq(std::min<int>(buffer.comments_size(), std::size(Comments)))) {
+    EXPECT_THAT(buffer.GetCommentText(CommentIndex(i)).str(),
+                testing::StrEq(Comments[i]));
+  }
+  EXPECT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
+                          {.kind = TokenKind::FileStart},
+                          {.kind = TokenKind::StringLiteral},
+                          {.kind = TokenKind::Identifier},
+                          {.kind = TokenKind::FileEnd},
+                      }));
+}
+
 TEST_F(LexerTest, PrintingOutputYaml) {
   // Test that we can parse this into YAML and verify line and indent data.
   auto& buffer =