Просмотр исходного кода

Add tracking of lexed comments, with skeletal formatting. (#4385)

In order to format comments, it's helpful if they're tracked. This
tracks them separately from tokens in order to avoid interfering with
parse; it'd be inconvenient if comment tokens could show up in arbitrary
locations, albeit possible to support.

This additionally extracts the TokenIterator support into a template
in order to make it generally available for IndexBase types. I'm only
adding it for CommentIndex, not sure if we'll want it elsewhere, but this
structure still felt like a good fit.
Jon Ross-Perkins 1 год назад
Родитель
Сommit
1338f9e0ad

+ 1 - 0
.codespell_ignore

@@ -12,6 +12,7 @@ crossreference
 falsy
 forin
 groupt
+indext
 inout
 parameteras
 pullrequest

+ 49 - 0
toolchain/base/index_base.h

@@ -7,8 +7,11 @@
 
 #include <compare>
 #include <concepts>
+#include <iterator>
+#include <type_traits>
 
 #include "common/ostream.h"
+#include "llvm/ADT/iterator.h"
 
 namespace Carbon {
 
@@ -74,6 +77,52 @@ auto operator<=>(IndexType lhs, IndexType rhs) -> std::strong_ordering {
   return lhs.index <=> rhs.index;
 }
 
+// A random-access iterator over the IndexBase-derived indices of an array.
+template <typename IndexT>
+class IndexIterator
+    : public llvm::iterator_facade_base<IndexIterator<IndexT>,
+                                        std::random_access_iterator_tag,
+                                        const IndexT, int>,
+      public Printable<IndexIterator<IndexT>> {
+ public:
+  IndexIterator() = delete;  // An index is always required.
+
+  explicit IndexIterator(IndexT index) : index_(index) {}
+
+  auto operator==(const IndexIterator& rhs) const -> bool {
+    return index_ == rhs.index_;
+  }
+  auto operator<=>(const IndexIterator& rhs) const -> std::strong_ordering {
+    return index_ <=> rhs.index_;
+  }
+
+  auto operator*() const -> const IndexT& { return index_; }
+
+  using llvm::iterator_facade_base<IndexIterator,  // Keeps `it - n` visible.
+                                   std::random_access_iterator_tag,
+                                   const IndexT, int>::operator-;
+  auto operator-(const IndexIterator& rhs) const -> int {  // Index distance.
+    return index_.index - rhs.index_.index;
+  }
+
+  auto operator+=(int n) -> IndexIterator& {
+    index_.index += n;
+    return *this;
+  }
+  auto operator-=(int n) -> IndexIterator& {
+    index_.index -= n;
+    return *this;
+  }
+
+  // Prints the raw index.
+  auto Print(llvm::raw_ostream& output) const -> void {
+    output << index_.index;
+  }
+
+ private:
+  IndexT index_;
+};
+
 }  // namespace Carbon
 
 #endif  // CARBON_TOOLCHAIN_BASE_INDEX_BASE_H_

+ 12 - 0
toolchain/format/format.cpp

@@ -16,8 +16,20 @@ auto Format(const Lex::TokenizedBuffer& tokens, llvm::raw_ostream& out)
     // TODO: Error recovery.
     return false;
   }
+
+  auto comments = tokens.comments();
+  auto comment_it = comments.begin();
+
   llvm::ListSeparator sep(" ");
+
   for (auto token : tokens.tokens()) {
+    while (comment_it != comments.end() &&
+           tokens.IsAfterComment(token, *comment_it)) {
+      // TODO: Fix newlines and indent.
+      out << "\n" << tokens.GetCommentText(*comment_it) << "\n";
+      ++comment_it;
+    }
+
     switch (tokens.GetKind(token)) {
       case Lex::TokenKind::FileStart:
         break;

+ 57 - 0
toolchain/format/testdata/basics/comments.carbon

@@ -0,0 +1,57 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AUTOUPDATE
+// TIP: To test this file alone, run:
+// TIP:   bazel test //toolchain/testing:file_test --test_arg=--file_tests=toolchain/format/testdata/basics/comments.carbon
+// TIP: To dump output, run:
+// TIP:   bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/format/testdata/basics/comments.carbon
+
+// --- test.carbon
+
+// A comment
+fn F() {}
+
+// Another comment
+
+  // Block
+  // comment
+
+
+class C {
+    // Internal comment
+}
+
+
+  // Another
+  // Block
+  //
+  // Comment
+
+// --- AUTOUPDATE-SPLIT
+
+// CHECK:STDOUT:
+// CHECK:STDOUT: // A comment
+// CHECK:STDOUT:
+// CHECK:STDOUT: fn F ( ) { }
+// CHECK:STDOUT: // Another comment
+// CHECK:STDOUT:
+// CHECK:STDOUT:
+// CHECK:STDOUT: // Block
+// CHECK:STDOUT:   // comment
+// CHECK:STDOUT:
+// CHECK:STDOUT:  class C {
+// CHECK:STDOUT: // Internal comment
+// CHECK:STDOUT:
+// CHECK:STDOUT:  }
+// CHECK:STDOUT: // Another
+// CHECK:STDOUT:   // Block
+// CHECK:STDOUT:
+// CHECK:STDOUT:
+// CHECK:STDOUT: //
+// CHECK:STDOUT:
+// CHECK:STDOUT:
+// CHECK:STDOUT: // Comment
+// CHECK:STDOUT:
+// CHECK:STDOUT:

+ 0 - 1
toolchain/format/testdata/basics/fail_invalid_comment.carbon

@@ -8,7 +8,6 @@
 // TIP: To dump output, run:
 // TIP:   bazel run //toolchain/testing:file_test -- --dump_output --file_tests=toolchain/format/testdata/basics/fail_invalid_comment.carbon
 
-
 // --- fail_test.carbon
 
 //f

+ 8 - 0
toolchain/lex/lex.cpp

@@ -860,6 +860,7 @@ auto Lexer::LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
 
 auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
   CARBON_DCHECK(source_text.substr(position).starts_with("//"));
+  int32_t comment_start = position;
 
   // Any comment must be the only non-whitespace on the line.
   const auto* line_info = current_line_info();
@@ -874,6 +875,9 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     // whitespace, which already is designed to skip over any erroneous text at
     // the end of the line.
     LexVerticalWhitespace(source_text, position);
+    buffer_.comments_.push_back(
+        {.start = comment_start,
+         .length = static_cast<int32_t>(position) - comment_start});
     return;
   }
 
@@ -977,6 +981,10 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     }
   }
 
+  buffer_.comments_.push_back(
+      {.start = comment_start,
+       .length = static_cast<int32_t>(position) - comment_start});
+
   // Now compute the indent of this next line before we finish.
   ssize_t line_start = position;
   SkipHorizontalWhitespace(source_text, position);

+ 13 - 4
toolchain/lex/tokenized_buffer.cpp

@@ -345,15 +345,24 @@ auto TokenizedBuffer::AddLine(LineInfo info) -> LineIndex {
   return LineIndex(static_cast<int>(line_infos_.size()) - 1);
 }
 
+auto TokenizedBuffer::IsAfterComment(TokenIndex token,
+                                     CommentIndex comment_index) const -> bool {
+  const auto& comment_data = comments_[comment_index.index];
+  return GetTokenInfo(token).byte_offset() > comment_data.start;
+}
+
+auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
+    -> llvm::StringRef {
+  const auto& comment_data = comments_[comment_index.index];
+  return source_->text().substr(comment_data.start, comment_data.length);
+}
+
 auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
                                       llvm::StringRef label) const -> void {
   mem_usage.Add(MemUsage::ConcatLabel(label, "allocator_"), allocator_);
   mem_usage.Add(MemUsage::ConcatLabel(label, "token_infos_"), token_infos_);
   mem_usage.Add(MemUsage::ConcatLabel(label, "line_infos_"), line_infos_);
-}
-
-auto TokenIterator::Print(llvm::raw_ostream& output) const -> void {
-  output << token_.index;
+  mem_usage.Add(MemUsage::ConcatLabel(label, "comments_"), comments_);
 }
 
 auto TokenizedBuffer::SourceBufferDiagnosticConverter::ConvertLoc(

+ 55 - 43
toolchain/lex/tokenized_buffer.h

@@ -5,15 +5,12 @@
 #ifndef CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
 #define CARBON_TOOLCHAIN_LEX_TOKENIZED_BUFFER_H_
 
-#include <compare>
 #include <cstdint>
-#include <iterator>
 
 #include "common/ostream.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/raw_ostream.h"
@@ -45,50 +42,21 @@ struct LineIndex : public IndexBase {
   using IndexBase::IndexBase;
 };
 
-constexpr LineIndex LineIndex::Invalid(LineIndex::InvalidIndex);
+constexpr LineIndex LineIndex::Invalid(InvalidIndex);
 
-// Random-access iterator over tokens within the buffer.
-class TokenIterator
-    : public llvm::iterator_facade_base<TokenIterator,
-                                        std::random_access_iterator_tag,
-                                        const TokenIndex, int>,
-      public Printable<TokenIterator> {
- public:
-  TokenIterator() = delete;
-
-  explicit TokenIterator(TokenIndex token) : token_(token) {}
-
-  auto operator==(const TokenIterator& rhs) const -> bool {
-    return token_ == rhs.token_;
-  }
-  auto operator<=>(const TokenIterator& rhs) const -> std::strong_ordering {
-    return token_ <=> rhs.token_;
-  }
-
-  auto operator*() const -> const TokenIndex& { return token_; }
-
-  using iterator_facade_base::operator-;
-  auto operator-(const TokenIterator& rhs) const -> int {
-    return token_.index - rhs.token_.index;
-  }
-
-  auto operator+=(int n) -> TokenIterator& {
-    token_.index += n;
-    return *this;
-  }
-  auto operator-=(int n) -> TokenIterator& {
-    token_.index -= n;
-    return *this;
-  }
+// Indices for comments within the buffer.
+struct CommentIndex : public IndexBase {
+  static const CommentIndex Invalid;
+  using IndexBase::IndexBase;
+};
 
-  // Prints the raw token index.
-  auto Print(llvm::raw_ostream& output) const -> void;
+constexpr CommentIndex CommentIndex::Invalid(InvalidIndex);
 
- private:
-  friend class TokenizedBuffer;
+// Random-access iterator over comments within the buffer.
+using CommentIterator = IndexIterator<CommentIndex>;
 
-  TokenIndex token_;
-};
+// Random-access iterator over tokens within the buffer.
+using TokenIterator = IndexIterator<TokenIndex>;
 
 // A diagnostic location converter that maps token locations into source
 // buffer locations.
@@ -115,6 +83,21 @@ class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
 // `HasError` returning true.
 class TokenizedBuffer : public Printable<TokenizedBuffer> {
  public:
+  // A comment, which can be a block of lines.
+  //
+  // This is the API version of `CommentData`.
+  struct CommentInfo {
+    // The comment's full text, including `//` symbols. This may have several
+    // lines for block comments.
+    llvm::StringRef text;
+
+    // The comment's indent.
+    int32_t indent;
+
+    // The first line of the comment.
+    LineIndex start_line;
+  };
+
   auto GetKind(TokenIndex token) const -> TokenKind;
   auto GetLine(TokenIndex token) const -> LineIndex;
 
@@ -179,6 +162,13 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   // Returns the previous line handle.
   auto GetPrevLine(LineIndex line) const -> LineIndex;
 
+  // Returns true if the token starts after the start of the comment.
+  auto IsAfterComment(TokenIndex token, CommentIndex comment_index) const
+      -> bool;
+
+  // Returns the comment's full text, including `//` symbols.
+  auto GetCommentText(CommentIndex comment_index) const -> llvm::StringRef;
+
   // Prints a description of the tokenized stream to the provided `raw_ostream`.
   //
   // It prints one line of information for each token in the buffer, including
@@ -219,6 +209,11 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
   auto size() const -> int { return token_infos_.size(); }
 
+  auto comments() const -> llvm::iterator_range<CommentIterator> {
+    return llvm::make_range(CommentIterator(CommentIndex(0)),
+                            CommentIterator(CommentIndex(comments_.size())));
+  }
+
   // This is an upper bound on the number of output parse nodes in the absence
   // of errors.
   auto expected_max_parse_tree_size() const -> int {
@@ -418,6 +413,20 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   static_assert(sizeof(TokenInfo) == 8,
                 "Expected `TokenInfo` to pack to an 8-byte structure.");
 
+  // A comment, which can be a block of lines. These are tracked separately from
+  // tokens because they don't affect parse; if they were part of tokens, we'd
+  // need more general special-casing within token logic.
+  //
+  // Note that `CommentInfo` is used for an API to expose the comment.
+  struct CommentData {
+    // Zero-based byte offset of the start of the comment within the source
+    // buffer provided.
+    int32_t start;
+
+    // The comment's length in bytes.
+    int32_t length;
+  };
+
   struct LineInfo {
     explicit LineInfo(int32_t start) : start(start), indent(0) {}
 
@@ -458,6 +467,9 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
   llvm::SmallVector<LineInfo> line_infos_;
 
+  // Comments in the file.
+  llvm::SmallVector<CommentData> comments_;
+
   // An upper bound on the number of parse tree nodes that we expect to be
   // created for the tokens in this buffer.
   int expected_max_parse_tree_size_ = 0;