Ver Fonte

Rewrite the FormatterChunks API (#6784)

This is a refactoring change with no output changes.

The chunk logic already separates the concepts of "nodes with children"
and "nodes with content" in practice, but it's not obvious in the API.
This rewrites the logic to make the separation clearer.

This also subtly takes advantage of the API to avoid creating lots of
empty chunks... Right now, there's always an empty chunk between two
tentative chunks. With this change, it lazily creates a chunk only when
`out()` is used (which it often isn't), which should substantially
reduce the number of chunks created.
Jon Ross-Perkins há 2 meses atrás
pai
commit
3df256cfa9

+ 28 - 17
toolchain/sem_ir/formatter.cpp

@@ -61,30 +61,38 @@ Formatter::Formatter(
       get_tree_and_subtrees_(get_tree_and_subtrees),
       include_ir_in_dumps_(include_ir_in_dumps),
       use_dump_sem_ir_ranges_(use_dump_sem_ir_ranges),
-      // Create a placeholder visible chunk and assign it to all instructions
-      // that don't have a chunk of their own.
-      tentative_inst_chunks_(
-          sem_ir_->insts(),
-          chunks_.AddChunkNoFlush(/*include_in_output=*/true)) {
+      tentative_inst_chunks_(sem_ir_->insts(), FormatterChunks::None) {
   if (use_dump_sem_ir_ranges_) {
     ComputeNodeParents();
   }
 
+  // Reserve space for parents. There will be more content, but we don't try to
+  // guess how much.
+  size_t reserve_chunks = scope_label_chunks_.size();
+  for (auto [_, insts] : GetTentativeScopes(*sem_ir_)) {
+    reserve_chunks += insts.size();
+  }
+  chunks_.Reserve(reserve_chunks);
+
+  // Create parent chunks for scopes.
   for (auto& chunk : scope_label_chunks_) {
-    chunk = chunks_.AddChunkNoFlush(/*include_in_output=*/false);
+    chunk = chunks_.AddParent();
   }
 
-  // Create empty placeholder chunks for instructions that we output lazily.
+  // Create parent chunks for the tentative instructions.
   for (auto [scope_id, insts] : GetTentativeScopes(*sem_ir_)) {
     auto scope_chunk = scope_label_chunks_[static_cast<size_t>(scope_id)];
     for (auto inst_id : insts) {
-      tentative_inst_chunks_.Set(
-          inst_id, chunks_.AddTentativeChunkWithChild(scope_chunk));
+      // Instructions are "parents" of their scopes because if any instruction
+      // is printed, the label is also printed.
+      tentative_inst_chunks_.Set(inst_id, chunks_.AddParent(scope_chunk));
     }
   }
 
-  // Create a real chunk for the start of the output.
-  chunks_.AddChunkNoFlush(/*include_in_output=*/true);
+  CARBON_CHECK(chunks_.size() == reserve_chunks);
+
+  // Prepare to add content.
+  chunks_.StartContent();
 }
 
 auto Formatter::Format() -> void {
@@ -282,7 +290,7 @@ auto Formatter::FormatTopLevelScope(InstNamer::ScopeId scope_id,
   llvm::SaveAndRestore scope(scope_, scope_id);
   auto scope_chunk = scope_label_chunks_[static_cast<size_t>(scope_id)];
 
-  chunks_.FormatTentativeChunkWithParent(scope_chunk, [&] {
+  chunks_.FormatChildContent(scope_chunk, [&] {
     // Note, we don't use OpenBrace() / CloseBrace() here because we always want
     // a newline to avoid misformatting if the first instruction is omitted.
     out() << "\n" << inst_namer_.GetScopeName(scope_id) << " {\n";
@@ -301,17 +309,17 @@ auto Formatter::FormatTopLevelScope(InstNamer::ScopeId scope_id,
 
       FormatInst(inst_id);
       // Include the `file` scope label directly here.
-      chunks_.IncludeChunkInOutput(scope_chunk);
+      chunks_.AppendChildToCurrentParent(scope_chunk);
     } else {
       // Other scopes format each instruction in its own chunk, to support
       // tentative formatting.
-      chunks_.FormatTentativeChunkWithParent(
-          tentative_inst_chunks_.Get(inst_id), [&] { FormatInst(inst_id); });
+      chunks_.FormatChildContent(tentative_inst_chunks_.Get(inst_id),
+                                 [&] { FormatInst(inst_id); });
     }
   }
   indent_ -= 2;
 
-  chunks_.FormatTentativeChunkWithParent(scope_chunk, [&] { out() << "}\n"; });
+  chunks_.FormatChildContent(scope_chunk, [&] { out() << "}\n"; });
 }
 
 auto Formatter::FormatClass(ClassId id, const Class& class_info) -> void {
@@ -1562,7 +1570,10 @@ auto Formatter::FormatName(NameId id) -> void {
 
 auto Formatter::FormatName(InstId id) -> void {
   if (id.has_value()) {
-    chunks_.IncludeChunkInOutput(tentative_inst_chunks_.Get(id));
+    if (auto chunk = tentative_inst_chunks_.Get(id);
+        chunk != FormatterChunks::None) {
+      chunks_.AppendChildToCurrentParent(chunk);
+    }
   }
   out() << inst_namer_.GetNameFor(scope_, id);
 }

+ 68 - 45
toolchain/sem_ir/formatter_chunks.cpp

@@ -8,77 +8,100 @@
 
 namespace Carbon::SemIR {
 
-auto FormatterChunks::FlushChunk() -> void {
-  CARBON_CHECK(output_chunks_.back().chunk.empty());
-  output_chunks_.back().chunk = std::move(buffer_);
-  buffer_.clear();
+auto FormatterChunks::StartContent() -> void {
+  CARBON_CHECK(content_start_id_ == None);
+  content_start_id_ = ChunkId{.index = chunks_.size()};
 }
 
-auto FormatterChunks::AddChunkNoFlush(bool include_in_output) -> ChunkId {
-  CARBON_CHECK(buffer_.empty());
-  output_chunks_.push_back({.include_in_output = include_in_output});
-  return ChunkId{.index = output_chunks_.size() - 1};
+auto FormatterChunks::AddContent(bool include_in_output) -> ChunkId {
+  CARBON_CHECK(content_start_id_ != None, "Must call StartContent first");
+  auto chunk_id = ChunkId{.index = chunks_.size()};
+  chunks_.push_back(
+      {.include_in_output = include_in_output, .data = std::string()});
+  out_ = std::make_unique<llvm::raw_string_ostream>(
+      std::get<std::string>(Get(chunk_id).data));
+  return chunk_id;
 }
 
-auto FormatterChunks::AddChunk(bool include_in_output) -> ChunkId {
-  FlushChunk();
-  return AddChunkNoFlush(include_in_output);
-}
-
-auto FormatterChunks::AddTentativeChunkWithChild(ChunkId child_chunk)
-    -> ChunkId {
-  auto chunk = AddChunkNoFlush(/*include_in_output=*/false);
-  output_chunks_[chunk.index].dependencies.push_back(child_chunk);
-  return chunk;
+auto FormatterChunks::AddParent(ChunkId child_chunk_id) -> ChunkId {
+  CARBON_CHECK(content_start_id_ == None, "Already called StartContent");
+  llvm::SmallVector<ChunkId> children;
+  if (child_chunk_id != None) {
+    children.push_back(child_chunk_id);
+  }
+  auto chunk_id = ChunkId{.index = chunks_.size()};
+  chunks_.push_back({.include_in_output = false, .data = std::move(children)});
+  return chunk_id;
 }
 
-auto FormatterChunks::FormatTentativeChunkWithParent(
-    ChunkId parent_chunk, llvm::function_ref<auto()->void> format) -> void {
-  CARBON_CHECK(output_chunks_.back().include_in_output,
-               "All non-included chunks must be added first.");
+auto FormatterChunks::FormatChildContent(
+    ChunkId parent_chunk_id, llvm::function_ref<auto()->void> format) -> void {
+  CARBON_CHECK(content_start_id_ != None, "Must call StartContent first");
+  CARBON_CHECK(current_parent_id_ == None, "Cannot nest FormatChildContent");
 
-  // If the parent is already included, we don't need to make a chunk.
-  if (output_chunks_[parent_chunk.index].include_in_output) {
+  // We only need to call `AddContent` for non-included content because `out()`
+  // is included by default.
+  if (Get(parent_chunk_id).include_in_output) {
     format();
     return;
   }
 
-  // Otherwise, create a new chunk and include it only if the parent is later
-  // found to be used.
-  auto chunk = AddChunk(false);
-  output_chunks_[parent_chunk.index].dependencies.push_back(chunk);
+  // Otherwise, create a content `Chunk` and include it only if the parent is
+  // later found to be used.
+  AppendChildToParent(AddContent(/*include_in_output=*/false), parent_chunk_id);
+
+  current_parent_id_ = parent_chunk_id;
   format();
-  auto next_chunk = AddChunk(true);
-  CARBON_CHECK(next_chunk.index == chunk.index + 1, "Nested FormatChildChunk");
+  current_parent_id_ = None;
+
+  // Reset the output stream so that the next call to `out()` creates a new
+  // chunk.
+  out_.reset();
 }
 
-auto FormatterChunks::IncludeChunkInOutput(ChunkId chunk) -> void {
-  CARBON_CHECK(chunk.index != output_chunks_.size() - 1,
-               "Should only be called on earlier chunks");
+auto FormatterChunks::AppendChildToParent(ChunkId child_chunk_id,
+                                          ChunkId parent_chunk_id) -> void {
+  CARBON_CHECK(!Get(parent_chunk_id).include_in_output);
+  auto* children =
+      std::get_if<llvm::SmallVector<ChunkId>>(&Get(parent_chunk_id).data);
+  CARBON_CHECK(children);
+  children->push_back(child_chunk_id);
+}
 
-  if (auto& current_chunk = output_chunks_.back();
-      !current_chunk.include_in_output) {
-    current_chunk.dependencies.push_back(chunk);
-    return;
+auto FormatterChunks::AppendChildToCurrentParent(ChunkId child_chunk_id)
+    -> void {
+  if (current_parent_id_ != None) {
+    // If the parent is not included, add the `chunk` to the parent's children
+    // for conditional inclusion.
+    if (!Get(current_parent_id_).include_in_output) {
+      AppendChildToParent(child_chunk_id, current_parent_id_);
+      return;
+    }
   }
 
-  llvm::SmallVector<ChunkId> to_add = {chunk};
-  while (!to_add.empty()) {
-    auto& chunk_ref = output_chunks_[to_add.pop_back_val().index];
+  // If the parent is already included, or there is no parent (this is not
+  // currently a tentative chunk), include the chunk and all of its children.
+  llvm::SmallVector<ChunkId> to_include = {child_chunk_id};
+  while (!to_include.empty()) {
+    auto& chunk_ref = Get(to_include.pop_back_val());
     if (chunk_ref.include_in_output) {
       continue;
     }
     chunk_ref.include_in_output = true;
-    to_add.append(chunk_ref.dependencies);
-    chunk_ref.dependencies.clear();
+    if (auto* children =
+            std::get_if<llvm::SmallVector<ChunkId>>(&chunk_ref.data)) {
+      to_include.append(*children);
+      children->clear();
+    }
   }
 }
 
 auto FormatterChunks::Write(llvm::raw_ostream& stream) -> void {
-  FlushChunk();
-  for (const auto& chunk : output_chunks_) {
+  CARBON_CHECK(content_start_id_ != None);
+  for (const auto& chunk :
+       llvm::ArrayRef(chunks_).drop_front(content_start_id_.index)) {
     if (chunk.include_in_output) {
-      stream << chunk.chunk;
+      stream << std::get<std::string>(chunk.data);
     }
   }
 }

+ 84 - 50
toolchain/sem_ir/formatter_chunks.h

@@ -7,6 +7,7 @@
 
 #include <string>
 
+#include "common/check.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -14,74 +15,107 @@ namespace Carbon::SemIR {
 
 // Manages the chunks created by the formatter.
 //
-// All output of the formatter is stored as `OutputChunk`s. Tentative scopes,
-// such as constants, may have output for instructions made optional; these are
-// stored as `include_in_output=false`, but have dependencies in case they're
-// later referenced. Unreferenced tentative chunks are omitted from the output.
+// There are two kinds of `Chunk`s:
+// - Parent `Chunk`s, with children in a vector.
+// - Content `Chunk`s, with content in a string.
 //
-// Output of the main file scope is always included in output, and causes
-// referenced tentative instructions to be included in output, including
-// indirect dependencies. During this, `Formatter::FormatName` will mark
-// referenced instructions for inclusion.
+// The high level usage is:
+// 1. Calls to `AddParent` to prepare parent `Chunk`s.
+// 2. One call to `StartContent` to switch modes.
+// 3. Calls to `FormatChildContent`, `AppendChildToCurrentParent`, and `out`.
+// 4. Calls to `Write`.
 class FormatterChunks {
  public:
+  // A type-safe index into `chunks_`.
   struct ChunkId {
-    size_t index;
-  };
+    auto operator==(const ChunkId& other) const -> bool = default;
 
-  // A chunk of the buffered output.
-  struct OutputChunk {
-    // Whether this chunk is known to be included in the output.
-    bool include_in_output;
-    // The textual contents of this chunk.
-    std::string chunk = std::string();
-    // Indices in `ouput_chunks_` that should be included in the output if this
-    // one is.
-    llvm::SmallVector<ChunkId> dependencies = {};
+    size_t index;
   };
 
-  // Flushes the buffered output to the current chunk.
-  auto FlushChunk() -> void;
+  // An empty `ChunkId`.
+  static constexpr ChunkId None = ChunkId(-1);
+
+  // Reserves space for at least `count` chunks.
+  auto Reserve(size_t count) -> void { chunks_.reserve(count); }
+
+  // Adds a parent `Chunk` and returns its `ChunkId`. If `child_chunk_id` isn't
+  // `None`, it's added as a child. Must be called before `StartContent`.
+  //
+  // By default the parent `Chunk` will not be included in the output, and
+  // `AppendChildToCurrentParent` must be called to include it.
+  auto AddParent(ChunkId child_chunk_id = None) -> ChunkId;
+
+  // Switches from adding parents to adding content.
+  auto StartContent() -> void;
+
+  // Calls `format` to add content conditionally included when `parent_chunk_id`
+  // is included. Must be called after `StartContent`.
+  //
+  // During a `FormatChildContent` call where the `parent_chunk_id` is not
+  // already included in output, a new content `Chunk` is created, marked as a
+  // child of `parent_chunk_id`, and `out` is temporarily directed to it during
+  // the duration of `format. Otherwise, `out` lazily creates a content `Chunk`
+  // which is always included in output.
+  auto FormatChildContent(ChunkId parent_chunk_id,
+                          llvm::function_ref<auto()->void> format) -> void;
+
+  // Adds `child_chunk_id` to the children of a `FormatChildContent`'s
+  // `parent_chunk_id` if called during `format`, or otherwise includes the
+  // chunk in output.
+  auto AppendChildToCurrentParent(ChunkId child_chunk_id) -> void;
 
-  // Adds a new chunk with `include_in_output`. Does not flush existing output,
-  // so should only be called if there is no buffered output.
-  auto AddChunkNoFlush(bool include_in_output) -> ChunkId;
+  // Writes included chunks to the given stream.
+  auto Write(llvm::raw_ostream& stream) -> void;
 
-  // Flushes the current chunk and add a new chunk with `include_in_output`.
-  auto AddChunk(bool include_in_output) -> ChunkId;
+  // Returns a stream to write to a content `Chunk`. The returned reference is
+  // only valid until the next `Chunk` starts. Must be called after
+  // `StartContent`.
+  //
+  // See `FormatChildContent` for details of the target content `Chunk`.
+  auto out() -> llvm::raw_ostream& {
+    CARBON_CHECK(content_start_id_ != None);
+    if (!out_) {
+      AddContent(/*include_in_output=*/true);
+    }
+    return *out_;
+  }
+
+  auto size() -> size_t { return chunks_.size(); }
 
-  // Adds a new tentative `OutputChunk`. If the new chunk is included in
-  // output, it'll also include `child_chunk`.
-  auto AddTentativeChunkWithChild(ChunkId child_chunk) -> ChunkId;
+ private:
+  // Either a parent or content.
+  struct Chunk {
+    // Whether this chunk is known to be included in the output.
+    bool include_in_output;
 
-  // Adds a new tentative `OutputChunk`. If the `parent_chunk` is included in
-  // output, it'll also include the new chunk. Calls `format` to support adding
-  // content to the new chunk.
-  auto FormatTentativeChunkWithParent(ChunkId parent_chunk,
-                                      llvm::function_ref<auto()->void> format)
-      -> void;
+    // Either children or content.
+    std::variant<llvm::SmallVector<ChunkId>, std::string> data;
+  };
 
-  // Marks the given chunk as being included in the output if the current chunk
-  // is.
-  auto IncludeChunkInOutput(ChunkId chunk) -> void;
+  // Adds a `Chunk` that will have `content`, and directs `out_` to it.
+  auto AddContent(bool include_in_output) -> ChunkId;
 
-  // Writes included chunks to the given stream.
-  auto Write(llvm::raw_ostream& stream) -> void;
+  // Adds `child_chunk_id` to the children of `parent_chunk_id`.
+  auto AppendChildToParent(ChunkId child_chunk_id, ChunkId parent_chunk_id)
+      -> void;
 
-  // Returns stream representing the buffer for the current chunk.
-  auto out() -> llvm::raw_ostream& { return out_; }
+  // Indexes into `chunks_`.
+  auto Get(ChunkId chunk_id) -> Chunk& { return chunks_[chunk_id.index]; }
 
- private:
-  friend struct TentativeOutputScope;
+  // An output stream pointing at the current content `Chunk`.
+  std::unique_ptr<llvm::raw_string_ostream> out_;
 
-  // The output stream buffer.
-  std::string buffer_;
+  // The location where content started. Set by `StartContent`.
+  ChunkId content_start_id_ = None;
 
-  // The output stream.
-  llvm::raw_string_ostream out_{buffer_};
+  // The current parent `Chunk`. This is only set during calls to
+  // `FormatChildContent`.
+  ChunkId current_parent_id_ = None;
 
-  // Chunks of output text that we have created so far.
-  llvm::SmallVector<OutputChunk> output_chunks_;
+  // A sequential ordering of `Chunk`s. This will have all parent `Chunk`s
+  // first, followed by content `Chunk`s at `content_start_`.
+  llvm::SmallVector<Chunk> chunks_;
 };
 
 }  // namespace Carbon::SemIR