Procházet zdrojové kódy

Switch to a constant-time approach for dump ranges, tracking node parents (#5394)

- Adds parent information as a single-pass calculation
  - Moves off `GetSubtreeTokenRange` because it's O(N)
- Stops using the node's subtree when formatting a single instruction
- This excludes `%F.call` and the following `return`, for example,
because only parameters are marked for formatting
Jon Ross-Perkins před 1 rokem
rodič
revize
6469f67b14

+ 0 - 6
toolchain/check/testdata/basics/no_prelude/dump_sem_ir_range.carbon

@@ -136,8 +136,6 @@ fn G() {
 // CHECK:STDOUT:
 // CHECK:STDOUT: class @A {
 // CHECK:STDOUT:   %G.decl: %G.type = fn_decl @G [concrete = constants.%G] {} {}
-// CHECK:STDOUT:   %empty_struct_type: type = struct_type {} [concrete = constants.%empty_struct_type]
-// CHECK:STDOUT:   %complete_type: <witness> = complete_type_witness %empty_struct_type [concrete = constants.%complete_type]
 // CHECK:STDOUT:   complete_type_witness = %complete_type
 // CHECK:STDOUT:
 // CHECK:STDOUT: !members:
@@ -148,8 +146,6 @@ fn G() {
 // CHECK:STDOUT:
 // CHECK:STDOUT: class @C {
 // CHECK:STDOUT:   %I.decl: %I.type = fn_decl @I [concrete = constants.%I] {} {}
-// CHECK:STDOUT:   %empty_struct_type: type = struct_type {} [concrete = constants.%empty_struct_type]
-// CHECK:STDOUT:   %complete_type: <witness> = complete_type_witness %empty_struct_type [concrete = constants.%complete_type]
 // CHECK:STDOUT:   complete_type_witness = %complete_type
 // CHECK:STDOUT:
 // CHECK:STDOUT: !members:
@@ -183,7 +179,5 @@ fn G() {
 // CHECK:STDOUT:   %.loc17_7.2: ref %empty_tuple.type = temporary %.loc17_7.1, %C.call
 // CHECK:STDOUT:   %tuple.loc17: %empty_tuple.type = tuple_value () [concrete = constants.%empty_tuple]
 // CHECK:STDOUT:   %.loc17_7.3: %empty_tuple.type = converted %C.call, %tuple.loc17 [concrete = constants.%empty_tuple]
-// CHECK:STDOUT:   %F.call: init %empty_tuple.type = call %F.ref(%.loc13_7.3, %.loc15_7.3, %.loc17_7.3)
-// CHECK:STDOUT:   return
 // CHECK:STDOUT: }
 // CHECK:STDOUT:

+ 66 - 44
toolchain/sem_ir/formatter.cpp

@@ -44,6 +44,10 @@ Formatter::Formatter(const File* sem_ir,
   auto first_chunk = AddChunkNoFlush(true);
   tentative_inst_chunks_.resize(sem_ir_->insts().size(), first_chunk);
 
+  if (sem_ir_->parse_tree().tokens().has_dump_sem_ir_ranges()) {
+    ComputeNodeParents();
+  }
+
   // Create empty placeholder chunks for instructions that we output lazily.
   for (auto lazy_insts :
        {sem_ir_->constants().array_ref(),
@@ -107,6 +111,16 @@ auto Formatter::Format() -> void {
   out_ << "\n";
 }
 
+auto Formatter::ComputeNodeParents() -> void {
+  CARBON_CHECK(node_parents_.empty());
+  node_parents_.resize(sem_ir_->parse_tree().size(), Parse::NodeId::None);
+  for (auto n : sem_ir_->parse_tree().postorder()) {
+    for (auto child : get_tree_and_subtrees_().children(n)) {
+      node_parents_[child.index] = n;
+    }
+  }
+}
+
 auto Formatter::Write(llvm::raw_ostream& out) -> void {
   FlushChunk();
   for (const auto& chunk : output_chunks_) {
@@ -156,61 +170,68 @@ auto Formatter::IncludeChunkInOutput(size_t chunk) -> void {
   }
 }
 
-auto Formatter::OverlapsWithDumpSemIRRange(
-    InstId inst_id, llvm::ArrayRef<InstBlockId> body_block_ids) -> bool {
+auto Formatter::ShouldFormatEntity(InstId decl_id, bool is_definition_start)
+    -> bool {
+  if (!decl_id.has_value()) {
+    return true;
+  }
+  if (!should_format_entity_(decl_id)) {
+    return false;
+  }
+
   if (!sem_ir_->parse_tree().tokens().has_dump_sem_ir_ranges()) {
     return true;
   }
 
-  auto loc_id = sem_ir_->insts().GetCanonicalLocId(inst_id);
+  // When there are dump ranges, ignore imported instructions.
+  auto loc_id = sem_ir_->insts().GetCanonicalLocId(decl_id);
   if (loc_id.kind() != LocId::Kind::NodeId) {
     return false;
   }
 
-  // For the declaration, we use the helper for checking the full range.
-  auto token_range =
-      get_tree_and_subtrees_().GetSubtreeTokenRange(loc_id.node_id());
-  if (sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(
-          token_range.begin, token_range.end)) {
-    return true;
-  }
+  const auto& tree_and_subtrees = get_tree_and_subtrees_();
 
-  // If the declaration wasn't in scope, we need to check the body.
-  // TODO: We currently don't track the definition end, so this checks all
-  // instructions in the body. Maybe we should start tracking definition end
-  // nodes on entities?
-  for (auto body_block_id : body_block_ids) {
-    auto block = sem_ir_->inst_blocks().GetOrEmpty(body_block_id);
-    for (auto inst_id : block) {
-      auto loc_id = sem_ir_->insts().GetCanonicalLocId(inst_id);
-      if (loc_id.kind() == LocId::Kind::NodeId) {
-        auto token = sem_ir_->parse_tree().node_token(loc_id.node_id());
-        if (sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(token,
-                                                                      token)) {
-          return true;
-        }
-      }
-    }
+  // This takes the earliest token from either the node or its first postorder
+  // child. The first postorder child isn't necessarily the earliest token in
+  // the subtree (for example, it can miss modifiers), but finding the earliest
+  // token requires walking *all* children, whereas this approach is
+  // constant-time.
+  auto begin = sem_ir_->parse_tree().node_token(
+      *tree_and_subtrees.postorder(loc_id.node_id()).begin());
+
+  // Non-defining declarations will be associated with a `Decl` node.
+  // Definitions will have a `DefinitionStart` for which we can use the parent
+  // to find the `Definition`, giving a range that includes the definition's
+  // body.
+  auto end_node_id = loc_id.node_id();
+  if (is_definition_start) {
+    end_node_id = node_parents_[end_node_id.index];
   }
-  return false;
+  auto inclusive_end = sem_ir_->parse_tree().node_token(end_node_id);
+
+  return sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(
+      begin, inclusive_end);
 }
 
-auto Formatter::ShouldFormatEntity(InstId decl_id,
-                                   llvm::ArrayRef<InstBlockId> body_block_ids)
-    -> bool {
-  if (!decl_id.has_value()) {
+auto Formatter::ShouldFormatEntity(const EntityWithParamsBase& entity) -> bool {
+  return ShouldFormatEntity(entity.latest_decl_id(),
+                            entity.definition_id.has_value());
+}
+
+auto Formatter::ShouldFormatInst(InstId inst_id) -> bool {
+  if (!sem_ir_->parse_tree().tokens().has_dump_sem_ir_ranges()) {
     return true;
   }
-  if (!should_format_entity_(decl_id)) {
+
+  // When there are dump ranges, ignore imported instructions.
+  auto loc_id = sem_ir_->insts().GetCanonicalLocId(inst_id);
+  if (loc_id.kind() != LocId::Kind::NodeId) {
     return false;
   }
-  return OverlapsWithDumpSemIRRange(decl_id, body_block_ids);
-}
 
-auto Formatter::ShouldFormatEntity(const EntityWithParamsBase& entity,
-                                   llvm::ArrayRef<InstBlockId> body_block_ids)
-    -> bool {
-  return ShouldFormatEntity(entity.latest_decl_id(), body_block_ids);
+  auto token = sem_ir_->parse_tree().node_token(loc_id.node_id());
+  return sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(token,
+                                                                   token);
 }
 
 auto Formatter::OpenBrace() -> void {
@@ -277,7 +298,7 @@ auto Formatter::FormatScopeIfUsed(InstNamer::ScopeId scope_id,
 
 auto Formatter::FormatClass(ClassId id) -> void {
   const Class& class_info = sem_ir_->classes().Get(id);
-  if (!ShouldFormatEntity(class_info, class_info.body_block_id)) {
+  if (!ShouldFormatEntity(class_info)) {
     return;
   }
 
@@ -306,7 +327,7 @@ auto Formatter::FormatClass(ClassId id) -> void {
 
 auto Formatter::FormatInterface(InterfaceId id) -> void {
   const Interface& interface_info = sem_ir_->interfaces().Get(id);
-  if (!ShouldFormatEntity(interface_info, interface_info.body_block_id)) {
+  if (!ShouldFormatEntity(interface_info)) {
     return;
   }
 
@@ -342,7 +363,8 @@ auto Formatter::FormatInterface(InterfaceId id) -> void {
 auto Formatter::FormatAssociatedConstant(AssociatedConstantId id) -> void {
   const AssociatedConstant& assoc_const =
       sem_ir_->associated_constants().Get(id);
-  if (!ShouldFormatEntity(assoc_const.decl_id, /*body_block_ids=*/{})) {
+  if (!ShouldFormatEntity(assoc_const.decl_id,
+                          /*is_definition_start=*/false)) {
     return;
   }
 
@@ -366,7 +388,7 @@ auto Formatter::FormatAssociatedConstant(AssociatedConstantId id) -> void {
 
 auto Formatter::FormatImpl(ImplId id) -> void {
   const Impl& impl_info = sem_ir_->impls().Get(id);
-  if (!ShouldFormatEntity(impl_info, impl_info.body_block_id)) {
+  if (!ShouldFormatEntity(impl_info)) {
     return;
   }
 
@@ -408,7 +430,7 @@ auto Formatter::FormatImpl(ImplId id) -> void {
 
 auto Formatter::FormatFunction(FunctionId id) -> void {
   const Function& fn = sem_ir_->functions().Get(id);
-  if (!ShouldFormatEntity(fn, fn.body_block_ids)) {
+  if (!ShouldFormatEntity(fn)) {
     return;
   }
 
@@ -718,7 +740,7 @@ auto Formatter::FormatInst(InstId inst_id, ImportRefUnloaded inst) -> void {
 }
 
 auto Formatter::FormatInst(InstId inst_id) -> void {
-  if (!OverlapsWithDumpSemIRRange(inst_id, /*body_block_ids=*/{})) {
+  if (!ShouldFormatInst(inst_id)) {
     return;
   }
 

+ 16 - 11
toolchain/sem_ir/formatter.h

@@ -71,6 +71,10 @@ class Formatter {
     size_t index;
   };
 
+  // Fills `node_parents_` with parent information. Called at most once during
+  // construction.
+  auto ComputeNodeParents() -> void;
+
   // Flushes the buffered output to the current chunk.
   auto FlushChunk() -> void;
 
@@ -85,19 +89,16 @@ class Formatter {
   // is.
   auto IncludeChunkInOutput(size_t chunk) -> void;
 
-  // Returns true if the node subtree for the instruction or body overlaps with
-  // a dump range, or if there are no ranges.
-  auto OverlapsWithDumpSemIRRange(InstId inst_id,
-                                  llvm::ArrayRef<InstBlockId> body_block_ids)
-      -> bool;
-
   // Determines whether the specified entity should be included in the formatted
-  // output.
-  auto ShouldFormatEntity(InstId decl_id,
-                          llvm::ArrayRef<InstBlockId> body_block_ids) -> bool;
+  // output. `is_definition_start` should indicate whether, if `decl_id`'s
+  // `LocId` is a `NodeId`, it is expected to be a `DefinitionStart` kind.
+  auto ShouldFormatEntity(InstId decl_id, bool is_definition_start) -> bool;
 
-  auto ShouldFormatEntity(const EntityWithParamsBase& entity,
-                          llvm::ArrayRef<InstBlockId> body_block_ids) -> bool;
+  auto ShouldFormatEntity(const EntityWithParamsBase& entity) -> bool;
+
+  // Determines whether a single instruction should be included in the
+  // formatted output.
+  auto ShouldFormatInst(InstId inst_id) -> bool;
 
   // Begins a braced block. Writes an open brace, and prepares to insert a
   // newline after it if the braced block is non-empty.
@@ -369,6 +370,10 @@ class Formatter {
   // referenced, indexed by the instruction's index. This is resized in advance
   // to the correct size.
   llvm::SmallVector<size_t, 0> tentative_inst_chunks_;
+
+  // Maps nodes to their parents. Only set when dump ranges are in use, because
+  // the parents aren't used otherwise.
+  llvm::SmallVector<Parse::NodeId> node_parents_;
 };
 
 template <typename IdT>