Explore the code

Consolidate token-related range handling to one struct (#5399)

This consolidates Lex::TokenizedBuffer::DumpSemIRRange and
Parse::TreeAndSubtrees::TokenRange into a single InclusiveTokenRange,
also making the OverlapsWithDumpSemIRRange function take the new struct.

I considered switching to `llvm::iterator_range<Lex::TokenIterator>`,
but we often want to check whether the range has exactly one token. Using
`TokenIterator` would add a bunch of offsetting arithmetic to make that
work, which I view as low-value overhead.

For example:

```
  Lex::InclusiveTokenRange token_range = GetSubtreeTokenRange(node_id);
  auto begin_loc = tree_->tokens().TokenToDiagnosticLoc(token_range.begin);
  if (token_range.begin == token_range.end) {
    return begin_loc;
  }
  auto end_loc = tree_->tokens().TokenToDiagnosticLoc(token_range.end);
```

would become:

```
  llvm::iterator_range<Lex::TokenIterator> token_range = GetSubtreeTokenRange(node_id);
  auto begin_loc = tree_->tokens().TokenToDiagnosticLoc(*token_range.begin());
  if (token_range.begin() + 1 == token_range.end()) {
    return begin_loc;
  }
  auto end_loc = tree_->tokens().TokenToDiagnosticLoc(*(token_range.end() - 1));
```

So I'm keeping the bespoke struct.
Jon Ross-Perkins — 1 year ago
parent
commit
1f268b5d8b

+ 1 - 1
toolchain/lex/lex.cpp

@@ -915,7 +915,7 @@ auto Lexer::EndDumpSemIRRange(const char* diag_loc) -> void {
     return;
   }
 
-  buffer_.dump_sem_ir_ranges_.back().end = TokenIndex(buffer_.size());
+  buffer_.dump_sem_ir_ranges_.back().end = TokenIndex(buffer_.size() - 1);
 }
 
 auto Lexer::EndDumpSemIRRangeIfIncomplete(const char* diag_loc) -> void {

+ 11 - 11
toolchain/lex/testdata/dump_sem_ir_range.carbon

@@ -20,7 +20,7 @@ b
 c
 // CHECK:STDOUT:   - { index: 3, kind: "Identifier", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "c", identifier: 2, has_leading_space: true }
 // CHECK:STDOUT:   dump_sem_ir_ranges:
-// CHECK:STDOUT:   - {begin: 1, end: 4}
+// CHECK:STDOUT:   - {begin: 1, end: 3}
 //@dump-sem-ir-end
 
 // --- multi_section.carbon
@@ -42,16 +42,16 @@ d
 e
 // CHECK:STDOUT:   - { index: 5, kind: "Identifier", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "e", identifier: 4, has_leading_space: true }
 // CHECK:STDOUT:   dump_sem_ir_ranges:
-// CHECK:STDOUT:   - {begin: 2, end: 3}
-// CHECK:STDOUT:   - {begin: 4, end: 5}
+// CHECK:STDOUT:   - {begin: 2, end: 2}
+// CHECK:STDOUT:   - {begin: 4, end: 4}
 
 // --- compact.carbon
 // CHECK:STDOUT: - filename: compact.carbon
 // CHECK:STDOUT:   tokens:
 // CHECK:STDOUT:   dump_sem_ir_ranges:
-// CHECK:STDOUT:   - {begin: 1, end: 1}
-// CHECK:STDOUT:   - {begin: 1, end: 1}
-// CHECK:STDOUT:   - {begin: 1, end: 1}
+// CHECK:STDOUT:   - {begin: 1, end: 0}
+// CHECK:STDOUT:   - {begin: 1, end: 0}
+// CHECK:STDOUT:   - {begin: 1, end: 0}
 //@dump-sem-ir-begin
 //@dump-sem-ir-end
 //@dump-sem-ir-begin
@@ -73,7 +73,7 @@ e
 // CHECK:STDOUT: - filename: fail_start_only.carbon
 // CHECK:STDOUT:   tokens:
 // CHECK:STDOUT:   dump_sem_ir_ranges:
-// CHECK:STDOUT:   - {begin: 1, end: 1}
+// CHECK:STDOUT:   - {begin: 1, end: 0}
 
 //@dump-sem-ir-begin
 // CHECK:STDERR: fail_start_only.carbon:[[@LINE+4]]:1: error: missing `//@dump-sem-ir-end` to match `//@dump-sem-ir-begin` [DumpSemIRRangeMissingEnd]
@@ -118,8 +118,8 @@ d
 e
 // CHECK:STDOUT:   - { index: 5, kind: "Identifier", line: {{ *}}[[@LINE-1]], column: 1, indent: 1, spelling: "e", identifier: 4, has_leading_space: true }
 // CHECK:STDOUT:   dump_sem_ir_ranges:
-// CHECK:STDOUT:   - {begin: 2, end: 3}
-// CHECK:STDOUT:   - {begin: 3, end: 4}
+// CHECK:STDOUT:   - {begin: 2, end: 2}
+// CHECK:STDOUT:   - {begin: 3, end: 3}
 
 // --- fail_count_mismatch.carbon
 // CHECK:STDOUT: - filename: fail_count_mismatch.carbon
@@ -137,8 +137,8 @@ c
 d
 // CHECK:STDOUT:   - { index: 4, kind: "Identifier", line: {{ *}}[[@LINE-1]], column:  1, indent: 1, spelling: "d", identifier: 3, has_leading_space: true }
 // CHECK:STDOUT:   dump_sem_ir_ranges:
-// CHECK:STDOUT:   - {begin: 2, end: 3}
-// CHECK:STDOUT:   - {begin: 4, end: 5}
+// CHECK:STDOUT:   - {begin: 2, end: 2}
+// CHECK:STDOUT:   - {begin: 4, end: 4}
 
 // CHECK:STDERR: fail_count_mismatch.carbon:[[@LINE+3]]:17: error: missing `//@dump-sem-ir-end` to match `//@dump-sem-ir-begin` [DumpSemIRRangeMissingEnd]
 // CHECK:STDERR: // CHECK:STDERR:

+ 6 - 9
toolchain/lex/tokenized_buffer.cpp

@@ -440,18 +440,15 @@ auto TokenizedBuffer::TokenToDiagnosticLoc(TokenIndex token) const
   return converted;
 }
 
-auto TokenizedBuffer::OverlapsWithDumpSemIRRange(TokenIndex begin,
-                                                 TokenIndex inclusive_end) const
-    -> bool {
-  if (dump_sem_ir_ranges_.empty()) {
-    return true;
-  }
+auto TokenizedBuffer::OverlapsWithDumpSemIRRange(
+    Lex::InclusiveTokenRange range) const -> bool {
+  CARBON_CHECK(!dump_sem_ir_ranges_.empty());
 
   // Ranges are ordered, so we can decide overlap as soon as we find a range
   // that ends after `begin`.
-  for (auto range : dump_sem_ir_ranges_) {
-    if (range.end > begin) {
-      return range.begin <= inclusive_end;
+  for (auto dump_range : dump_sem_ir_ranges_) {
+    if (dump_range.end >= range.begin) {
+      return dump_range.begin <= range.end;
     }
   }
   return false;

+ 17 - 17
toolchain/lex/tokenized_buffer.h

@@ -60,6 +60,12 @@ using CommentIterator = IndexIterator<CommentIndex>;
 // Random-access iterator over tokens within the buffer.
 using TokenIterator = IndexIterator<TokenIndex>;
 
+// A token range which is inclusive of the begin and end.
+struct InclusiveTokenRange {
+  TokenIndex begin;
+  TokenIndex end;
+};
+
 // A buffer of tokenized Carbon source code.
 //
 // This is constructed by lexing the source code text into a series of tokens.
@@ -180,9 +186,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   auto TokenToDiagnosticLoc(TokenIndex token) const
       -> Diagnostics::ConvertedLoc;
 
-  // Returns true if the given range overlaps with a `DumpSemIRRange`.
-  auto OverlapsWithDumpSemIRRange(TokenIndex begin,
-                                  TokenIndex inclusive_end) const -> bool;
+  // Returns true if the given range overlaps with an entry in
+  // `dump_sem_ir_ranges_`. Must not be called when there are no ranges; query
+  // `has_dump_sem_ir_ranges` first.
+  auto OverlapsWithDumpSemIRRange(Lex::InclusiveTokenRange range) const -> bool;
 
   // Returns true if the buffer has errors that were detected at lexing time.
   auto has_errors() const -> bool { return has_errors_; }
@@ -250,18 +257,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
     const TokenizedBuffer* tokens_;
   };
 
-  // A range of tokens marked by `//@dump-semir-[begin|end]`. The end token is
-  // non-inclusive: [begin, end).
-  //
-  // The particular syntax was chosen because it can be lexed efficiently. It
-  // only occurs in invalid comment strings, so shouldn't slow down lexing of
-  // correct code. It's also comment-like because its presence won't affect
-  // parse/check.
-  struct DumpSemIRRange {
-    TokenIndex begin;
-    TokenIndex end;
-  };
-
   // Converts a pointer into the source to a diagnostic location.
   auto SourcePointerToDiagnosticLoc(const char* loc) const
       -> Diagnostics::ConvertedLoc;
@@ -503,8 +498,13 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   // Comments in the file.
   llvm::SmallVector<CommentData> comments_;
 
-  // Ranges of SemIR to dump.
-  llvm::SmallVector<DumpSemIRRange> dump_sem_ir_ranges_;
+  // A range of tokens marked by `//@dump-semir-[begin|end]`.
+  //
+  // The particular syntax was chosen because it can be lexed efficiently. It
+  // only occurs in invalid comment strings, so shouldn't slow down lexing of
+  // correct code. It's also comment-like because its presence won't affect
+  // parse/check.
+  llvm::SmallVector<InclusiveTokenRange> dump_sem_ir_ranges_;
 
   // An upper bound on the number of parse tree nodes that we expect to be
   // created for the tokens in this buffer.

+ 5 - 4
toolchain/parse/tree_and_subtrees.cpp

@@ -232,9 +232,10 @@ auto TreeAndSubtrees::CollectMemUsage(MemUsage& mem_usage,
                     subtree_sizes_);
 }
 
-auto TreeAndSubtrees::GetSubtreeTokenRange(NodeId node_id) const -> TokenRange {
-  TokenRange range = {.begin = tree_->node_token(node_id),
-                      .end = Lex::TokenIndex::None};
+auto TreeAndSubtrees::GetSubtreeTokenRange(NodeId node_id) const
+    -> Lex::InclusiveTokenRange {
+  Lex::InclusiveTokenRange range = {.begin = tree_->node_token(node_id),
+                                    .end = Lex::TokenIndex::None};
   range.end = range.begin;
   for (NodeId desc : postorder(node_id)) {
     Lex::TokenIndex desc_token = tree_->node_token(desc);
@@ -264,7 +265,7 @@ auto TreeAndSubtrees::NodeToDiagnosticLoc(NodeId node_id, bool token_only) const
 
   // Construct a location that encompasses all tokens that descend from this
   // node (including the root).
-  TokenRange token_range = GetSubtreeTokenRange(node_id);
+  Lex::InclusiveTokenRange token_range = GetSubtreeTokenRange(node_id);
   auto begin_loc = tree_->tokens().TokenToDiagnosticLoc(token_range.begin);
   if (token_range.begin == token_range.end) {
     return begin_loc;

+ 1 - 7
toolchain/parse/tree_and_subtrees.h

@@ -17,12 +17,6 @@ namespace Carbon::Parse {
 // This requires a complete tree.
 class TreeAndSubtrees {
  public:
-  // A range of tokens, returned by GetSubtreeTokenRange.
-  struct TokenRange {
-    Lex::TokenIndex begin;
-    Lex::TokenIndex end;
-  };
-
   class SiblingIterator;
 
   explicit TreeAndSubtrees(const Lex::TokenizedBuffer& tokens,
@@ -115,7 +109,7 @@ class TreeAndSubtrees {
       -> void;
 
   // Returns the range of tokens in the node's subtree.
-  auto GetSubtreeTokenRange(NodeId node_id) const -> TokenRange;
+  auto GetSubtreeTokenRange(NodeId node_id) const -> Lex::InclusiveTokenRange;
 
   // Converts the node to a diagnostic location, covering either the full
   // subtree or only the token.

+ 7 - 7
toolchain/sem_ir/formatter.cpp

@@ -196,8 +196,7 @@ auto Formatter::ShouldFormatEntity(InstId decl_id, bool is_definition_start)
   // the subtree (for example, it can miss modifiers), but finding the earliest
   // token requires walking *all* children, whereas this approach is
   // constant-time.
-  auto begin = sem_ir_->parse_tree().node_token(
-      *tree_and_subtrees.postorder(loc_id.node_id()).begin());
+  auto begin_node_id = *tree_and_subtrees.postorder(loc_id.node_id()).begin();
 
   // Non-defining declarations will be associated with a `Decl` node.
   // Definitions will have a `DefinitionStart` for which we can use the parent
@@ -207,10 +206,11 @@ auto Formatter::ShouldFormatEntity(InstId decl_id, bool is_definition_start)
   if (is_definition_start) {
     end_node_id = node_parents_[end_node_id.index];
   }
-  auto inclusive_end = sem_ir_->parse_tree().node_token(end_node_id);
 
-  return sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(
-      begin, inclusive_end);
+  Lex::InclusiveTokenRange range = {
+      .begin = sem_ir_->parse_tree().node_token(begin_node_id),
+      .end = sem_ir_->parse_tree().node_token(end_node_id)};
+  return sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(range);
 }
 
 auto Formatter::ShouldFormatEntity(const EntityWithParamsBase& entity) -> bool {
@@ -230,8 +230,8 @@ auto Formatter::ShouldFormatInst(InstId inst_id) -> bool {
   }
 
   auto token = sem_ir_->parse_tree().node_token(loc_id.node_id());
-  return sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(token,
-                                                                   token);
+  return sem_ir_->parse_tree().tokens().OverlapsWithDumpSemIRRange(
+      Lex::InclusiveTokenRange{.begin = token, .end = token});
 }
 
 auto Formatter::OpenBrace() -> void {