Просмотр исходного кода

Key-type customization in `CanonicalValueStore` and `ClangDecl` cleanups (#5743)

The `ClangDecl` struct caused some confusion here -- it is embedding
extra data into a `CanonicalValueStore` that isn't used for lookups or
canonicalization, but is useful to store along side. This changes the
`CanonicalValueStore` to support customized key type for `Lookup` so
that we can provide the more direct API that only takes the relevant
key.

This in turn takes advantage of the support for heterogenous keys in the
underlying `Set` as long as hashing and equality are consistent. We do
need to add support for heterogenous equality comparison with
`clang::Decl*`, but that is fairly easily done now that the
argument-reversed form isn't needed as well.

Lastly, this cleans up the `ClangDecl` customization points to be more
idiomatic by using `operator==` and `CarbonHashValue`. While there, I've
added comments to make it unambiguous why we can use the pointer value
for the underlying `clang::Decl` due to the Clang AST's
address-as-identity model.

Resolves the immediate TODOs around this type.

Future work might involve changing from the current `Add` API to one
more like `Map` and `Set`'s API where a callback is used to create the
object, but that level of API complexity isn't necessarily motivated yet
and can easily be a follow-on if and when its worth doing. The `Add`
code paths *are* working with the `inst_id` in order to create an
instruction if we are importing the Clang declaration. It is the
`Lookup` code paths that never needed to know about the `inst_id` and
became more confusing for having to stub it out in the API.

---------

Co-authored-by: Geoff Romer <gromer@google.com>
Chandler Carruth 10 месяцев назад
Родитель
Сommit
bba037738d
3 измененных файлов с 61 добавлено и 20 удалено
  1. 25 5
      toolchain/base/value_store.h
  2. 4 7
      toolchain/check/import_cpp.cpp
  3. 32 8
      toolchain/sem_ir/clang_decl.h

+ 25 - 5
toolchain/base/value_store.h

@@ -5,6 +5,7 @@
 #ifndef CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_
 #define CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_
 
+#include <concepts>
 #include <memory>
 #include <type_traits>
 #include <utility>
@@ -37,11 +38,17 @@ template <class IdT>
 class ValueStoreRange;
 
 // Common calculation for ValueStore types.
-template <typename IdT, typename ValueT = IdT::ValueType>
+template <typename IdT, typename ValueT = IdT::ValueType,
+          typename KeyT = ValueT>
 class ValueStoreTypes {
  public:
   using ValueType = std::decay_t<ValueT>;
 
+  // TODO: Would be a bit cleaner to not have this here as it's only meaningful
+  // to the `CanonicalValueStore`, not to other `ValueStore`s. Planned to fix
+  // with a larger refactoring.
+  using KeyType = std::decay_t<KeyT>;
+
   // Typically we want to use `ValueType&` and `const ValueType& to avoid
   // copies, but when the value type is a `StringRef`, we assume external
   // storage for the string data and both our value type and ref type will be
@@ -53,6 +60,14 @@ class ValueStoreTypes {
                          llvm::StringRef, const ValueType&>;
 };
 
+// If `IdT` provides a distinct `IdT::KeyType`, default to that for the key
+// type.
+template <typename IdT>
+  requires(!std::same_as<typename IdT::ValueType, typename IdT::KeyType>)
+class ValueStoreTypes<IdT>
+    : public ValueStoreTypes<IdT, typename IdT::ValueType,
+                             typename IdT::KeyType> {};
+
 // A simple wrapper for accumulating values, providing IDs to later retrieve the
 // value. This does not do deduplication.
 //
@@ -221,11 +236,16 @@ class ValueStoreRange {
 // A wrapper for accumulating immutable values with deduplication, providing IDs
 // to later retrieve the value.
 //
-// IdT::ValueType must represent the type being indexed.
+// `IdT::ValueType` must represent the type being indexed.
+//
+// `IdT::KeyType` can optionally be present, and if so is used for the argument
+// to `Lookup`. It must be valid to use both `KeyType` and `ValueType` as lookup
+// types in the underlying `Set`.
 template <typename IdT>
 class CanonicalValueStore {
  public:
   using ValueType = ValueStoreTypes<IdT>::ValueType;
+  using KeyType = ValueStoreTypes<IdT>::KeyType;
   using RefType = ValueStoreTypes<IdT>::RefType;
   using ConstRefType = ValueStoreTypes<IdT>::ConstRefType;
 
@@ -237,7 +257,7 @@ class CanonicalValueStore {
 
   // Looks up the canonical ID for a value, or returns `None` if not in the
   // store.
-  auto Lookup(ValueType value) const -> IdT;
+  auto Lookup(KeyType key) const -> IdT;
 
   // Reserves space.
   auto Reserve(size_t size) -> void;
@@ -290,8 +310,8 @@ auto CanonicalValueStore<IdT>::Add(ValueType value) -> IdT {
 }
 
 template <typename IdT>
-auto CanonicalValueStore<IdT>::Lookup(ValueType value) const -> IdT {
-  if (auto result = set_.Lookup(value, KeyContext(&values_))) {
+auto CanonicalValueStore<IdT>::Lookup(KeyType key) const -> IdT {
+  if (auto result = set_.Lookup(key, KeyContext(&values_))) {
     return result.key();
   }
   return IdT::None;

+ 4 - 7
toolchain/check/import_cpp.cpp

@@ -350,9 +350,8 @@ static auto AsCarbonNamespace(Context& context,
   auto& clang_decls = context.sem_ir().clang_decls();
 
   // Check if the decl context is already mapped to a Carbon namespace.
-  if (auto context_clang_decl_id = clang_decls.Lookup(
-          {.decl = clang::dyn_cast<clang::Decl>(decl_context),
-           .inst_id = SemIR::InstId::None});
+  if (auto context_clang_decl_id =
+          clang_decls.Lookup(clang::dyn_cast<clang::Decl>(decl_context));
       context_clang_decl_id.has_value()) {
     return clang_decls.Get(context_clang_decl_id).inst_id;
   }
@@ -365,8 +364,7 @@ static auto AsCarbonNamespace(Context& context,
     decl_contexts.push_back(decl_context);
     decl_context = decl_context->getParent();
     parent_decl_id =
-        clang_decls.Lookup({.decl = clang::dyn_cast<clang::Decl>(decl_context),
-                            .inst_id = SemIR::InstId::None});
+        clang_decls.Lookup(clang::dyn_cast<clang::Decl>(decl_context));
   } while (!parent_decl_id.has_value());
 
   // We know the parent of the last decl context is mapped, map the rest.
@@ -532,8 +530,7 @@ static auto MapRecordType(Context& context, SemIR::LocId loc_id,
   if (record_decl && !record_decl->isUnion()) {
     auto& clang_decls = context.sem_ir().clang_decls();
     SemIR::InstId record_inst_id = SemIR::InstId::None;
-    if (auto record_clang_decl_id = clang_decls.Lookup(
-            {.decl = record_decl, .inst_id = SemIR::InstId::None});
+    if (auto record_clang_decl_id = clang_decls.Lookup(record_decl);
         record_clang_decl_id.has_value()) {
       record_inst_id = clang_decls.Get(record_clang_decl_id).inst_id;
     } else {

+ 32 - 8
toolchain/sem_ir/clang_decl.h

@@ -20,17 +20,28 @@ class Decl;
 namespace Carbon::SemIR {
 
 // A Clang declaration mapped to a Carbon instruction.
-// Using custom hashing since the declaration is keyed by the `decl` member for
-// lookup.
-// TODO: Avoid custom hashing by either having the data structure support keying
-// or create a dedicated mapping. See
-// https://discord.com/channels/655572317891461132/768530752592805919/1384999468293947537
+//
+// Note that Clang's AST uses address-identity for nodes, which means the
+// pointer is the canonical way to represent a specific AST node and is expected
+// to be sufficient for comparison, hashing, etc.
+//
+// This type is specifically designed for use in a `CanonicalValueStore` and
+// provide a single canonical access from SemIR to each `clang::Decl*` used.
+// This also ensures that a given `clang::Decl*` is associated with exactly one
+// instruction, and the `inst_id` here provides access to that instruction from
+// either the `ClangDeclId` or the `clang::Decl*`.
 struct ClangDecl : public Printable<ClangDecl> {
   auto Print(llvm::raw_ostream& out) const -> void;
 
-  friend auto CarbonHashtableEq(const ClangDecl& lhs, const ClangDecl& rhs)
-      -> bool {
-    return HashtableEq(lhs.decl, rhs.decl);
+  // Equality comparison uses the address-identity property of the Clang AST and
+  // just compares the `decl` pointers. The `inst_id` is always the same due to
+  // the canonicalization.
+  auto operator==(const ClangDecl& rhs) const -> bool {
+    return decl == rhs.decl;
+  }
+  // Support direct comparison with the Clang AST node pointer.
+  auto operator==(const clang::Decl* rhs_decl) const -> bool {
+    return decl == rhs_decl;
   }
 
   // Hashing for ClangDecl. See common/hashing.h.
@@ -45,15 +56,28 @@ struct ClangDecl : public Printable<ClangDecl> {
   clang::Decl* decl = nullptr;
 
   // The instruction the Clang declaration is mapped to.
+  //
+  // This is stored along side the `decl` pointer to avoid having to lookup both
+  // the pointer and the instruction ID in two separate areas of storage.
   InstId inst_id;
 };
 
 // The ID of a `ClangDecl`.
+//
+// These IDs are importantly distinct from the `inst_id` associated with each
+// declaration. These form a dense range of IDs that is used to reference the
+// AST node pointers without storing those pointers directly into SemIR and
+// needing space to hold a full pointer. We can't avoid having these IDs without
+// embedding pointers directly into the storage of SemIR as part of an
+// instruction.
 struct ClangDeclId : public IdBase<ClangDeclId> {
   static constexpr llvm::StringLiteral Label = "clang_decl_id";
 
   using ValueType = ClangDecl;
 
+  // Use the AST node pointer directly when doing `Lookup` to find an ID.
+  using KeyType = clang::Decl*;
+
   using IdBase::IdBase;
 };