Просмотр исходного кода

Use separate value stores for identifiers and string literals (#4106)

This undoes a previous change to unify them, and I think at my advice.
=[ Sorry about that, I think I was just wrong.

Specifically, I think I had suggested that it would be more efficient to
have a single shared hashtable of strings. The more I look at profiles
of the toolchain, the less likely that seems. Specifically for
identifiers and string literals it seems especially problematic.

Using a single, joint hashtable is likely a good idea when all of the
different querying code paths are equally likely, the strings follow the
same distribution of sizes, and either there is no clustering of access
to different sets of strings or none of the sets are meaningfully small
enough to fit into a lower level of resident cache.

I think essentially none of these predicates actually hold for
identifiers vs. string literals:
- Identifiers are *much* more hot
- They have wildly different size distributions.
- The access patterns are very clustered

Sorry for the misleading advice on that one.

While splitting them, I've worked to simplify the code a bit by building
a way to have the `StringRef` holding canonical value stores not require
specializations, and so we get a pretty large code cleanup in the
process here.
Chandler Carruth 1 год назад
Родитель
Сommit
e71e6ca07f

+ 41 - 127
toolchain/base/value_store.h

@@ -88,39 +88,29 @@ struct RealId : public IdBase, public Printable<RealId> {
 };
 constexpr RealId RealId::Invalid(RealId::InvalidIndex);
 
-// Corresponds to a StringRef.
-struct StringId : public IdBase, public Printable<StringId> {
-  using ValueType = std::string;
-  static const StringId Invalid;
-  using IdBase::IdBase;
-  auto Print(llvm::raw_ostream& out) const -> void {
-    out << "str";
-    IdBase::Print(out);
-  }
-};
-constexpr StringId StringId::Invalid(StringId::InvalidIndex);
-
-// Adapts StringId for identifiers.
+// Corresponds to StringRefs for identifiers.
 //
 // `NameId` relies on the values of this type other than `Invalid` all being
 // non-negative.
 struct IdentifierId : public IdBase, public Printable<IdentifierId> {
+  using ValueType = llvm::StringRef;
   static const IdentifierId Invalid;
   using IdBase::IdBase;
   auto Print(llvm::raw_ostream& out) const -> void {
-    out << "strId";
+    out << "identifier";
     IdBase::Print(out);
   }
 };
 constexpr IdentifierId IdentifierId::Invalid(IdentifierId::InvalidIndex);
 
-// Adapts StringId for values of string literals.
+// Corresponds to StringRefs for string literals.
 struct StringLiteralValueId : public IdBase,
                               public Printable<StringLiteralValueId> {
+  using ValueType = llvm::StringRef;
   static const StringLiteralValueId Invalid;
   using IdBase::IdBase;
   auto Print(llvm::raw_ostream& out) const -> void {
-    out << "strLit";
+    out << "string";
     IdBase::Print(out);
   }
 };
@@ -128,6 +118,7 @@ constexpr StringLiteralValueId StringLiteralValueId::Invalid(
     StringLiteralValueId::InvalidIndex);
 
 namespace Internal {
+
 // Used as a parent class for non-printable types. This is just for
 // std::conditional, not as an API.
 class ValueStoreNotPrintable {};
@@ -146,6 +137,16 @@ class ValueStore
  public:
   using ValueType = typename IdT::ValueType;
 
+  // Typically we want to use `ValueType&` and `const ValueType& to avoid
+  // copies, but when the value type is a `StringRef`, we assume external
+  // storage for the string data and both our value type and ref type will be
+  // `StringRef`. This will preclude mutation of the string data.
+  using RefType = std::conditional_t<std::same_as<llvm::StringRef, ValueType>,
+                                     llvm::StringRef, ValueType&>;
+  using ConstRefType =
+      std::conditional_t<std::same_as<llvm::StringRef, ValueType>,
+                         llvm::StringRef, const ValueType&>;
+
   // Stores the value and returns an ID to reference it.
   auto Add(ValueType value) -> IdT {
     IdT id = IdT(values_.size());
@@ -162,13 +163,13 @@ class ValueStore
   }
 
   // Returns a mutable value for an ID.
-  auto Get(IdT id) -> ValueType& {
+  auto Get(IdT id) -> RefType {
     CARBON_CHECK(id.index >= 0) << id;
     return values_[id.index];
   }
 
   // Returns the value for an ID.
-  auto Get(IdT id) const -> const ValueType& {
+  auto Get(IdT id) const -> ConstRefType {
     CARBON_CHECK(id.index >= 0) << id;
     return values_[id.index];
   }
@@ -203,12 +204,18 @@ template <typename IdT>
 class CanonicalValueStore {
  public:
   using ValueType = typename IdT::ValueType;
+  using RefType = typename ValueStore<IdT>::RefType;
+  using ConstRefType = typename ValueStore<IdT>::ConstRefType;
 
   // Stores a canonical copy of the value and returns an ID to reference it.
   auto Add(ValueType value) -> IdT;
 
   // Returns the value for an ID.
-  auto Get(IdT id) const -> const ValueType& { return values_.Get(id); }
+  auto Get(IdT id) const -> ConstRefType { return values_.Get(id); }
+
+  // Looks up the canonical ID for a value, or returns invalid if not in the
+  // store.
+  auto Lookup(ValueType value) const -> IdT;
 
   // Reserves space.
   auto Reserve(size_t size) -> void {
@@ -259,123 +266,30 @@ auto CanonicalValueStore<IdT>::Add(ValueType value) -> IdT {
   return set_.Insert(value, make_key, KeyContext(values_.array_ref())).key();
 }
 
-// Storage for StringRefs. The caller is responsible for ensuring storage is
-// allocated.
-template <>
-class CanonicalValueStore<StringId>
-    : public Yaml::Printable<CanonicalValueStore<StringId>> {
- public:
-  // Returns an ID to reference the value. May return an existing ID if the
-  // string was previously added.
-  auto Add(llvm::StringRef value) -> StringId;
-
-  // Returns the value for an ID.
-  auto Get(StringId id) const -> llvm::StringRef {
-    CARBON_CHECK(id.is_valid());
-    return values_[id.index];
-  }
-
-  // Returns an ID for the value, or Invalid if not found.
-  auto Lookup(llvm::StringRef value) const -> StringId;
-
-  auto OutputYaml() const -> Yaml::OutputMapping {
-    return Yaml::OutputMapping([&](Yaml::OutputMapping::Map map) {
-      for (auto [i, val] : llvm::enumerate(values_)) {
-        map.Add(PrintToString(StringId(i)), val);
-      }
-    });
-  }
-
-  auto size() const -> size_t { return values_.size(); }
-
- private:
-  class KeyContext;
-
-  // Set inline sizes to 0 because these will typically be too large for the
-  // stack, while this does make File smaller.
-  Set<StringId, /*SmallSize=*/0, KeyContext> set_;
-  llvm::SmallVector<llvm::StringRef, 0> values_;
-};
-
-class CanonicalValueStore<StringId>::KeyContext
-    : public TranslatingKeyContext<KeyContext> {
- public:
-  explicit KeyContext(llvm::ArrayRef<llvm::StringRef> values)
-      : values_(values) {}
-
-  auto TranslateKey(StringId id) const -> llvm::StringRef {
-    return values_[id.index];
-  }
-
- private:
-  llvm::ArrayRef<llvm::StringRef> values_;
-};
-
-inline auto CanonicalValueStore<StringId>::Add(llvm::StringRef value)
-    -> StringId {
-  auto make_key = [&] {
-    auto id = static_cast<StringId>(values_.size());
-    CARBON_CHECK(id.index >= 0) << "Too many unique strings";
-    values_.push_back(value);
-    return id;
-  };
-  return set_.Insert(value, make_key, KeyContext(values_)).key();
-}
-
-inline auto CanonicalValueStore<StringId>::Lookup(llvm::StringRef value) const
-    -> StringId {
-  if (auto result = set_.Lookup(value, KeyContext(values_))) {
+template <typename IdT>
+auto CanonicalValueStore<IdT>::Lookup(ValueType value) const -> IdT {
+  if (auto result = set_.Lookup(value, KeyContext(values_.array_ref()))) {
     return result.key();
   }
-  return StringId::Invalid;
+  return IdT::Invalid;
 }
 
-// A thin wrapper around a `ValueStore<StringId>` that provides a different IdT,
-// while using a unified storage for values. This avoids potentially
-// duplicative string hash maps, which are expensive.
-template <typename IdT>
-class StringStoreWrapper : public Printable<StringStoreWrapper<IdT>> {
- public:
-  explicit StringStoreWrapper(CanonicalValueStore<StringId>* values)
-      : values_(values) {}
-
-  auto Add(llvm::StringRef value) -> IdT {
-    return IdT(values_->Add(value).index);
-  }
-
-  auto Get(IdT id) const -> llvm::StringRef {
-    return values_->Get(StringId(id.index));
-  }
-
-  auto Lookup(llvm::StringRef value) const -> IdT {
-    return IdT(values_->Lookup(value).index);
-  }
-
-  auto Print(llvm::raw_ostream& out) const -> void { out << *values_; }
-
-  auto size() const -> size_t { return values_->size(); }
-
- private:
-  CanonicalValueStore<StringId>* values_;
-};
-
 using FloatValueStore = CanonicalValueStore<FloatId>;
 
 // Stores that will be used across compiler phases for a given compilation unit.
 // This is provided mainly so that they don't need to be passed separately.
 class SharedValueStores : public Yaml::Printable<SharedValueStores> {
  public:
-  explicit SharedValueStores()
-      : identifiers_(&strings_), string_literal_values_(&strings_) {}
+  explicit SharedValueStores() = default;
 
   // Not copyable or movable.
   SharedValueStores(const SharedValueStores&) = delete;
   auto operator=(const SharedValueStores&) -> SharedValueStores& = delete;
 
-  auto identifiers() -> StringStoreWrapper<IdentifierId>& {
+  auto identifiers() -> CanonicalValueStore<IdentifierId>& {
     return identifiers_;
   }
-  auto identifiers() const -> const StringStoreWrapper<IdentifierId>& {
+  auto identifiers() const -> const CanonicalValueStore<IdentifierId>& {
     return identifiers_;
   }
   auto ints() -> CanonicalValueStore<IntId>& { return ints_; }
@@ -384,12 +298,12 @@ class SharedValueStores : public Yaml::Printable<SharedValueStores> {
   auto reals() const -> const ValueStore<RealId>& { return reals_; }
   auto floats() -> FloatValueStore& { return floats_; }
   auto floats() const -> const FloatValueStore& { return floats_; }
-  auto string_literal_values() -> StringStoreWrapper<StringLiteralValueId>& {
-    return string_literal_values_;
+  auto string_literal_values() -> CanonicalValueStore<StringLiteralValueId>& {
+    return string_literals_;
   }
   auto string_literal_values() const
-      -> const StringStoreWrapper<StringLiteralValueId>& {
-    return string_literal_values_;
+      -> const CanonicalValueStore<StringLiteralValueId>& {
+    return string_literals_;
   }
 
   auto OutputYaml(std::optional<llvm::StringRef> filename = std::nullopt) const
@@ -402,7 +316,8 @@ class SharedValueStores : public Yaml::Printable<SharedValueStores> {
               Yaml::OutputMapping([&](Yaml::OutputMapping::Map map) {
                 map.Add("ints", ints_.OutputYaml());
                 map.Add("reals", reals_.OutputYaml());
-                map.Add("strings", strings_.OutputYaml());
+                map.Add("identifiers", identifiers_.OutputYaml());
+                map.Add("strings", string_literals_.OutputYaml());
               }));
     });
   }
@@ -412,9 +327,8 @@ class SharedValueStores : public Yaml::Printable<SharedValueStores> {
   ValueStore<RealId> reals_;
   FloatValueStore floats_;
 
-  CanonicalValueStore<StringId> strings_;
-  StringStoreWrapper<IdentifierId> identifiers_;
-  StringStoreWrapper<StringLiteralValueId> string_literal_values_;
+  CanonicalValueStore<IdentifierId> identifiers_;
+  CanonicalValueStore<StringLiteralValueId> string_literals_;
 };
 
 }  // namespace Carbon

+ 37 - 14
toolchain/base/value_store_test.cpp

@@ -77,34 +77,55 @@ TEST(ValueStore, Float) {
               Eq(llvm::APFloatBase::cmpEqual));
 }
 
-TEST(ValueStore, String) {
+TEST(ValueStore, Identifiers) {
   std::string a = "a";
   std::string b = "b";
   SharedValueStores value_stores;
+
   auto a_id = value_stores.identifiers().Add(a);
-  auto b_id = value_stores.string_literal_values().Add(b);
+  auto b_id = value_stores.identifiers().Add(b);
 
   ASSERT_TRUE(a_id.is_valid());
   ASSERT_TRUE(b_id.is_valid());
+  EXPECT_THAT(a_id, Not(Eq(b_id)));
 
-  EXPECT_THAT(a_id.index, Not(Eq(b_id.index)));
   EXPECT_THAT(value_stores.identifiers().Get(a_id), Eq(a));
+  EXPECT_THAT(value_stores.identifiers().Get(b_id), Eq(b));
+
+  EXPECT_THAT(value_stores.identifiers().Lookup(a), Eq(a_id));
+  EXPECT_THAT(value_stores.identifiers().Lookup("c"),
+              Eq(IdentifierId::Invalid));
+}
+
+TEST(ValueStore, StringLiterals) {
+  std::string a = "a";
+  std::string b = "b";
+  SharedValueStores value_stores;
+
+  auto a_id = value_stores.string_literal_values().Add(a);
+  auto b_id = value_stores.string_literal_values().Add(b);
+
+  ASSERT_TRUE(a_id.is_valid());
+  ASSERT_TRUE(b_id.is_valid());
+  EXPECT_THAT(a_id, Not(Eq(b_id)));
+
+  EXPECT_THAT(value_stores.string_literal_values().Get(a_id), Eq(a));
   EXPECT_THAT(value_stores.string_literal_values().Get(b_id), Eq(b));
 
-  // Adding the same string again, even with a different Id type, should return
-  // the same id.
-  EXPECT_THAT(value_stores.string_literal_values().Add(a).index,
-              Eq(a_id.index));
-  EXPECT_THAT(value_stores.identifiers().Add(b).index, Eq(b_id.index));
+  EXPECT_THAT(value_stores.string_literal_values().Lookup(a), Eq(a_id));
+  EXPECT_THAT(value_stores.string_literal_values().Lookup("c"),
+              Eq(StringLiteralValueId::Invalid));
 }
 
 auto MatchSharedValues(testing::Matcher<Yaml::MappingValue> ints,
                        testing::Matcher<Yaml::MappingValue> reals,
+                       testing::Matcher<Yaml::MappingValue> identifiers,
                        testing::Matcher<Yaml::MappingValue> strings) -> auto {
   return Yaml::IsYaml(Yaml::Sequence(ElementsAre(Yaml::Mapping(ElementsAre(Pair(
       "shared_values",
       Yaml::Mapping(ElementsAre(Pair("ints", Yaml::Mapping(ints)),
                                 Pair("reals", Yaml::Mapping(reals)),
+                                Pair("identifiers", Yaml::Mapping(identifiers)),
                                 Pair("strings", Yaml::Mapping(strings))))))))));
 }
 
@@ -113,7 +134,7 @@ TEST(ValueStore, PrintEmpty) {
   TestRawOstream out;
   value_stores.Print(out);
   EXPECT_THAT(Yaml::Value::FromText(out.TakeStr()),
-              MatchSharedValues(IsEmpty(), IsEmpty(), IsEmpty()));
+              MatchSharedValues(IsEmpty(), IsEmpty(), IsEmpty(), IsEmpty()));
 }
 
 TEST(ValueStore, PrintVals) {
@@ -122,15 +143,17 @@ TEST(ValueStore, PrintVals) {
   value_stores.ints().Add(apint);
   value_stores.reals().Add(
       Real{.mantissa = apint, .exponent = apint, .is_decimal = true});
+  value_stores.identifiers().Add("a");
   value_stores.string_literal_values().Add("foo'\"baz");
   TestRawOstream out;
   value_stores.Print(out);
 
-  EXPECT_THAT(
-      Yaml::Value::FromText(out.TakeStr()),
-      MatchSharedValues(ElementsAre(Pair("int0", Yaml::Scalar("8"))),
-                        ElementsAre(Pair("real0", Yaml::Scalar("8*10^8"))),
-                        ElementsAre(Pair("str0", Yaml::Scalar("foo'\"baz")))));
+  EXPECT_THAT(Yaml::Value::FromText(out.TakeStr()),
+              MatchSharedValues(
+                  ElementsAre(Pair("int0", Yaml::Scalar("8"))),
+                  ElementsAre(Pair("real0", Yaml::Scalar("8*10^8"))),
+                  ElementsAre(Pair("identifier0", Yaml::Scalar("a"))),
+                  ElementsAre(Pair("string0", Yaml::Scalar("foo'\"baz")))));
 }
 
 }  // namespace

+ 2 - 2
toolchain/check/context.h

@@ -374,13 +374,13 @@ class Context {
 
   // Directly expose SemIR::File data accessors for brevity in calls.
 
-  auto identifiers() -> StringStoreWrapper<IdentifierId>& {
+  auto identifiers() -> CanonicalValueStore<IdentifierId>& {
     return sem_ir().identifiers();
   }
   auto ints() -> CanonicalValueStore<IntId>& { return sem_ir().ints(); }
   auto reals() -> ValueStore<RealId>& { return sem_ir().reals(); }
   auto floats() -> FloatValueStore& { return sem_ir().floats(); }
-  auto string_literal_values() -> StringStoreWrapper<StringLiteralValueId>& {
+  auto string_literal_values() -> CanonicalValueStore<StringLiteralValueId>& {
     return sem_ir().string_literal_values();
   }
   auto bind_names() -> SemIR::BindNameStore& { return sem_ir().bind_names(); }

+ 1 - 1
toolchain/check/lexical_lookup.h

@@ -37,7 +37,7 @@ class LexicalLookup {
     SemIR::InstId inst_id;
   };
 
-  explicit LexicalLookup(const StringStoreWrapper<IdentifierId>& identifiers)
+  explicit LexicalLookup(const CanonicalValueStore<IdentifierId>& identifiers)
       : lookup_(identifiers.size() + SemIR::NameId::NonIndexValueCount) {}
 
   // Returns the lexical lookup results for a name.

+ 1 - 1
toolchain/check/scope_stack.h

@@ -18,7 +18,7 @@ namespace Carbon::Check {
 // checking within.
 class ScopeStack {
  public:
-  explicit ScopeStack(const StringStoreWrapper<IdentifierId>& identifiers)
+  explicit ScopeStack(const CanonicalValueStore<IdentifierId>& identifiers)
       : lexical_lookup_(identifiers) {}
 
   // A scope in which `break` and `continue` can be used.

+ 10 - 9
toolchain/driver/testdata/dump_shared_values.carbon

@@ -31,14 +31,15 @@ var str2: String = "ab'\"c";
 // CHECK:STDOUT:     real0:           10*10^-1
 // CHECK:STDOUT:     real1:           8*10^7
 // CHECK:STDOUT:     real2:           8*10^8
+// CHECK:STDOUT:   identifiers:
+// CHECK:STDOUT:     identifier0:     int1
+// CHECK:STDOUT:     identifier1:     int2
+// CHECK:STDOUT:     identifier2:     real1
+// CHECK:STDOUT:     identifier3:     real2
+// CHECK:STDOUT:     identifier4:     real3
+// CHECK:STDOUT:     identifier5:     str1
+// CHECK:STDOUT:     identifier6:     str2
 // CHECK:STDOUT:   strings:
-// CHECK:STDOUT:     str0:            int1
-// CHECK:STDOUT:     str1:            int2
-// CHECK:STDOUT:     str2:            real1
-// CHECK:STDOUT:     str3:            real2
-// CHECK:STDOUT:     str4:            real3
-// CHECK:STDOUT:     str5:            str1
-// CHECK:STDOUT:     str6:            abc
-// CHECK:STDOUT:     str7:            str2
-// CHECK:STDOUT:     str8:            'ab''"c'
+// CHECK:STDOUT:     string0:         abc
+// CHECK:STDOUT:     string1:         ab'"c
 // CHECK:STDOUT: ...

+ 4 - 4
toolchain/sem_ir/file.h

@@ -91,10 +91,10 @@ class File : public Printable<File> {
   auto check_ir_id() const -> CheckIRId { return check_ir_id_; }
 
   // Directly expose SharedValueStores members.
-  auto identifiers() -> StringStoreWrapper<IdentifierId>& {
+  auto identifiers() -> CanonicalValueStore<IdentifierId>& {
     return value_stores_->identifiers();
   }
-  auto identifiers() const -> const StringStoreWrapper<IdentifierId>& {
+  auto identifiers() const -> const CanonicalValueStore<IdentifierId>& {
     return value_stores_->identifiers();
   }
   auto ints() -> CanonicalValueStore<IntId>& { return value_stores_->ints(); }
@@ -109,11 +109,11 @@ class File : public Printable<File> {
   auto floats() const -> const FloatValueStore& {
     return value_stores_->floats();
   }
-  auto string_literal_values() -> StringStoreWrapper<StringLiteralValueId>& {
+  auto string_literal_values() -> CanonicalValueStore<StringLiteralValueId>& {
     return value_stores_->string_literal_values();
   }
   auto string_literal_values() const
-      -> const StringStoreWrapper<StringLiteralValueId>& {
+      -> const CanonicalValueStore<StringLiteralValueId>& {
     return value_stores_->string_literal_values();
   }
 

+ 3 - 2
toolchain/sem_ir/name.h

@@ -23,7 +23,8 @@ namespace Carbon::SemIR {
 // currently a wrapper around an identifier store that has no state of its own.
 class NameStoreWrapper {
  public:
-  explicit NameStoreWrapper(const StringStoreWrapper<IdentifierId>* identifiers)
+  explicit NameStoreWrapper(
+      const CanonicalValueStore<IdentifierId>* identifiers)
       : identifiers_(identifiers) {}
 
   // Returns the requested name as a string, if it is an identifier name. This
@@ -48,7 +49,7 @@ class NameStoreWrapper {
   auto GetIRBaseName(NameId name_id) const -> llvm::StringRef;
 
  private:
-  const StringStoreWrapper<IdentifierId>* identifiers_;
+  const CanonicalValueStore<IdentifierId>* identifiers_;
 };
 
 }  // namespace Carbon::SemIR