
Reserve memory for the identifiers hashtable. (#4107)

This uses a heuristic reserve to greatly reduce growth of the identifiers
hashtable. The hashtable's design is optimized for compact memory use and
is especially slow to grow, so reserving up front has an outsized impact.

The heuristic was computed using `scripts/source_stats.py` and looking
at C++ codebases. We may want to periodically re-evaluate it as Carbon
code emerges and we have better data on its distributions of tokens.

This also required fixing the `Reserve` method on `CanonicalValueStore`,
which wasn't used anywhere and so didn't even compile correctly. I added a
call to it in the relevant unit test so that it is at least compiled close
to its definition.
Chandler Carruth committed 1 year ago
Commit 44c85e0872
3 files changed, 62 insertions(+), 8 deletions(-)
  1. toolchain/base/value_store.h (+12 −8)
  2. toolchain/base/value_store_test.cpp (+3 −0)
  3. toolchain/lex/lex.cpp (+47 −0)

toolchain/base/value_store.h (+12 −8)

@@ -225,14 +225,7 @@ class CanonicalValueStore {
   auto Lookup(ValueType value) const -> IdT;
 
   // Reserves space.
-  auto Reserve(size_t size) -> void {
-    // Compute the resulting new insert count using the size of values -- the
-    // set doesn't have a fast to compute current size.
-    if (size > values_.size()) {
-      set_.GrowForInsertCount(size - values_.size(), KeyContext(values_));
-    }
-    values_.Reserve(size);
-  }
+  auto Reserve(size_t size) -> void;
 
   // These are to support printable structures, and are not guaranteed.
   auto OutputYaml() const -> Yaml::OutputMapping {
@@ -290,6 +283,17 @@ auto CanonicalValueStore<IdT>::Lookup(ValueType value) const -> IdT {
   return IdT::Invalid;
 }
 
+template <typename IdT>
+auto CanonicalValueStore<IdT>::Reserve(size_t size) -> void {
+  // Compute the resulting new insert count using the size of values -- the
+  // set doesn't have a fast to compute current size.
+  if (size > values_.size()) {
+    set_.GrowForInsertCount(size - values_.size(),
+                            KeyContext(values_.array_ref()));
+  }
+  values_.Reserve(size);
+}
+
 using FloatValueStore = CanonicalValueStore<FloatId>;
 
 // Stores that will be used across compiler phases for a given compilation unit.

toolchain/base/value_store_test.cpp (+3 −0)

@@ -82,6 +82,9 @@ TEST(ValueStore, Identifiers) {
   std::string b = "b";
   SharedValueStores value_stores;
 
+  // Make sure reserve works, we use it with identifiers.
+  value_stores.identifiers().Reserve(100);
+
   auto a_id = value_stores.identifiers().Add(a);
   auto b_id = value_stores.identifiers().Add(b);
 

toolchain/lex/lex.cpp (+47 −0)

@@ -625,12 +625,59 @@ static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
   lexer.LexFileEnd(source_text, position);
 }
 
+// Estimate an upper bound on the number of identifiers we will need to lex.
+//
+// When analyzing both Carbon and LLVM's C++ code, we have found a roughly
+// normal distribution of unique identifiers in the file centered at 0.5 *
+// lines, and in the vast majority of cases bounded below 1.0 * lines. For
+// example, here is LLVM's distribution computed with `scripts/source_stats.py`
+// and rendered in an ASCII-art histogram:
+//
+//   ## Unique IDs per 10 lines ## (median: 5, p90: 8, p95: 9, p99: 14)
+//   1 ids   [  29]  ▍
+//   2 ids   [ 282]  ███▊
+//   3 ids   [1492]  ███████████████████▉
+//   4 ids   [2674]  ███████████████████████████████████▌
+//   5 ids   [3011]  ████████████████████████████████████████
+//   6 ids   [2267]  ██████████████████████████████▏
+//   7 ids   [1549]  ████████████████████▋
+//   8 ids   [ 817]  ██████████▉
+//   9 ids   [ 301]  ████
+//   10 ids  [  98]  █▎
+//
+//   (Trimmed to only cover 1 - 10 unique IDs per 10 lines of code, 272 files
+//    with more unique IDs in the tail.)
+//
+// We have checked this distribution with several large codebases (currently
+// those at Google, happy to cross check with others) that use a similar coding
+// style, and it appears to be very consistent. However, we suspect it may be
+// dependent on the column width style. Currently, Carbon's toolchain style
+// specifies 80-columns, but if we expect the lexer to routinely see files in
+// different styles we should re-compute this estimate.
+static auto EstimateUpperBoundOnNumIdentifiers(int line_count) -> int {
+  return line_count;
+}
+
 auto Lexer::Lex() && -> TokenizedBuffer {
   llvm::StringRef source_text = buffer_.source_->text();
 
   // First build up our line data structures.
   MakeLines(source_text);
 
+  // Use the line count (and any other info needed from this scan) to make rough
+  // estimated reservations of memory in the hot data structures used by the
+  // lexer. In practice, scanning for lines is one of the easiest parts of the
+  // lexer to accelerate, and we can use its results to minimize the cost of
+  // incrementally growing data structures during the hot path of the lexer.
+  //
+  // Note that for hashtables we want estimates near the upper bound to minimize
+  // growth across the vast majority of inputs. They will also typically reserve
+  // more memory than we request due to load factor and rounding to power-of-two
+  // size. This overshoot is usually fine for hot parts of the lexer where
+  // latency is expected to be more important than minimizing memory usage.
+  buffer_.value_stores_->identifiers().Reserve(
+      EstimateUpperBoundOnNumIdentifiers(buffer_.line_infos_.size()));
+
   ssize_t position = 0;
   LexFileStart(source_text, position);