
Introduce two speed-of-light benchmarks. (#3270)

The goal of these kinds of benchmarks is to help calibrate other
benchmarks and expectations. They benchmark the underlying hardware
capabilities that we can't avoid, and help illustrate bounds for what is
possible. The term "speed-of-light benchmark" refers to measuring how
fast things could possibly run.

The first is a simple memory bandwidth measurement in the best case
scenario -- using `strcpy` over the buffer. This still does a minimal
number of writes to memory and examines each byte of input to see if it
is null, but can cheat in every way possible to run at the maximum speed
of hardware. To a certain extent, we never expect to get close to this
speed, but it's a good illustration of how much headroom the hardware
has available.
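
The idea can be sketched outside a benchmark harness with a bare timing
loop (a simplified illustration, not the benchmark itself; the function
name, buffer size, and iteration count are arbitrary assumptions):

```cpp
#include <chrono>
#include <cstring>
#include <string>
#include <vector>

// Roughly estimate copy bandwidth in bytes/second by timing repeated
// `strcpy` calls over a null-terminated source buffer.
auto EstimateStrCpyBandwidth(const std::string& source, int iterations)
    -> double {
  // Destination needs room for the trailing null byte.
  std::vector<char> dest(source.size() + 1);
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iterations; ++i) {
    // `strcpy` examines every input byte (looking for the null) and writes
    // every byte out -- the minimal per-byte work a lexer must also do.
    std::strcpy(dest.data(), source.c_str());
  }
  auto elapsed = std::chrono::duration<double>(
                     std::chrono::steady_clock::now() - start)
                     .count();
  return static_cast<double>(source.size()) * iterations / elapsed;
}
```

A real harness (like the Google Benchmark code in the diff below) also
has to defeat the optimizer with `benchmark::DoNotOptimize`; without
that, the compiler may elide some of the copies.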

The second is potentially more interesting. This illustrates how fast a
byte-by-byte dispatch loop can potentially be. It uses the technique
that I'm hoping to use in the lexer itself of guaranteed tail recursion
to achieve this with a very small code footprint. The performance of
this technique, even when running in this extremely minimal setting to
establish bounds, is hugely dependent on the number of distinct dispatch
targets, and so the benchmark covers a healthy range of target counts to
show the spread of performance that we might expect when running in a
byte-by-byte mode.
Note that we should expect the lexer to be *faster* than this
"speed-of-light" whenever it is able to lex in larger granules than
byte-wise. But for complex, dense token sequences that force looking at
every byte, this shows the "worst case" "speed-of-light" in a sense.
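
A minimal, self-contained sketch of the guaranteed-tail-recursion
dispatch technique (the names `Step`, `Stop`, and `kTable` here are
illustrative, not from the benchmark, and `[[clang::musttail]]` is a
Clang-specific attribute):

```cpp
#include <array>
#include <cstddef>

// Each entry in a 256-entry table is a handler for one byte value. A handler
// does its work and then tail-calls the handler for the *next* byte, so the
// "loop" is really a chain of indirect jumps through the table. With
// `[[clang::musttail]]` the compiler guarantees each call reuses the same
// stack frame.
using HandlerT = void (*)(const char* text, std::size_t& index,
                          std::size_t& count);
extern const std::array<HandlerT, 256> kTable;

// Handler for the null terminator: simply stop the chain.
void Stop(const char* /*text*/, std::size_t& /*index*/,
          std::size_t& /*count*/) {}

// Handler for every other byte: "process" it and dispatch on the next one.
void Step(const char* text, std::size_t& index, std::size_t& count) {
  ++count;
  ++index;
  [[clang::musttail]] return kTable[static_cast<unsigned char>(text[index])](
      text, index, count);
}

const std::array<HandlerT, 256> kTable = [] {
  std::array<HandlerT, 256> table;
  table.fill(&Step);
  table[0] = &Stop;  // The null byte ends the tail-dispatch chain.
  return table;
}();
```

Dispatching on the first byte of a string then walks the whole string
without any explicit loop, which is the shape the benchmark below
instantiates with varying numbers of distinct handler functions.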

On my recent AMD cloud VM instance, I get the following results running
the main lexer benchmark with these changes included:

```
-------------------------------------------------------------------------------------------------------------------------
Benchmark                                            Time             CPU   Iterations bytes_per_second tokens_per_second
-------------------------------------------------------------------------------------------------------------------------
BM_ValidKeywords                               3169403 ns      3169283 ns          221        188.44M/s        31.5529M/s
BM_ValidIdentifiers<1, 64, false>             12486725 ns     12486445 ns           51       117.953M/s        8.00868M/s
BM_ValidIdentifiers<1, 1, true>                3950455 ns      3950298 ns          178       72.4252M/s        25.3145M/s
BM_ValidIdentifiers<3, 5, true>               15562294 ns     15561178 ns           45       36.7712M/s        6.42625M/s
BM_ValidIdentifiers<3, 16, true>              16118656 ns     16118374 ns           44       68.0412M/s         6.2041M/s
BM_ValidIdentifiers<12, 64, true>             19116271 ns     19116258 ns           35       199.541M/s        5.23115M/s
BM_ValidMix/10/40                              7074336 ns      7073795 ns           93       140.744M/s        14.1367M/s
BM_ValidMix/25/30                              6790722 ns      6790006 ns          102       131.793M/s        14.7275M/s
BM_ValidMix/50/20                              5960514 ns      5960443 ns          118       112.594M/s        16.7773M/s
BM_ValidMix/75/10                              4325546 ns      4325556 ns          159       102.559M/s        23.1184M/s
BM_SpeedOfLightStrCpy                            24339 ns        24339 ns        29650       35.9049G/s        4.10858G/s
BM_SpeedOfLightDispatch<1>                     1756051 ns      1755800 ns          398       509.668M/s        56.9541M/s
BM_SpeedOfLightDispatch<2>                     1611973 ns      1611725 ns          436       555.228M/s        62.0453M/s
BM_SpeedOfLightDispatch<4>                     2064280 ns      2063990 ns          326       433.565M/s        48.4498M/s
BM_SpeedOfLightDispatch<8>                     2484055 ns      2483946 ns          280       360.263M/s        40.2585M/s
BM_SpeedOfLightDispatch<16>                    4550963 ns      4550894 ns          155       196.637M/s        21.9737M/s
BM_SpeedOfLightDispatch<32>                    6507077 ns      6507090 ns          107       137.523M/s        15.3679M/s
BM_SpeedOfLightDispatch<MaxDispatchTargets>    9071198 ns      9071217 ns           77       98.6499M/s        11.0239M/s
```

Even though we're not lexing anything in the speed-of-light benchmark,
the tokens-per-second measure is still meaningful because we *generated*
the token stream and know how many tokens we put into it. The dispatch
technique easily exceeds 10 million tokens/second, but we need to
do substantially better than that to lex at 10 million lines/second.
Fortunately, when the lexer is consuming more than one-byte tokens,
we're already faster than this. And the bytes-per-second numbers from
all but the worst case dispatch scenario are promising.
Chandler Carruth, 2 years ago
Commit c7e6238fa8
1 changed file with 181 additions and 0 deletions:
  toolchain/lex/tokenized_buffer_benchmark.cpp

@@ -5,6 +5,7 @@
 #include <benchmark/benchmark.h>
 
 #include <algorithm>
+#include <utility>
 
 #include "absl/random/random.h"
 #include "common/check.h"
@@ -291,6 +292,8 @@ class LexerBenchHelper {
     return result;
   }
 
+  auto source_text() -> llvm::StringRef { return source_.text(); }
+
  private:
   auto MakeSourceBuffer(llvm::StringRef text) -> SourceBuffer {
     CARBON_CHECK(fs_.addFile(filename_, /*ModificationTime=*/0,
@@ -379,5 +382,183 @@ BENCHMARK(BM_ValidMix)
     ->Args({50, 20})
     ->Args({75, 10});
 
+// This is a speed-of-light benchmark that should reflect memory bandwidth
+// (ideally) of simply reading all the source code. For speed-of-light we use
+// `strcpy` -- this both examines every byte of the input looking for a null to
+// end the copy, and also writes to a data structure of roughly the same size as
+// the input. This routine is one we expect to be *very* well optimized and give
+// a good approximation of the fastest possible lexer given the physical
+// constraints of the machine. Note that which particular source we use as input
+// here isn't especially interesting, so we just pick one and should update it
+// to reflect whatever distribution is most realistic long-term. The
+// bytes/second throughput is the important output of this routine.
+auto BM_SpeedOfLightStrCpy(benchmark::State& state) -> void {
+  std::string source =
+      RandomMixedSeq(/*symbol_percent=*/25, /*keyword_percent=*/30);
+
+  // A buffer to write the null-terminated contents of `source` into.
+  llvm::OwningArrayRef<char> buffer(source.size() + 1);
+
+  for (auto _ : state) {
+    const char* text = source.data();
+    benchmark::DoNotOptimize(text);
+    strcpy(buffer.data(), text);
+    benchmark::DoNotOptimize(buffer.data());
+  }
+
+  state.SetBytesProcessed(state.iterations() * source.size());
+  state.counters["tokens_per_second"] = benchmark::Counter(
+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SpeedOfLightStrCpy);
+
+// This is a speed-of-light benchmark that builds up a best-case byte-wise table
+// dispatch using guaranteed tail recursion. The goal is both to ensure the
+// general technique can reasonably hit the level of performance we need and to
+// establish how far from this speed of light the actual lexer currently sits.
+//
+// A major impact on the observed performance of this technique is how many
+// different functions are reached in this dispatch loop. This benchmark
+// infrastructure tries to bracket the range of performance this technique
+// affords with different numbers of dispatch target functions.
+using DispatchPtrT = auto (*)(ssize_t& index, const char* text, char* buffer)
+    -> void;
+using DispatchTableT = std::array<DispatchPtrT, 256>;
+
+template <const DispatchTableT& Table>
+auto BasicDispatch(ssize_t& index, const char* text, char* buffer) -> void {
+  *buffer = text[index];
+  ++index;
+  [[clang::musttail]] return Table[static_cast<unsigned char>(text[index])](
+      index, text, buffer);
+}
+
+template <const DispatchTableT& Table, char C>
+auto SpecializedDispatch(ssize_t& index, const char* text, char* buffer)
+    -> void {
+  CARBON_CHECK(C == text[index]);
+  *buffer = C;
+  ++index;
+  [[clang::musttail]] return Table[static_cast<unsigned char>(text[index])](
+      index, text, buffer);
+}
+
+// A sample of the symbol characters used in Carbon code. Doesn't need to be
+// perfect, as we just need to have a reasonably large # of distinct dispatch
+// functions.
+constexpr char DispatchSpecializableSymbols[] = {
+    '!', '%', '(', ')', '*', '+', ',', '-', '.', ':',
+    ';', '<', '=', '>', '?', '[', ']', '{', '}', '~',
+};
+
+// Create an array of all the characters we can specialize dispatch over --
+// [0-9A-Za-z] and the symbols above. Similar to the above symbols, doesn't need
+// to be exhaustive.
+constexpr std::array<char, 26 * 2 + 10 + sizeof(DispatchSpecializableSymbols)>
+    DispatchSpecializableChars = []() constexpr {
+      constexpr int Size = sizeof(DispatchSpecializableChars);
+      std::array<char, Size> chars = {};
+      int i = 0;
+      for (char c = '0'; c <= '9'; ++c) {
+        chars[i] = c;
+        ++i;
+      }
+      for (char c = 'A'; c <= 'Z'; ++c) {
+        chars[i] = c;
+        ++i;
+      }
+      for (char c = 'a'; c <= 'z'; ++c) {
+        chars[i] = c;
+        ++i;
+      }
+      for (char c : DispatchSpecializableSymbols) {
+        chars[i] = c;
+        ++i;
+      }
+      CARBON_CHECK(i == Size);
+      return chars;
+    }();
+
+// Instantiate a number of specialized dispatch functions for characters in the
+// array above, and assign those function addresses to the character's entry in
+// the provided table. The provided `tmp_table` is a temporary that will
+// eventually initialize the provided `Table` constant, so the constant is what
+// we propagate to the instantiated function and the temporary is the one we
+// initialize.
+template <const DispatchTableT& Table, size_t... Indices>
+constexpr auto SpecializeDispatchTable(
+    DispatchTableT& tmp_table, std::index_sequence<Indices...> /*indices*/)
+    -> void {
+  static_assert(sizeof...(Indices) <= sizeof(DispatchSpecializableChars));
+  ((tmp_table[static_cast<unsigned char>(DispatchSpecializableChars[Indices])] =
+        &SpecializedDispatch<Table, DispatchSpecializableChars[Indices]>),
+   ...);
+}
+
+// The maximum number of dispatch targets is the size of the array + 1 (for the
+// base case target).
+constexpr int MaxDispatchTargets = sizeof(DispatchSpecializableChars) + 1;
+
+// Dispatch tables with a provided number of distinct dispatch targets. There
+// will always be one additional target for the null byte to end the loop.
+template <int NumDispatchTargets>
+constexpr DispatchTableT DispatchTable = []() constexpr {
+  static_assert(NumDispatchTargets > 0, "Need at least one dispatch target.");
+  static_assert(NumDispatchTargets <= MaxDispatchTargets,
+                "Limited number of dispatch targets available.");
+
+  DispatchTableT tmp_table = {};
+  // Start with the basic dispatch target.
+  for (int i = 0; i < 256; ++i) {
+    tmp_table[i] = &BasicDispatch<DispatchTable<NumDispatchTargets>>;
+  }
+  if constexpr (NumDispatchTargets > 1) {
+    // Add additional dispatch targets from our specializable array.
+    SpecializeDispatchTable<DispatchTable<NumDispatchTargets>>(
+        tmp_table, std::make_index_sequence<NumDispatchTargets - 1>());
+  }
+  // Special case the null byte index to end the tail-dispatch.
+  tmp_table[0] =
+      +[](ssize_t& index, const char* text, char* /*buffer*/) -> void {
+    CARBON_CHECK(text[index] == '\0');
+    return;
+  };
+  return tmp_table;
+}();
+
+template <int NumDispatchTargets>
+auto BM_SpeedOfLightDispatch(benchmark::State& state) -> void {
+  std::string source =
+      RandomMixedSeq(/*symbol_percent=*/25, /*keyword_percent=*/30);
+
+  // A buffer to write to, simulating some minimal write traffic.
+  llvm::OwningArrayRef<char> buffer(source.size());
+
+  for (auto _ : state) {
+    const char* text = source.data();
+    benchmark::DoNotOptimize(text);
+
+    // Use `ssize_t` to minimize indexing overhead.
+    ssize_t i = 0;
+    // The dispatch table tail-recurses through the entire string.
+    DispatchTable<NumDispatchTargets>[static_cast<unsigned char>(text[i])](
+        i, text, buffer.data());
+    CARBON_CHECK(i == static_cast<ssize_t>(source.size()));
+
+    benchmark::DoNotOptimize(buffer.data());
+  }
+
+  state.SetBytesProcessed(state.iterations() * source.size());
+  state.counters["tokens_per_second"] = benchmark::Counter(
+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_SpeedOfLightDispatch<1>);
+BENCHMARK(BM_SpeedOfLightDispatch<2>);
+BENCHMARK(BM_SpeedOfLightDispatch<4>);
+BENCHMARK(BM_SpeedOfLightDispatch<8>);
+BENCHMARK(BM_SpeedOfLightDispatch<16>);
+BENCHMARK(BM_SpeedOfLightDispatch<32>);
+BENCHMARK(BM_SpeedOfLightDispatch<MaxDispatchTargets>);
+
 }  // namespace
 }  // namespace Carbon::Lex