2 年前 · 5041a14f59
--- a/toolchain/lex/tokenized_buffer_benchmark.cpp
+++ b/toolchain/lex/tokenized_buffer_benchmark.cpp
@@ -185,7 +185,7 @@ auto GetRandomIdentifiers() -> const std::array<std::string, NumTokens>& {
 
				 
			
 
				 // Compute a random sequence of just identifiers.
			
 
				 template <int MinLength = 1, int MaxLength = 64, bool Uniform = false>
			
 
				-auto RandomIdentifierSeq() -> std::string {
			
 
				+auto RandomIdentifierSeq(llvm::StringRef separator = " ") -> std::string {
			
 
				   // Get a static pool of identifiers with the desired distribution.
			
 
				   const std::array<std::string, NumTokens>& ids =
			
 
				       GetRandomIdentifiers<MinLength, MaxLength, Uniform>();
			
@@ -197,7 +197,7 @@ auto RandomIdentifierSeq() -> std::string {
 
				     tokens[i] = ids[i];
			
 
				   }
			
 
				   std::shuffle(tokens.begin(), tokens.end(), absl::BitGen());
			
 
				-  return llvm::join(tokens, " ");
			
 
				+  return llvm::join(tokens, separator);
			
 
				 }
			
 
				 
			
 
				 auto GetSymbolTokenTable() -> llvm::ArrayRef<TokenKind> {
			
@@ -450,6 +450,29 @@ BENCHMARK(BM_ValidIdentifiers<3, 5, /*Uniform=*/true>);
 
				 BENCHMARK(BM_ValidIdentifiers<3, 16, /*Uniform=*/true>);
			
 
				 BENCHMARK(BM_ValidIdentifiers<12, 64, /*Uniform=*/true>);
			
 
				 
			
 
				+// Stresses the lexer's handling of horizontal whitespace. The input is meant
				+// to be close to a worst case: short identifiers (generated with
				+// MinLength=3, MaxLength=5, Uniform=true) with a run of spaces between each
				+// adjacent pair. `state.range(0)` supplies the length of each run.
				+void BM_HorizontalWhitespace(benchmark::State& state) {
				+  int space_run_length = state.range(0);
				+  std::string spaces(space_run_length, ' ');
				+  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(spaces);
				+
				+  LexerBenchHelper helper(source);
				+  for (auto _ : state) {
				+    TokenizedBuffer lexed = helper.Lex();
				+
				+    // Checking the result makes sure the lex really happened, and that no
				+    // error path distorted the measurement.
				+    CARBON_CHECK(!lexed.has_errors()) << helper.DiagnoseErrors();
				+  }
				+
				+  // Report throughput both in bytes and in tokens.
				+  state.SetBytesProcessed(state.iterations() * source.size());
				+  benchmark::Counter token_rate(
				+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
				+  state.counters["tokens_per_second"] = token_rate;
				+}
				+// Sweep the run length from 1 to 128 spaces, growing by a factor of 4.
				+BENCHMARK(BM_HorizontalWhitespace)->RangeMultiplier(4)->Range(1, 128);
			
 
				+
			
 
				 void BM_RandomSource(benchmark::State& state) {
			
 
				   std::string source = RandomSource(DefaultSourceDist);
			
 
				 
			
@@ -475,6 +498,75 @@ void BM_RandomSource(benchmark::State& state) {
 
				 // range here.
			
 
				 BENCHMARK(BM_RandomSource);
			
 
				 
			
 
				+// Stresses the lexer's handling of blank lines. The input is a sequence of
				+// short identifiers (generated with MinLength=3, MaxLength=5, Uniform=true)
				+// joined by a run of newline characters. `state.range(0)` supplies the
				+// number of newlines in each run.
				+void BM_BlankLines(benchmark::State& state) {
				+  int newline_run_length = state.range(0);
				+  std::string newlines(newline_run_length, '\n');
				+  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(newlines);
				+
				+  LexerBenchHelper helper(source);
				+  for (auto _ : state) {
				+    TokenizedBuffer lexed = helper.Lex();
				+
				+    // Checking the result makes sure the lex really happened, and that no
				+    // error path distorted the measurement.
				+    CARBON_CHECK(!lexed.has_errors()) << helper.DiagnoseErrors();
				+  }
				+
				+  // Report throughput in bytes, tokens, and lines.
				+  state.SetBytesProcessed(state.iterations() * source.size());
				+  benchmark::Counter token_rate(
				+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
				+  state.counters["tokens_per_second"] = token_rate;
				+
				+  // Every newline in the generated source counts as one line.
				+  auto newline_count = llvm::StringRef(source).count('\n');
				+  benchmark::Counter line_rate(
				+      newline_count, benchmark::Counter::kIsIterationInvariantRate);
				+  state.counters["lines_per_second"] = line_rate;
				+}
				+// Sweep the run length from 1 to 128 newlines, growing by a factor of 4.
				+BENCHMARK(BM_BlankLines)->RangeMultiplier(4)->Range(1, 128);
			
 
				+
			
 
				+// Stresses the lexer's handling of comment lines. The input is a sequence of
				+// short identifiers (generated with MinLength=3, MaxLength=5, Uniform=true),
				+// each separated from the next by a block of comment lines.
				+//
				+// Benchmark arguments:
				+// - `state.range(0)`: number of comment lines in each block.
				+// - `state.range(1)`: number of characters following the `//` on each line.
				+// - `state.range(2)`: number of spaces of indentation before the `//`.
				+void BM_CommentLines(benchmark::State& state) {
				+  int line_count = state.range(0);
				+  int body_length = state.range(1);
				+  int indent_width = state.range(2);
				+
				+  // Every comment line in a block is identical, so build it once:
				+  // indentation, the `//` introducer, a body made of spaces, and a newline.
				+  std::string comment_line = std::string(indent_width, ' ') + "//" +
				+                             std::string(body_length, ' ') + "\n";
				+
				+  // The separator first ends the identifier's own line, then appends the
				+  // block of comment lines.
				+  std::string separator = "\n";
				+  for ([[maybe_unused]] int i : llvm::seq(line_count)) {
				+    separator += comment_line;
				+  }
				+  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(separator);
				+
				+  LexerBenchHelper helper(source);
				+  for (auto _ : state) {
				+    TokenizedBuffer lexed = helper.Lex();
				+
				+    // Checking the result makes sure the lex really happened, and that no
				+    // error path distorted the measurement.
				+    CARBON_CHECK(!lexed.has_errors()) << helper.DiagnoseErrors();
				+  }
				+
				+  // Report throughput in bytes, tokens, and lines.
				+  state.SetBytesProcessed(state.iterations() * source.size());
				+  benchmark::Counter token_rate(
				+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
				+  state.counters["tokens_per_second"] = token_rate;
				+
				+  // Every newline in the generated source counts as one line.
				+  auto newline_count = llvm::StringRef(source).count('\n');
				+  benchmark::Counter line_rate(
				+      newline_count, benchmark::Counter::kIsIterationInvariantRate);
				+  state.counters["lines_per_second"] = line_rate;
				+}
				+BENCHMARK(BM_CommentLines)
				+    ->ArgsProduct({
				+        // Comment lines per block: a couple of small sizes, plus a large one
				+        // to see how the cost scales up.
				+        {1, 4, 128},
				+        // Comment body lengths: both extremes and one in the middle.
				+        {0, 30, 70},
				+        // Indentation widths, in spaces.
				+        {0, 2, 8},
				+    });
			
 
				+
			
 
				 // This is a speed-of-light benchmark that should reflect memory bandwidth
			
 
				 // (ideally) of simply reading all the source code. For speed-of-light we use
			
 
				 // `strcpy` -- this both examines every byte of the input looking for a null to