ソースを参照

Add whitespace- and comment-specific benchmarking. (#3276)

These benchmarks zero in on and stress test the lexing performance of
horizontal whitespace, vertical whitespace, and comments. They set up
essentially a worst-case scenario of ramping up whitespace between very
sparse tokens to show how the lexer copes with this.

The horizontal whitespace benchmark is perhaps less important as
frequent runs of 50 characters of horizontal whitespace are relatively
rare already, and likely to be exceedingly rare without trailing
comments. But it's good to include for completeness and it shows
reasonably strong performance with the current table-dispatch approach.

The blank line and comment line benchmarks are much more important. Lots
of code is relatively line-sparse, especially API files that are perhaps
the most useful to parse quickly. And many of these are a mixture of
code that is sparse due to blank lines and code that is sparse due to
large comment blocks. The benchmarks show that there are some serious
limits here, even falling below 100k tokens per second throughput on
some of the stress tests.

---------

Co-authored-by: Richard Smith <richard@metafoo.co.uk>
Co-authored-by: josh11b <josh11b@users.noreply.github.com>
Chandler Carruth 2 年 前
コミット
5041a14f59
1 ファイル変更94 行追加2 行削除
  1. 94 2
      toolchain/lex/tokenized_buffer_benchmark.cpp

+ 94 - 2
toolchain/lex/tokenized_buffer_benchmark.cpp

@@ -185,7 +185,7 @@ auto GetRandomIdentifiers() -> const std::array<std::string, NumTokens>& {
 
 // Compute a random sequence of just identifiers.
 template <int MinLength = 1, int MaxLength = 64, bool Uniform = false>
-auto RandomIdentifierSeq() -> std::string {
+auto RandomIdentifierSeq(llvm::StringRef separator = " ") -> std::string {
   // Get a static pool of identifiers with the desired distribution.
   const std::array<std::string, NumTokens>& ids =
       GetRandomIdentifiers<MinLength, MaxLength, Uniform>();
@@ -197,7 +197,7 @@ auto RandomIdentifierSeq() -> std::string {
     tokens[i] = ids[i];
   }
   std::shuffle(tokens.begin(), tokens.end(), absl::BitGen());
-  return llvm::join(tokens, " ");
+  return llvm::join(tokens, separator);
 }
 
 auto GetSymbolTokenTable() -> llvm::ArrayRef<TokenKind> {
@@ -450,6 +450,29 @@ BENCHMARK(BM_ValidIdentifiers<3, 5, /*Uniform=*/true>);
 BENCHMARK(BM_ValidIdentifiers<3, 16, /*Uniform=*/true>);
 BENCHMARK(BM_ValidIdentifiers<12, 64, /*Uniform=*/true>);
 
+// Benchmark to stress the lexing of horizontal whitespace. This sets up what is
+// nearly a worst-case scenario of short-but-expensive-to-lex tokens with runs
+// of horizontal whitespace between them.
+void BM_HorizontalWhitespace(benchmark::State& state) {
+  int num_spaces = state.range(0);
+  std::string separator(num_spaces, ' ');
+  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(separator);
+
+  LexerBenchHelper helper(source);
+  for (auto _ : state) {
+    TokenizedBuffer buffer = helper.Lex();
+
+    // Ensure that lexing actually occurs for benchmarking and that it doesn't
+    // hit errors that would skew the benchmark results.
+    CARBON_CHECK(!buffer.has_errors()) << helper.DiagnoseErrors();
+  }
+
+  state.SetBytesProcessed(state.iterations() * source.size());
+  state.counters["tokens_per_second"] = benchmark::Counter(
+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_HorizontalWhitespace)->RangeMultiplier(4)->Range(1, 128);
+
 void BM_RandomSource(benchmark::State& state) {
   std::string source = RandomSource(DefaultSourceDist);
 
@@ -475,6 +498,75 @@ void BM_RandomSource(benchmark::State& state) {
 // range here.
 BENCHMARK(BM_RandomSource);
 
+// Benchmark to stress the lexing of blank lines. This uses a simple, easy to
+// lex token, but separates each one by varying numbers of blank lines.
+void BM_BlankLines(benchmark::State& state) {
+  int num_blank_lines = state.range(0);
+  std::string separator(num_blank_lines, '\n');
+  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(separator);
+
+  LexerBenchHelper helper(source);
+  for (auto _ : state) {
+    TokenizedBuffer buffer = helper.Lex();
+
+    // Ensure that lexing actually occurs for benchmarking and that it doesn't
+    // hit errors that would skew the benchmark results.
+    CARBON_CHECK(!buffer.has_errors()) << helper.DiagnoseErrors();
+  }
+
+  state.SetBytesProcessed(state.iterations() * source.size());
+  state.counters["tokens_per_second"] = benchmark::Counter(
+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
+  state.counters["lines_per_second"] =
+      benchmark::Counter(llvm::StringRef(source).count('\n'),
+                         benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_BlankLines)->RangeMultiplier(4)->Range(1, 128);
+
+// Benchmark to stress the lexing of comment lines. This uses a simple, easy to
+// lex token, but separates each one by varying numbers of comment lines, with
+// varying comment line length and indentation.
+void BM_CommentLines(benchmark::State& state) {
+  int num_comment_lines = state.range(0);
+  int comment_length = state.range(1);
+  int comment_indent = state.range(2);
+  std::string separator;
+  llvm::raw_string_ostream os(separator);
+  os << "\n";
+  for (int i : llvm::seq(num_comment_lines)) {
+    static_cast<void>(i);
+    os << std::string(comment_indent, ' ') << "//"
+       << std::string(comment_length, ' ') << "\n";
+  }
+  std::string source = RandomIdentifierSeq<3, 5, /*Uniform=*/true>(separator);
+
+  LexerBenchHelper helper(source);
+  for (auto _ : state) {
+    TokenizedBuffer buffer = helper.Lex();
+
+    // Ensure that lexing actually occurs for benchmarking and that it doesn't
+    // hit errors that would skew the benchmark results.
+    CARBON_CHECK(!buffer.has_errors()) << helper.DiagnoseErrors();
+  }
+
+  state.SetBytesProcessed(state.iterations() * source.size());
+  state.counters["tokens_per_second"] = benchmark::Counter(
+      NumTokens, benchmark::Counter::kIsIterationInvariantRate);
+  state.counters["lines_per_second"] =
+      benchmark::Counter(llvm::StringRef(source).count('\n'),
+                         benchmark::Counter::kIsIterationInvariantRate);
+}
+BENCHMARK(BM_CommentLines)
+    ->ArgsProduct({
+        // How many lines of comment. Focuses on a couple of small counts and
+        // checks how it scales up to large blocks.
+        {1, 4, 128},
+        // Comment lengths: the two extremes and a middling length.
+        {0, 30, 70},
+        // Comment indentations.
+        {0, 2, 8},
+    });
+
 // This is a speed-of-light benchmark that should reflect memory bandwidth
 // (ideally) of simply reading all the source code. For speed-of-light we use
 // `strcpy` -- this both examines every byte of the input looking for a null to