Browse Source

Improve string lexing performance (#1001)

See benchmark notes on https://github.com/carbon-language/carbon-lang/pull/1001
Jon Meow 4 years ago
parent
commit
dfef55fc2c

+ 13 - 0
toolchain/lexer/BUILD

@@ -97,6 +97,19 @@ cc_library(
     ],
     ],
 )
 )
 
 
+cc_binary(
+    name = "string_literal_benchmark",
+    testonly = 1,  # Restricts dependents to testonly targets.
+    srcs = ["string_literal_benchmark.cpp"],
+    deps = [
+        ":string_literal",
+        "//common:ostream",
+        "//toolchain/diagnostics:diagnostic_emitter",
+        "@com_github_google_benchmark//:benchmark_main",  # Supplies main().
+        "@llvm-project//llvm:Support",
+    ],
+)
+
 cc_test(
 cc_test(
     name = "string_literal_test",
     name = "string_literal_test",
     size = "small",
     size = "small",

+ 66 - 48
toolchain/lexer/string_literal.cpp

@@ -83,82 +83,100 @@ struct InvalidHorizontalWhitespaceInString
       "sequence in a string literal.";
       "sequence in a string literal.";
 };
 };
 
 
-// Find and return the opening characters of a multi-line string literal,
+static constexpr char MultiLineIndicator[] = R"(""")";
+
+// Return the number of opening characters of a multi-line string literal,
 // after any '#'s, including the file type indicator and following newline.
 // after any '#'s, including the file type indicator and following newline.
-static auto TakeMultiLineStringLiteralPrefix(llvm::StringRef source_text)
-    -> llvm::StringRef {
-  llvm::StringRef remaining = source_text;
-  if (!remaining.consume_front(R"(""")")) {
-    return llvm::StringRef();
+static auto GetMultiLineStringLiteralPrefixSize(llvm::StringRef source_text)
+    -> int {
+  if (!source_text.startswith(MultiLineIndicator)) {
+    return 0;
   }
   }
 
 
   // The rest of the line must be a valid file type indicator: a sequence of
   // The rest of the line must be a valid file type indicator: a sequence of
   // characters containing neither '#' nor '"' followed by a newline.
   // characters containing neither '#' nor '"' followed by a newline.
-  remaining = remaining.drop_until(
-      [](char c) { return c == '"' || c == '#' || c == '\n'; });
-  if (!remaining.consume_front("\n")) {
-    return llvm::StringRef();
+  auto prefix_end =
+      source_text.find_first_of("#\n\"", strlen(MultiLineIndicator));
+  if (prefix_end == llvm::StringRef::npos || source_text[prefix_end] != '\n') {
+    return 0;
   }
   }
 
 
-  return source_text.take_front(remaining.begin() - source_text.begin());
+  // Include the newline on return.
+  return prefix_end + 1;
 }
 }
 
 
-// If source_text begins with a string literal token, extract and return
-// information on that token.
 auto LexedStringLiteral::Lex(llvm::StringRef source_text)
 auto LexedStringLiteral::Lex(llvm::StringRef source_text)
     -> llvm::Optional<LexedStringLiteral> {
     -> llvm::Optional<LexedStringLiteral> {
-  const char* begin = source_text.begin();
+  int64_t cursor = 0;
+  const int64_t source_text_size = source_text.size();
 
 
-  int hash_level = 0;
-  while (source_text.consume_front("#")) {
-    ++hash_level;
+  // Count the leading '#' characters; the count is the literal's hash level.
+  while (cursor < source_text_size && source_text[cursor] == '#') {
+    ++cursor;
   }
   }
+  const int hash_level = cursor;
 
 
   llvm::SmallString<16> terminator("\"");
   llvm::SmallString<16> terminator("\"");
   llvm::SmallString<16> escape("\\");
   llvm::SmallString<16> escape("\\");
 
 
-  llvm::StringRef multi_line_prefix =
-      TakeMultiLineStringLiteralPrefix(source_text);
-  bool multi_line = !multi_line_prefix.empty();
+  const int multi_line_prefix_size =
+      GetMultiLineStringLiteralPrefixSize(source_text.substr(hash_level));
+  const bool multi_line = multi_line_prefix_size > 0;
   if (multi_line) {
   if (multi_line) {
-    source_text = source_text.drop_front(multi_line_prefix.size());
-    terminator = R"(""")";
-  } else if (!source_text.consume_front("\"")) {
+    cursor += multi_line_prefix_size;
+    terminator = MultiLineIndicator;
+  } else if (cursor < source_text_size && source_text[cursor] == '"') {
+    ++cursor;
+  } else {
     return llvm::None;
     return llvm::None;
   }
   }
 
 
+  const int prefix_len = cursor;
+
   // The terminator and escape sequence marker require a number of '#'s
   // The terminator and escape sequence marker require a number of '#'s
   // matching the leading sequence of '#'s.
   // matching the leading sequence of '#'s.
   terminator.resize(terminator.size() + hash_level, '#');
   terminator.resize(terminator.size() + hash_level, '#');
   escape.resize(escape.size() + hash_level, '#');
   escape.resize(escape.size() + hash_level, '#');
 
 
-  const char* content_begin = source_text.begin();
-  const char* content_end = content_begin;
-  while (!source_text.consume_front(terminator)) {
-    // Let LexError figure out how to recover from an unterminated string
-    // literal.
-    if (source_text.empty()) {
-      return llvm::None;
-    }
-
-    // Consume an escape sequence marker if present.
-    (void)source_text.consume_front(escape);
-
-    // Then consume one more character, either of the content or of an
-    // escape sequence. This can be a newline in a multi-line string literal.
-    // This relies on multi-character escape sequences not containing an
-    // embedded and unescaped terminator or newline.
-    if (!multi_line && source_text.startswith("\n")) {
-      return llvm::None;
+  for (; cursor < source_text_size; ++cursor) {
+    // This switch and loop structure relies on multi-character terminators and
+    // escape sequences starting with a predictable character and not containing
+    // embedded and unescaped terminators or newlines.
+    switch (source_text[cursor]) {
+      case '\\':
+        if (escape.size() == 1 ||
+            source_text.substr(cursor).startswith(escape)) {
+          cursor += escape.size();
+          // If there's either not a character following the escape, or it's a
+          // single-line string and the escaped character is a newline, we
+          // should stop here.
+          if (cursor >= source_text_size ||
+              (!multi_line && source_text[cursor] == '\n')) {
+            return llvm::None;
+          }
+        }
+        break;
+      case '\n':
+        if (!multi_line) {
+          return llvm::None;
+        }
+        break;
+      case '\"': {
+        if (terminator.size() == 1 ||
+            source_text.substr(cursor).startswith(terminator)) {
+          llvm::StringRef text =
+              source_text.substr(0, cursor + terminator.size());
+          llvm::StringRef content =
+              source_text.substr(prefix_len, cursor - prefix_len);
+          return LexedStringLiteral(text, content, hash_level, multi_line);
+        }
+        break;
+      }
     }
     }
-    source_text = source_text.substr(1);
-    content_end = source_text.begin();
   }
   }
-
-  return LexedStringLiteral(
-      llvm::StringRef(begin, source_text.begin() - begin),
-      llvm::StringRef(content_begin, content_end - content_begin), hash_level,
-      multi_line);
+  // Let LexError figure out how to recover from an unterminated string
+  // literal.
+  return llvm::None;
 }
 }
 
 
 // Given a string that contains at least one newline, find the indent (the
 // Given a string that contains at least one newline, find the indent (the

+ 74 - 0
toolchain/lexer/string_literal_benchmark.cpp

@@ -0,0 +1,74 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <benchmark/benchmark.h>
+
+#include "toolchain/lexer/string_literal.h"
+
+namespace Carbon::Testing {
+namespace {
+
+// Benchmarks lexing a well-formed string literal built from `introducer`,
+// 100000 'a' characters of content, and `terminator`.
+static void BM_ValidString(benchmark::State& state, std::string_view introducer,
+                           std::string_view terminator) {
+  std::string x(introducer);
+  x.append(100000, 'a');
+  x.append(terminator);
+  for (auto _ : state) {
+    // Hand the result to the benchmark library so the optimizer cannot
+    // discard the call being measured.
+    auto result = LexedStringLiteral::Lex(x);
+    benchmark::DoNotOptimize(result);
+  }
+}
+
+// Valid single-line literal with no '#'s.
+static void BM_ValidString_Simple(benchmark::State& state) {
+  BM_ValidString(state, "\"", "\"");
+}
+
+// Valid multi-line literal with no '#'s.
+static void BM_ValidString_Multiline(benchmark::State& state) {
+  BM_ValidString(state, "\"\"\"\n", "\n\"\"\"");
+}
+
+// Valid single-line literal delimited with one '#'.
+static void BM_ValidString_Raw(benchmark::State& state) {
+  BM_ValidString(state, "#\"", "\"#");
+}
+
+BENCHMARK(BM_ValidString_Simple);
+BENCHMARK(BM_ValidString_Multiline);
+BENCHMARK(BM_ValidString_Raw);
+
+// Benchmarks lexing an unterminated literal: after `introducer`, the input
+// repeats a fragment in which every '"' is preceded by `escape`, and no
+// terminator is ever appended.
+static void BM_IncompleteWithRepeatedEscapes(benchmark::State& state,
+                                             std::string_view introducer,
+                                             std::string_view escape) {
+  std::string x(introducer);
+  // Aim for about 100k to emphasize escape parsing issues.
+  while (x.size() < 100000) {
+    x.append("key: ");
+    x.append(escape);
+    x.append("\"");
+    x.append(escape);
+    x.append("\"");
+    x.append(escape);
+    x.append("n ");
+  }
+  for (auto _ : state) {
+    // Hand the result to the benchmark library so the optimizer cannot
+    // discard the call being measured.
+    auto result = LexedStringLiteral::Lex(x);
+    benchmark::DoNotOptimize(result);
+  }
+}
+
+// Unterminated single-line literal using the plain '\' escape marker.
+static void BM_IncompleteWithEscapes_Simple(benchmark::State& state) {
+  BM_IncompleteWithRepeatedEscapes(state, "\"", "\\");
+}
+
+// Unterminated multi-line literal using the plain '\' escape marker.
+static void BM_IncompleteWithEscapes_Multiline(benchmark::State& state) {
+  BM_IncompleteWithRepeatedEscapes(state, "\"\"\"\n", "\\");
+}
+
+// Unterminated one-'#' literal, whose escape marker is '\' followed by '#'.
+static void BM_IncompleteWithEscapes_Raw(benchmark::State& state) {
+  BM_IncompleteWithRepeatedEscapes(state, "#\"", "\\#");
+}
+
+BENCHMARK(BM_IncompleteWithEscapes_Simple);
+BENCHMARK(BM_IncompleteWithEscapes_Multiline);
+BENCHMARK(BM_IncompleteWithEscapes_Raw);
+
+}  // namespace
+}  // namespace Carbon::Testing