// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <numbers>
#include <utility>

#include "absl/hash/hash.h"
#include "absl/random/random.h"
#include "common/hashing.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
namespace Carbon {
namespace {

// We want the benchmark working set to fit in the L1 cache where possible so
// that the benchmark focuses on the CPU-execution costs and not memory
// latency. For most CPUs we're going to care about, 16k will fit easily, and
// 32k will probably fit. But we also need to include sizes for string
// benchmarks. This targets 8k of entropy, with each object up to 8k in size,
// for a total of 16k.
constexpr int EntropySize = 8 << 10;
constexpr int EntropyObjSize = 8 << 10;

// An array of random entropy with `EntropySize` bytes plus `EntropyObjSize`
// bytes of padding. The goal is that clients can read objects of up to
// `EntropyObjSize` (8k) out of this pool by starting at any of `EntropySize`
// different byte offsets.
static const llvm::ArrayRef<std::byte> entropy_bytes =
    []() -> llvm::ArrayRef<std::byte> {
  static llvm::SmallVector<std::byte> bytes;
  // Pad out the entropy so objects of up to `EntropyObjSize` bytes can start
  // at any offset within the first `EntropySize` bytes.
  bytes.resize(EntropySize + EntropyObjSize);
  absl::BitGen gen;
  for (std::byte& b : bytes) {
    b = static_cast<std::byte>(absl::Uniform<uint8_t>(gen));
  }
  return bytes;
}();
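
// As a quick worked example of reading from this pool (the specific `x` here
// is made up for illustration and not part of the benchmark): with x = 12345,
// a read starts at byte offset 12345 % 8192 == 4153, and even a full
// `EntropyObjSize` (8k) object starting there stays inside the padded 16k
// pool.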

// Based on the 16k of entropy above and an L1 cache size often no more than
// 32k, keep each array of sizes small: 1k 8-byte sizes, or 8k in total.
constexpr int NumSizes = 1 << 10;

// Selects an array of `NumSizes` sizes, with each one in the range
// [0, MaxSize). The sizes will be in a random order, but the sum of the sizes
// will always be the same.
template <size_t MaxSize>
static const std::array<size_t, NumSizes> rand_sizes = []() {
  std::array<size_t, NumSizes> sizes;
  // Build an array with a deterministic set of sizes in the range
  // [0, MaxSize), using the golden ratio to select well-distributed points in
  // that range. See https://www.youtube.com/watch?v=lOIP_Z_-0Hs for an example
  // of why this is an effective strategy for selecting sizes in the range.
  static_assert(NumSizes > 128);
  constexpr size_t Scale = std::max<size_t>(1, MaxSize / std::numbers::phi);
  for (auto [i, size] : llvm::enumerate(sizes)) {
    size = (i * Scale) % MaxSize;
  }

  // Shuffle the sizes randomly so that there isn't any pattern of sizes
  // encountered and we get relatively realistic branch prediction behavior
  // when branching on the size. We use this approach rather than random sizes
  // to ensure we always have the same total size of data processed.
  std::shuffle(sizes.begin(), sizes.end(), absl::BitGen());
  return sizes;
}();
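
// As a worked example of the golden-ratio stride above (MaxSize = 64 is
// chosen purely for illustration): Scale = max(1, 64 / phi) == 39, so before
// shuffling the sizes run 0, 39, 14, 53, 28, 3, 42, ... (each being
// (i * 39) % 64), spreading the points across [0, 64) rather than clustering
// them.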

// A small helper class to synthesize random values out of our entropy pool.
// This is done in a way that depends on an arbitrary input (`x`) to allow us
// to create a benchmark that measures a *dependent* chain of hashes of these
// values.
//
// `T` needs to be default constructible and reasonable to synthesize an
// instance of by copying random bytes into its underlying storage.
//
// This helper class also accumulates the number of bytes of data generated in
// order to let us compute throughput measurements as well as latency
// measurements.
//
// This helper class has the same API as the `RandStrings` helpers below so
// that they can all be used as type parameters to a common benchmark routine
// below.
template <typename T>
struct RandValues {
  size_t bytes = 0;

  // Get a random value. We don't need to iterate through sizes so `i` is
  // ignored, but we use `x` to select our entropy, ensuring a dependency on
  // `x` for the benchmark.
  auto Get(ssize_t /*i*/, uint64_t x) -> T {
    static_assert(sizeof(T) <= EntropyObjSize);
    bytes += sizeof(T);
    T result;
    // Clang Tidy complains about this `memcpy` despite it being the canonical
    // formulation. Removing the destination type `T` would also remove the
    // warnings that catch an incorrect size.
    // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
    memcpy(&result, &entropy_bytes[x % EntropySize], sizeof(T));
    return result;
  }
};

// A specialization to help with building pairs of values.
template <typename T, typename U>
struct RandValues<std::pair<T, U>> {
  size_t bytes = 0;

  auto Get(ssize_t /*i*/, uint64_t x) -> std::pair<T, U> {
    static_assert(sizeof(std::pair<T, U>) <= EntropyObjSize);
    bytes += sizeof(std::pair<T, U>);
    T result0;
    U result1;
    // Clang Tidy complains about these `memcpy` calls despite them being the
    // canonical formulation. Removing the destination types would also remove
    // the warnings that catch an incorrect size.
    // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
    memcpy(&result0, &entropy_bytes[x % EntropySize], sizeof(T));
    // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion)
    memcpy(&result1, &entropy_bytes[x % EntropySize] + sizeof(T), sizeof(U));
    return {result0, result1};
  }
};

// A helper class similar to `RandValues`, but for building strings rather
// than values. The string content is pulled from the entropy pool. The size
// can be random in the range [0, MaxSize), or it can be fixed at `MaxSize`.
// But `MaxSize` cannot be larger than a single byte sequence pulled from the
// entropy pool (`EntropyObjSize`).
template <bool RandSize, size_t MaxSize>
struct RandStrings {
  size_t bytes = 0;

  // Get a random string. If the sizes are random, we use `i` to select each
  // size and require it to be in the range [0, NumSizes). Otherwise `i` is
  // ignored. We always use `x` to select the entropy and establish a
  // dependency on the input.
  auto Get(ssize_t i, uint64_t x) -> llvm::StringRef {
    static_assert(MaxSize <= EntropyObjSize);
    size_t s = MaxSize;
    if constexpr (RandSize) {
      // When using random sizes, we leverage `i`, which is guaranteed to be in
      // the range [0, NumSizes).
      s = rand_sizes<MaxSize>[i];
    } else {
      // Prevent `s` from being constant folded when we directly use `MaxSize`.
      benchmark::DoNotOptimize(s);
    }
    bytes += s;
    return llvm::StringRef(
        reinterpret_cast<const char*>(&entropy_bytes[x % EntropySize]), s);
  }
};

struct HashBenchBase {
  uint64_t seed;

  HashBenchBase() {
    // The real-world use case we care about is in a hash table where we'll mix
    // in some seed state, likely some ASLR address. To simulate this for
    // benchmarking, compute a seed from the address of a stack local variable.
    volatile char key;
    key = 42;
    // Rinse this through a volatile variable as well so returning it isn't
    // flagged. The whole point is to escape the address of something on the
    // stack.
    volatile auto key_addr = reinterpret_cast<uint64_t>(&key);
    seed = key_addr;
  }
};

struct CarbonHashBench : HashBenchBase {
  template <typename T>
  auto operator()(const T& value) -> uint64_t {
    return static_cast<uint64_t>(HashValue(value, seed));
  }
};

struct AbseilHashBench : HashBenchBase {
  template <typename T>
  auto operator()(const T& value) -> uint64_t {
    // Manually seed this with an after-the-fact XOR as there isn't a seeded
    // version. This matches what Abseil's hash tables do as well.
    return absl::HashOf(value) ^ seed;
  }
};

struct LLVMHashBench : HashBenchBase {
  template <typename T>
  auto operator()(const T& value) -> uint64_t {
    // Manually seed this with an after-the-fact XOR as there isn't a seeded
    // version.
    return llvm::hash_value(value) ^ seed;
  }
};

template <typename Values, typename Hasher>
auto BM_LatencyHash(benchmark::State& state) -> void {
  uint64_t x = 13;
  Values v;
  Hasher h;
  // We run the benchmark in `NumSizes` batches so that, when needed, we always
  // process each of the sizes and don't randomly end up with a skewed set of
  // sizes.
  while (state.KeepRunningBatch(NumSizes)) {
    for (ssize_t i = 0; i < NumSizes; ++i) {
      benchmark::DoNotOptimize(x = h(v.Get(i, x)));
    }
  }
  state.SetBytesProcessed(v.bytes);
}
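
// For contrast with the dependent chain above, a throughput-oriented
// benchmark would hash *independent* values so the CPU can overlap work
// across iterations. The sketch below is purely illustrative and is not wired
// up to any `BENCHMARK` invocation; this file intentionally measures the
// dependent-latency case.
template <typename Values, typename Hasher>
auto BM_ThroughputHashSketch(benchmark::State& state) -> void {
  Values v;
  Hasher h;
  while (state.KeepRunningBatch(NumSizes)) {
    for (ssize_t i = 0; i < NumSizes; ++i) {
      // Derive the entropy offset from `i` rather than the previous hash
      // result, removing the cross-iteration dependency.
      benchmark::DoNotOptimize(h(v.Get(i, i)));
    }
  }
  state.SetBytesProcessed(v.bytes);
}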

// Latency benchmarks are grouped by the three different hash functions to
// facilitate comparing their performance for a given value type or string size
// bucket.
#define LATENCY_VALUE_BENCHMARKS(...)                                   \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, CarbonHashBench>);  \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, AbseilHashBench>);  \
  BENCHMARK(BM_LatencyHash<RandValues<__VA_ARGS__>, LLVMHashBench>)
LATENCY_VALUE_BENCHMARKS(uint8_t);
LATENCY_VALUE_BENCHMARKS(uint16_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint8_t, uint8_t>);
LATENCY_VALUE_BENCHMARKS(uint32_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint16_t, uint16_t>);
LATENCY_VALUE_BENCHMARKS(uint64_t);
LATENCY_VALUE_BENCHMARKS(int*);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, uint64_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint32_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint32_t, int*>);
LATENCY_VALUE_BENCHMARKS(__uint128_t);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, uint64_t>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, int*>);
LATENCY_VALUE_BENCHMARKS(std::pair<uint64_t, int*>);
LATENCY_VALUE_BENCHMARKS(std::pair<int*, uint64_t>);

#define LATENCY_STRING_BENCHMARKS(MaxSize)                             \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>,    \
            CarbonHashBench>);                                         \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>,    \
            AbseilHashBench>);                                         \
  BENCHMARK(                                                           \
      BM_LatencyHash<RandStrings</*RandSize=*/true, MaxSize>, LLVMHashBench>)
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/16);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/32);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/64);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/256);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/512);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/1024);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/2048);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4096);
LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8192);

// We also want to check for size-specific cliffs, particularly at small sizes
// and sizes around implementation inflection points such as powers of two and
// the half-way points between powers of two. Because these benchmarks are
// looking for size-related cliffs, all the runs for a particular hash function
// are kept together.
//
// Note: because these use a fixed size, their specific timing isn't terribly
// informative. The branch predictor behavior on a modern CPU will be
// significantly different in these benchmarks from any other and may distort
// the timings in all manner of ways. The results should really only be
// compared between sizes to look for cliffs, and not directly compared with
// other numbers.
#define LATENCY_STRING_SIZE_BENCHMARKS(Hash)                              \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 0>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 1>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 2>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 3>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 4>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 5>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 6>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 7>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 8>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 9>, Hash>);    \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 15>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 16>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 17>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 23>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 24>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 25>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 31>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 32>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 33>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 47>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 48>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 49>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 63>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 64>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 65>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 91>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 92>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 93>, Hash>);   \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 127>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 128>, Hash>);  \
  BENCHMARK(BM_LatencyHash<RandStrings</*RandSize=*/false, 129>, Hash>)

// Because these just look for size-related cliffs in performance, we only run
// a minimal number of benchmarks. There are a lot of sizes, so this avoids
// wasting time in benchmark runs, and there isn't much value in greater
// comparative coverage here.
LATENCY_STRING_SIZE_BENCHMARKS(CarbonHashBench);
LATENCY_STRING_SIZE_BENCHMARKS(AbseilHashBench);

}  // namespace
}  // namespace Carbon