// Part of the Carbon Language project, under the Apache License v2.0 with LLVM // Exceptions. See /LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include #include #include #include "absl/hash/hash.h" #include "absl/random/random.h" #include "common/hashing.h" #include "llvm/ADT/Hashing.h" namespace Carbon { namespace { // We want the benchmark working set to fit in the L1 cache where possible so // that the benchmark focuses on the CPU-execution costs and not memory latency. // For most CPUs we're going to care about, 16k will fit easily, and 32k will // probably fit. But we also need to include sizes for string benchmarks. This // targets 8k of entropy with each object up to 8k of size for a total of 16k. constexpr int EntropySize = 8 << 10; constexpr int EntropyObjSize = 8 << 10; // An array of random entropy with `EntropySize` bytes plus 8k. The goal is that // clients can read `EntropySize` objects of up to 8k size out of this pool by // starting at different byte offsets. static const llvm::ArrayRef entropy_bytes = []() -> llvm::ArrayRef { static llvm::SmallVector bytes; // Pad out the entropy for up to 1kb objects. bytes.resize(EntropySize + EntropyObjSize); absl::BitGen gen; for (std::byte& b : bytes) { b = static_cast(absl::Uniform(gen)); } return bytes; }(); // Based on 16k of entropy above and an L1 cache size often up to 32k, keep each // array of sizes small at 8k or 1k 8-byte sizes. constexpr int NumSizes = 1 << 10; // Selects an array of `NumSizes` sizes, witch each one in the range [0, // MaxSize). The sizes will be in a random order, but the sum of sizes will // always be the same. template static const std::array rand_sizes = []() { std::array sizes; // Build an array with a deterministic set of sizes in the // range [0, MaxSize), using the golden ratio to select well distributed // points in that range. See https://www.youtube.com/watch?v=lOIP_Z_-0Hs for // an example of why this is an effective strategy for selecting sizes in the // range. static_assert(NumSizes > 128); constexpr double Phi = 1.61803398875; constexpr size_t Scale = std::max(1, MaxSize / Phi); for (auto [i, size] : llvm::enumerate(sizes)) { size = (i * Scale) % MaxSize; } // Shuffle the sizes randomly so that there isn't any pattern of sizes // encountered and we get relatively realistic branch prediction behavior // when branching on the size. We use this approach rather than random // sizes to ensure we always have the same total size of data processed. std::shuffle(sizes.begin(), sizes.end(), absl::BitGen()); return sizes; }(); // A small helper class to synthesize random values out of our entropy pool. // This is done in a way that depends on an arbitrary input (`x`) to allow us to // create a benchmark that measures a *dependent* chain of hashes of these // values. // // `T` needs to be default constructable and reasonable to synthesize an // instance by copying random bytes into its underlying storage. // // This helper class also accumulates the number of bytes of data generated in // order to let us compute throughput measurements as well as latency // measurements. // // This helper class has the same API as the `RandStrings` helpers below so that // they can all be used as type parameters to a common benchmark routine below. template struct RandValues { size_t bytes = 0; // Get a random value. We don't need to iterate through sizes so `i` is // ignored, but we use `x` to select our entropy ensuring a dependency on `x` // for the benchmark. auto Get(ssize_t /*i*/, uint64_t x) -> T { static_assert(sizeof(T) <= EntropyObjSize); bytes += sizeof(T); T result; memcpy(&result, &entropy_bytes[x % EntropySize], sizeof(T)); return result; } }; // A specialization to help with building pairs of values. template struct RandValues> { size_t bytes = 0; auto Get(ssize_t /*i*/, uint64_t x) -> std::pair { static_assert(sizeof(std::pair) <= EntropyObjSize); bytes += sizeof(std::pair); T result0; U result1; memcpy(&result0, &entropy_bytes[x % EntropySize], sizeof(T)); memcpy(&result1, &entropy_bytes[x % EntropySize] + sizeof(T), sizeof(U)); return {result0, result1}; } }; // A helper class similar to `RandValues`, but for building strings rather than // values. The string content is pulled from the entropy pool. The size can be // random from [0, MaxSize], or it can be fixed at `MaxSize`. But the `MaxSize` // cannot be larger than a single byte sequence pulled from the entropy pool // (`EntropyObjSize`). template struct RandStrings { size_t bytes = 0; // Get a random string. If the sizes are random, we use `i` to select each // size and require it to be in the range [0, NumSizes). Otherwise `i` is // ignored. We always use `x` to select the entropy and establish a dependency // on the input. auto Get(ssize_t i, uint64_t x) -> llvm::StringRef { static_assert(MaxSize <= EntropyObjSize); size_t s = MaxSize; if constexpr (RandSize) { // When using random sizes, we leverage `i` which is guaranteed to range // from [0, NumSizes). s = rand_sizes[i]; } else { // Prevent `s` from being constant folded when we directly use `MaxSize`. benchmark::DoNotOptimize(s); } bytes += s; return llvm::StringRef( reinterpret_cast(&entropy_bytes[x % EntropySize]), s); } }; struct HashBenchBase { uint64_t seed; HashBenchBase() { // The real-world use case we care about is in a hash table where we'll mix // in some seed state, likely some ASLR address. To simulate this for // benchmarking, compute a seed from the address of a stack local variable. volatile char key; key = 42; // Rinse this through a volatile variable as well so returning it isn't // flagged. The whole point is to escape the address of something on the // stack. volatile auto key_addr = reinterpret_cast(&key); seed = key_addr; } }; struct CarbonHashBench : HashBenchBase { template auto operator()(const T& value) -> uint64_t { return static_cast(HashValue(value, seed)); } }; struct AbseilHashBench : HashBenchBase { template auto operator()(const T& value) -> uint64_t { // Manually seed this with an after-the-fact XOR as there isn't a seeded // version. This matches what Abseil's hash tables do as well. return absl::HashOf(value) ^ seed; } }; struct LLVMHashBench : HashBenchBase { template auto operator()(const T& value) -> uint64_t { // Manually seed this with an after-the-fact XOR as there isn't a seeded // version. return llvm::hash_value(value) ^ seed; } }; template void BM_LatencyHash(benchmark::State& state) { uint64_t x = 13; Values v; Hasher h; // We run the benchmark in `NumSizes` batches so that when needed we always // process each of the sizes and we don't randomly end up with a skewed set of // sizes. while (state.KeepRunningBatch(NumSizes)) { for (ssize_t i = 0; i < NumSizes; ++i) { benchmark::DoNotOptimize(x = h(v.Get(i, x))); } } state.SetBytesProcessed(v.bytes); } // Latency benchmarks are grouped by the three different hash functions to // facilitate comparing their performance for a given value type or string size // bucket. #define LATENCY_VALUE_BENCHMARKS(...) \ BENCHMARK(BM_LatencyHash, CarbonHashBench>); \ BENCHMARK(BM_LatencyHash, AbseilHashBench>); \ BENCHMARK(BM_LatencyHash, LLVMHashBench>) LATENCY_VALUE_BENCHMARKS(uint8_t); LATENCY_VALUE_BENCHMARKS(uint16_t); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(uint32_t); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(uint64_t); LATENCY_VALUE_BENCHMARKS(int*); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(__uint128_t); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(std::pair); LATENCY_VALUE_BENCHMARKS(std::pair); #define LATENCY_STRING_BENCHMARKS(MaxSize) \ BENCHMARK(BM_LatencyHash, \ CarbonHashBench>); \ BENCHMARK(BM_LatencyHash, \ AbseilHashBench>); \ BENCHMARK( \ BM_LatencyHash, LLVMHashBench>) LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/16); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/32); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/64); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/256); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/512); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/1024); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/2048); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/4096); LATENCY_STRING_BENCHMARKS(/*MaxSize=*/8192); // We also want to check for size-specific cliffs, particularly in small sizes // and sizes around implementation inflection points such as powers of two and // half-way points between powers of two. Because these benchmarks are looking // for size-related cliffs, all the runs for particular hash function are kept // together. // // Note: because these use a fixed size, their specific timing isn't terribly // informative. The branch predictor behavior on a modern CPU will be // significantly different in this benchmarks from any other and may distort all // manner of the timings. The results should really only be compared between // sizes for cliffs, and not directly compared with other numbers. #define LATENCY_STRING_SIZE_BENCHMARKS(Hash) \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>); \ BENCHMARK(BM_LatencyHash, Hash>) // Because these just look for size-related cliffs in performance, we only do a // minimal number of benchmarks. There are a lot of sizes so this avoids wasted // time in benchmark runs and there isn't much value from greater comparative // coverage here. LATENCY_STRING_SIZE_BENCHMARKS(CarbonHashBench); LATENCY_STRING_SIZE_BENCHMARKS(AbseilHashBench); } // namespace } // namespace Carbon