hashing_test.cpp 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "common/hashing.h"
  5. #include <gmock/gmock.h>
  6. #include <gtest/gtest.h>
  7. #include <concepts>
  8. #include <type_traits>
  9. #include "llvm/ADT/Sequence.h"
  10. #include "llvm/ADT/StringExtras.h"
  11. #include "llvm/Support/FormatVariadic.h"
  12. #include "llvm/Support/TypeName.h"
  13. namespace Carbon {
  14. namespace {
  15. using ::testing::Eq;
  16. using ::testing::Le;
  17. using ::testing::Ne;
TEST(HashingTest, HashCodeAPI) {
  // Manually compute a few hash codes where we can exercise the underlying API.
  HashCode empty = HashValue("");
  HashCode a = HashValue("a");
  HashCode b = HashValue("b");
  // Hashing is deterministic within one execution: recomputing must match.
  ASSERT_THAT(HashValue(""), Eq(empty));
  ASSERT_THAT(HashValue("a"), Eq(a));
  ASSERT_THAT(HashValue("b"), Eq(b));
  // Distinct short inputs shouldn't trivially collide.
  ASSERT_THAT(empty, Ne(a));
  ASSERT_THAT(empty, Ne(b));
  ASSERT_THAT(a, Ne(b));
  // Exercise the methods in basic ways across a few sizes. This doesn't check
  // much beyond stability across re-computed values, crashing, or hitting UB.
  EXPECT_THAT(HashValue("a").ExtractIndex(), Eq(a.ExtractIndex()));
  EXPECT_THAT(a.ExtractIndex(), Ne(b.ExtractIndex()));
  EXPECT_THAT(a.ExtractIndex(), Ne(empty.ExtractIndex()));
  // The tag shouldn't have bits set outside the range requested.
  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<1>().second & ~0b1, Eq(0));
  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>().second & ~0b11, Eq(0));
  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<3>().second & ~0b111, Eq(0));
  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<4>().second & ~0b1111, Eq(0));
  // Note that the index produced with a tag may be different from the index
  // alone!
  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<2>(),
              Eq(a.ExtractIndexAndTag<2>()));
  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<16>(),
              Eq(a.ExtractIndexAndTag<16>()));
  EXPECT_THAT(HashValue("a").ExtractIndexAndTag<7>(),
              Eq(a.ExtractIndexAndTag<7>()));
  // Different inputs should produce both different indices and different tags.
  const auto [a_index, a_tag] = a.ExtractIndexAndTag<4>();
  const auto [b_index, b_tag] = b.ExtractIndexAndTag<4>();
  EXPECT_THAT(a_index, Ne(b_index));
  EXPECT_THAT(a_tag, Ne(b_tag));
}
  52. TEST(HashingTest, Integers) {
  53. for (int64_t i : {0, 1, 2, 3, 42, -1, -2, -3, -13}) {
  54. SCOPED_TRACE(llvm::formatv("Hashing: {0}", i).str());
  55. auto test_int_hash = [](auto i) {
  56. using T = decltype(i);
  57. SCOPED_TRACE(
  58. llvm::formatv("Hashing type: {0}", llvm::getTypeName<T>()).str());
  59. HashCode hash = HashValue(i);
  60. // Hashes should be stable within the execution.
  61. EXPECT_THAT(HashValue(i), Eq(hash));
  62. // Zero should match, and other integers shouldn't collide trivially.
  63. HashCode hash_zero = HashValue(static_cast<T>(0));
  64. if (i == 0) {
  65. EXPECT_THAT(hash, Eq(hash_zero));
  66. } else {
  67. EXPECT_THAT(hash, Ne(hash_zero));
  68. }
  69. };
  70. test_int_hash(static_cast<int8_t>(i));
  71. test_int_hash(static_cast<uint8_t>(i));
  72. test_int_hash(static_cast<int16_t>(i));
  73. test_int_hash(static_cast<uint16_t>(i));
  74. test_int_hash(static_cast<int32_t>(i));
  75. test_int_hash(static_cast<uint32_t>(i));
  76. // `i` is already an int64_t variable.
  77. test_int_hash(i);
  78. test_int_hash(static_cast<uint64_t>(i));
  79. }
  80. }
  81. TEST(HashingTest, BasicSeeding) {
  82. auto unseeded_hash = HashValue(42);
  83. EXPECT_THAT(unseeded_hash, Ne(HashValue(42, 1)));
  84. EXPECT_THAT(unseeded_hash, Ne(HashValue(42, 2)));
  85. EXPECT_THAT(unseeded_hash, Ne(HashValue(42, 3)));
  86. EXPECT_THAT(unseeded_hash,
  87. Ne(HashValue(42, static_cast<uint64_t>(unseeded_hash))));
  88. }
  89. TEST(HashingTest, Pointers) {
  90. int object1 = 42;
  91. std::string object2 =
  92. "Hello World! This is a long-ish string so it ends up on the heap!";
  93. HashCode hash_null = HashValue(nullptr);
  94. // Hashes should be stable.
  95. EXPECT_THAT(HashValue(nullptr), Eq(hash_null));
  96. // Hash other kinds of pointers without trivial collisions.
  97. HashCode hash1 = HashValue(&object1);
  98. HashCode hash2 = HashValue(&object2);
  99. HashCode hash3 = HashValue(object2.data());
  100. EXPECT_THAT(hash1, Ne(hash_null));
  101. EXPECT_THAT(hash2, Ne(hash_null));
  102. EXPECT_THAT(hash3, Ne(hash_null));
  103. EXPECT_THAT(hash1, Ne(hash2));
  104. EXPECT_THAT(hash1, Ne(hash3));
  105. EXPECT_THAT(hash2, Ne(hash3));
  106. // Hash values reflect the address and not the type.
  107. EXPECT_THAT(HashValue(static_cast<void*>(nullptr)), Eq(hash_null));
  108. EXPECT_THAT(HashValue(static_cast<int*>(nullptr)), Eq(hash_null));
  109. EXPECT_THAT(HashValue(static_cast<std::string*>(nullptr)), Eq(hash_null));
  110. EXPECT_THAT(HashValue(reinterpret_cast<void*>(&object1)), Eq(hash1));
  111. EXPECT_THAT(HashValue(reinterpret_cast<int*>(&object2)), Eq(hash2));
  112. EXPECT_THAT(HashValue(reinterpret_cast<std::string*>(object2.data())),
  113. Eq(hash3));
  114. }
TEST(HashingTest, PairsAndTuples) {
  // Note that we can't compare hash codes across arity, or in general, compare
  // hash codes for different types as the type isn't part of the hash. These
  // hashes are targeted at use in hash tables which pick a single type that's
  // the basis of any comparison.
  HashCode hash_00 = HashValue(std::pair(0, 0));
  HashCode hash_01 = HashValue(std::pair(0, 1));
  HashCode hash_10 = HashValue(std::pair(1, 0));
  HashCode hash_11 = HashValue(std::pair(1, 1));
  // All four 2-element combinations of 0/1 must hash pairwise distinctly,
  // including position swaps like (0, 1) vs (1, 0).
  EXPECT_THAT(hash_00, Ne(hash_01));
  EXPECT_THAT(hash_00, Ne(hash_10));
  EXPECT_THAT(hash_00, Ne(hash_11));
  EXPECT_THAT(hash_01, Ne(hash_10));
  EXPECT_THAT(hash_01, Ne(hash_11));
  EXPECT_THAT(hash_10, Ne(hash_11));
  HashCode hash_000 = HashValue(std::tuple(0, 0, 0));
  HashCode hash_001 = HashValue(std::tuple(0, 0, 1));
  HashCode hash_010 = HashValue(std::tuple(0, 1, 0));
  HashCode hash_011 = HashValue(std::tuple(0, 1, 1));
  HashCode hash_100 = HashValue(std::tuple(1, 0, 0));
  HashCode hash_101 = HashValue(std::tuple(1, 0, 1));
  HashCode hash_110 = HashValue(std::tuple(1, 1, 0));
  HashCode hash_111 = HashValue(std::tuple(1, 1, 1));
  // Likewise all eight 3-element combinations must be pairwise distinct.
  EXPECT_THAT(hash_000, Ne(hash_001));
  EXPECT_THAT(hash_000, Ne(hash_010));
  EXPECT_THAT(hash_000, Ne(hash_011));
  EXPECT_THAT(hash_000, Ne(hash_100));
  EXPECT_THAT(hash_000, Ne(hash_101));
  EXPECT_THAT(hash_000, Ne(hash_110));
  EXPECT_THAT(hash_000, Ne(hash_111));
  EXPECT_THAT(hash_001, Ne(hash_010));
  EXPECT_THAT(hash_001, Ne(hash_011));
  EXPECT_THAT(hash_001, Ne(hash_100));
  EXPECT_THAT(hash_001, Ne(hash_101));
  EXPECT_THAT(hash_001, Ne(hash_110));
  EXPECT_THAT(hash_001, Ne(hash_111));
  EXPECT_THAT(hash_010, Ne(hash_011));
  EXPECT_THAT(hash_010, Ne(hash_100));
  EXPECT_THAT(hash_010, Ne(hash_101));
  EXPECT_THAT(hash_010, Ne(hash_110));
  EXPECT_THAT(hash_010, Ne(hash_111));
  EXPECT_THAT(hash_011, Ne(hash_100));
  EXPECT_THAT(hash_011, Ne(hash_101));
  EXPECT_THAT(hash_011, Ne(hash_110));
  EXPECT_THAT(hash_011, Ne(hash_111));
  EXPECT_THAT(hash_100, Ne(hash_101));
  EXPECT_THAT(hash_100, Ne(hash_110));
  EXPECT_THAT(hash_100, Ne(hash_111));
  EXPECT_THAT(hash_101, Ne(hash_110));
  EXPECT_THAT(hash_101, Ne(hash_111));
  EXPECT_THAT(hash_110, Ne(hash_111));
  // Hashing a 2-tuple and a pair should produce identical results, so pairs
  // are compatible with code using things like variadic tuple construction.
  EXPECT_THAT(HashValue(std::tuple(0, 0)), Eq(hash_00));
  EXPECT_THAT(HashValue(std::tuple(0, 1)), Eq(hash_01));
  EXPECT_THAT(HashValue(std::tuple(1, 0)), Eq(hash_10));
  EXPECT_THAT(HashValue(std::tuple(1, 1)), Eq(hash_11));
  // Integers in tuples should also work.
  for (int i : {0, 1, 2, 3, 42, -1, -2, -3, -13}) {
    SCOPED_TRACE(llvm::formatv("Hashing: ({0}, {0}, {0})", i).str());
    auto test_int_tuple_hash = [](auto i) {
      using T = decltype(i);
      SCOPED_TRACE(
          llvm::formatv("Hashing integer type: {0}", llvm::getTypeName<T>())
              .str());
      std::tuple v = {i, i, i};
      HashCode hash = HashValue(v);
      // Hashes should be stable within the execution.
      EXPECT_THAT(HashValue(v), Eq(hash));
      // Zero should match, and other integers shouldn't collide trivially.
      T zero = 0;
      std::tuple zero_tuple = {zero, zero, zero};
      HashCode hash_zero = HashValue(zero_tuple);
      if (i == 0) {
        EXPECT_THAT(hash, Eq(hash_zero));
      } else {
        EXPECT_THAT(hash, Ne(hash_zero));
      }
    };
    test_int_tuple_hash(i);
    test_int_tuple_hash(static_cast<int8_t>(i));
    test_int_tuple_hash(static_cast<uint8_t>(i));
    test_int_tuple_hash(static_cast<int16_t>(i));
    test_int_tuple_hash(static_cast<uint16_t>(i));
    test_int_tuple_hash(static_cast<int32_t>(i));
    test_int_tuple_hash(static_cast<uint32_t>(i));
    test_int_tuple_hash(static_cast<int64_t>(i));
    test_int_tuple_hash(static_cast<uint64_t>(i));
    // Heterogeneous integer types should also work, but we only support
    // comparing against hashes of tuples with the exact same type.
    using T1 = std::tuple<int8_t, uint32_t, int16_t>;
    using T2 = std::tuple<uint32_t, int16_t, uint64_t>;
    if (i == 0) {
      EXPECT_THAT(HashValue(T1{i, i, i}), Eq(HashValue(T1{0, 0, 0})));
      EXPECT_THAT(HashValue(T2{i, i, i}), Eq(HashValue(T2{0, 0, 0})));
    } else {
      EXPECT_THAT(HashValue(T1{i, i, i}), Ne(HashValue(T1{0, 0, 0})));
      EXPECT_THAT(HashValue(T2{i, i, i}), Ne(HashValue(T2{0, 0, 0})));
    }
  }
  // Hash values of pointers in pairs and tuples reflect the address and not the
  // type. Pairs and 2-tuples give the same hash values.
  HashCode hash_2null = HashValue(std::pair(nullptr, nullptr));
  EXPECT_THAT(HashValue(std::tuple(static_cast<int*>(nullptr),
                                   static_cast<double*>(nullptr))),
              Eq(hash_2null));
  // Hash other kinds of pointers without trivial collisions.
  int object1 = 42;
  std::string object2 = "Hello world!";
  HashCode hash_3ptr =
      HashValue(std::tuple(&object1, &object2, object2.data()));
  EXPECT_THAT(hash_3ptr, Ne(HashValue(std::tuple(nullptr, nullptr, nullptr))));
  // Hash values reflect the address and not the type.
  EXPECT_THAT(
      HashValue(std::tuple(reinterpret_cast<void*>(&object1),
                           reinterpret_cast<int*>(&object2),
                           reinterpret_cast<std::string*>(object2.data()))),
      Eq(hash_3ptr));
}
  234. TEST(HashingTest, BasicStrings) {
  235. llvm::SmallVector<std::pair<std::string, HashCode>> hashes;
  236. for (int size : {0, 1, 2, 4, 16, 64, 256, 1024}) {
  237. std::string s(size, 'a');
  238. hashes.push_back({s, HashValue(s)});
  239. }
  240. for (const auto& [s1, hash1] : hashes) {
  241. EXPECT_THAT(HashValue(s1), Eq(hash1));
  242. // Also check that we get the same hashes even when using string-wrapping
  243. // types.
  244. EXPECT_THAT(HashValue(std::string_view(s1)), Eq(hash1));
  245. EXPECT_THAT(HashValue(llvm::StringRef(s1)), Eq(hash1));
  246. // And some basic tests that simple things don't collide.
  247. for (const auto& [s2, hash2] : hashes) {
  248. if (s1 != s2) {
  249. EXPECT_THAT(hash1, Ne(hash2))
  250. << "Matching hashes for '" << s1 << "' and '" << s2 << "'";
  251. }
  252. }
  253. }
  254. }
  255. TEST(HashingTest, ArrayLike) {
  256. int c_array[] = {1, 2, 3, 4};
  257. EXPECT_THAT(HashValue(c_array), Eq(HashValue(c_array)));
  258. EXPECT_THAT(HashValue(std::array{1, 2, 3, 4}), Eq(HashValue(c_array)));
  259. EXPECT_THAT(HashValue(std::vector{1, 2, 3, 4}), Eq(HashValue(c_array)));
  260. EXPECT_THAT(HashValue(llvm::SmallVector<int>{1, 2, 3, 4}),
  261. Eq(HashValue(c_array)));
  262. }
  263. TEST(HashingTest, HashAPInt) {
  264. // The bit width should be hashed as well as the value.
  265. llvm::APInt one_64(/*numBits=*/64, /*val=*/1);
  266. llvm::APInt two_64(/*numBits=*/64, /*val=*/2);
  267. llvm::APInt one_128(/*numBits=*/128, /*val=*/1);
  268. llvm::APInt two_128(/*numBits=*/128, /*val=*/2);
  269. std::array array = {one_64, two_64, one_128, two_128};
  270. for (int i : llvm::seq<int>(array.size())) {
  271. EXPECT_THAT(HashValue(array[i]), Eq(HashValue(array[i])));
  272. for (int j : llvm::seq<int>(i + 1, array.size())) {
  273. EXPECT_THAT(HashValue(array[i]), Ne(HashValue(array[j])))
  274. << "Hashing #" << i << " and #" << j;
  275. }
  276. }
  277. }
TEST(HashingTest, HashAPFloat) {
  // Hashtable equality for `APFloat` uses a bitwise comparison. This
  // differentiates between various things that would otherwise not make sense:
  // - Different floating point semantics
  // - `-0.0` and `0.0`
  //
  // It also allows NaNs to be compared meaningfully.
  llvm::APFloat zero_float =
      llvm::APFloat::getZero(llvm::APFloat::IEEEsingle());
  llvm::APFloat neg_zero_float =
      llvm::APFloat::getZero(llvm::APFloat::IEEEsingle(), /*Negative=*/true);
  llvm::APFloat zero_double =
      llvm::APFloat::getZero(llvm::APFloat::IEEEdouble());
  llvm::APFloat zero_bfloat = llvm::APFloat::getZero(llvm::APFloat::BFloat());
  llvm::APFloat one_float = llvm::APFloat::getOne(llvm::APFloat::IEEEsingle());
  llvm::APFloat inf_float = llvm::APFloat::getInf(llvm::APFloat::IEEEsingle());
  llvm::APFloat nan_0_float = llvm::APFloat::getNaN(
      llvm::APFloat::IEEEsingle(), /*Negative=*/false, /*payload=*/0);
  llvm::APFloat nan_42_float = llvm::APFloat::getNaN(
      llvm::APFloat::IEEEsingle(), /*Negative=*/false, /*payload=*/42);
  // `nan_0_float` is deliberately excluded from the pairwise-distinct array:
  // it is expected to *collide* with `nan_42_float` and is checked below.
  std::array array = {zero_float, neg_zero_float, zero_double, zero_bfloat,
                      one_float, inf_float, nan_42_float};
  for (int i : llvm::seq<int>(array.size())) {
    // Stable on recomputation.
    EXPECT_THAT(HashValue(array[i]), Eq(HashValue(array[i])));
    // Pairwise distinct from every later entry.
    for (int j : llvm::seq<int>(i + 1, array.size())) {
      EXPECT_THAT(HashValue(array[i]), Ne(HashValue(array[j])))
          << "Hashing #" << i << " and #" << j;
    }
  }
  // Note that currently we use LLVM's hashing of `APFloat` which does *not*
  // hash the payload of NaNs.
  EXPECT_THAT(HashValue(nan_0_float), Eq(HashValue(nan_42_float)));
}
// A type that has hashing customization. However, it also works to be small and
// appear to have a unique object representation. This helps ensure that when a
// user provides custom hashing it is reliably used.
struct HashableType {
  int8_t x;
  int8_t y;
  // Deliberately excluded from the custom hash below, so tests can detect if
  // the framework incorrectly hashed the raw object bytes instead.
  int16_t ignored = 0;
  // Provide the hashing but try to craft a relatively low-ranking overload to
  // help ensure that the hashing framework doesn't accidentally override this.
  template <typename T>
    requires(std::same_as<T, HashableType>)
  friend auto CarbonHashValue(const T& value, uint64_t seed) -> HashCode {
    // Only `x` and `y` feed the hash; `ignored` is skipped on purpose.
    Hasher hasher(seed);
    hasher.Hash(value.x, value.y);
    return static_cast<HashCode>(hasher);
  }
};
// The type must look like plain bytes to the framework for the tests above to
// be meaningful.
static_assert(std::has_unique_object_representations_v<HashableType>);
  329. TEST(HashingTest, CustomType) {
  330. HashableType a = {.x = 1, .y = 2};
  331. HashableType b = {.x = 3, .y = 4};
  332. EXPECT_THAT(HashValue(a), Eq(HashValue(a)));
  333. EXPECT_THAT(HashValue(a), Ne(HashValue(b)));
  334. // Differences in an ignored field have no impact.
  335. HashableType c = {.x = 3, .y = 4, .ignored = 42};
  336. EXPECT_THAT(HashValue(c), Eq(HashValue(b)));
  337. }
TEST(HashingTest, ArrayRecursion) {
  // Make sure we correctly recurse when hashing an array and don't try to use
  // the object representation.
  llvm::APInt one_64(/*numBits=*/64, /*val=*/1);
  llvm::APInt two_64(/*numBits=*/64, /*val=*/2);
  llvm::APInt one_128(/*numBits=*/128, /*val=*/1);
  llvm::APInt two_128(/*numBits=*/128, /*val=*/2);
  std::array apint_array = {one_64, two_64, one_128, two_128};
  // Equal sequences hash equally; any reordering, substitution, or length
  // change must alter the hash.
  EXPECT_THAT(HashValue(apint_array),
              Eq(HashValue(std::array{one_64, two_64, one_128, two_128})));
  EXPECT_THAT(HashValue(apint_array),
              Ne(HashValue(std::array{one_64, two_64, two_128, one_128})));
  EXPECT_THAT(HashValue(apint_array),
              Ne(HashValue(std::array{one_64, two_64, one_64, two_128})));
  EXPECT_THAT(HashValue(apint_array),
              Ne(HashValue(std::array{one_64, two_128, one_128, two_128})));
  EXPECT_THAT(HashValue(apint_array),
              Ne(HashValue(std::array{one_64, two_64, one_128})));
  EXPECT_THAT(
      HashValue(apint_array),
      Ne(HashValue(std::array{one_64, two_64, one_128, two_128, two_128})));
  // Also test for a custom type that still *looks* like plain data.
  HashableType a = {.x = 1, .y = 2};
  HashableType b = {.x = 3, .y = 4};
  HashableType c = {.x = 3, .y = 4, .ignored = 42};
  std::array custom_array = {a, b, c, a};
  // `b` and `c` hash identically (the custom hash skips `ignored`), so
  // substituting one for the other preserves the array's hash.
  EXPECT_THAT(HashValue(custom_array), Eq(HashValue(std::array{a, b, c, a})));
  EXPECT_THAT(HashValue(custom_array), Eq(HashValue(std::array{a, b, b, a})));
  EXPECT_THAT(HashValue(custom_array), Ne(HashValue(std::array{a, b, c, b})));
  EXPECT_THAT(HashValue(custom_array), Ne(HashValue(std::array{a, b, a, c})));
  EXPECT_THAT(HashValue(custom_array), Ne(HashValue(std::array{a, b, c})));
  EXPECT_THAT(HashValue(custom_array),
              Ne(HashValue(std::array{a, b, c, a, a})));
}
TEST(HashingTest, TupleRecursion) {
  // Make sure we can hash pairs and tuples which require us to recurse for each
  // element rather than treating the whole object as raw storage.
  // We can use APInt values to help test this.
  llvm::APInt one_64(/*numBits=*/64, /*val=*/1);
  llvm::APInt two_64(/*numBits=*/64, /*val=*/2);
  llvm::APInt one_128(/*numBits=*/128, /*val=*/1);
  llvm::APInt two_128(/*numBits=*/128, /*val=*/2);
  // Pairs: stable on recomputation, and sensitive to element value, bit
  // width, and position.
  EXPECT_THAT(HashValue(std::pair{one_64, one_128}),
              Eq(HashValue(std::pair{one_64, one_128})));
  EXPECT_THAT(HashValue(std::pair{one_64, one_128}),
              Ne(HashValue(std::pair{one_64, two_64})));
  EXPECT_THAT(HashValue(std::pair{one_64, one_128}),
              Ne(HashValue(std::pair{one_64, one_64})));
  EXPECT_THAT(HashValue(std::pair{one_64, one_128}),
              Ne(HashValue(std::pair{one_128, one_64})));
  // Tuples: same properties, plus sensitivity to arity.
  EXPECT_THAT(HashValue(std::tuple{one_64, one_128, two_64}),
              Eq(HashValue(std::tuple{one_64, one_128, two_64})));
  EXPECT_THAT(HashValue(std::tuple{one_64, one_128, two_64}),
              Ne(HashValue(std::tuple{one_64, two_64, two_64})));
  EXPECT_THAT(HashValue(std::tuple{one_64, one_128, two_64}),
              Ne(HashValue(std::tuple{one_64, one_64, two_64})));
  EXPECT_THAT(HashValue(std::tuple{one_64, one_128, two_64}),
              Ne(HashValue(std::tuple{one_64, two_64, one_128})));
  EXPECT_THAT(HashValue(std::tuple{one_64, one_128, two_64}),
              Ne(HashValue(std::tuple{one_64, one_128})));
  // Also test for a custom type that still *looks* like plain data.
  HashableType a = {.x = 1, .y = 2};
  HashableType b = {.x = 3, .y = 4};
  HashableType c = {.x = 3, .y = 4, .ignored = 42};
  // `b` and `c` hash identically because the custom hash skips `ignored`.
  EXPECT_THAT(HashValue(std::pair{a, b}), Eq(HashValue(std::pair{a, b})));
  EXPECT_THAT(HashValue(std::pair{a, b}), Ne(HashValue(std::pair{a, a})));
  EXPECT_THAT(HashValue(std::pair{a, b}), Ne(HashValue(std::pair{b, a})));
  EXPECT_THAT(HashValue(std::pair{a, b}), Eq(HashValue(std::pair{a, c})));
  EXPECT_THAT(HashValue(std::tuple{a, b, a}),
              Eq(HashValue(std::tuple{a, b, a})));
  EXPECT_THAT(HashValue(std::tuple{a, b, a}),
              Ne(HashValue(std::tuple{a, b, b})));
  EXPECT_THAT(HashValue(std::tuple{a, b, a}),
              Ne(HashValue(std::tuple{a, a, a})));
  EXPECT_THAT(HashValue(std::tuple{a, b, a}),
              Eq(HashValue(std::tuple{a, c, a})));
}
// The only significantly bad seed is zero, so pick a non-zero seed with a tiny
// amount of entropy to make sure that none of the testing relies on the entropy
// from this.
constexpr uint64_t TestSeed = 42ULL * 1024;
  419. auto ToHexBytes(llvm::StringRef s) -> std::string {
  420. std::string rendered;
  421. llvm::raw_string_ostream os(rendered);
  422. os << "{";
  423. llvm::ListSeparator sep(", ");
  424. for (const char c : s) {
  425. os << sep << llvm::formatv("{0:x2}", static_cast<uint8_t>(c));
  426. }
  427. os << "}";
  428. return rendered;
  429. }
// A hash code paired with the value it was computed from, so that diagnostics
// can report which inputs collided.
template <typename T>
struct HashedValue {
  HashCode hash;
  T v;
};
using HashedString = HashedValue<std::string>;
  436. template <typename T>
  437. auto PrintFullWidthHex(llvm::raw_ostream& os, T value) {
  438. static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
  439. sizeof(T) == 8);
  440. // Given the nature of a format string and the good formatting, a nested
  441. // conditional seems like the most readable structure.
  442. // NOLINTBEGIN(readability-avoid-nested-conditional-operator)
  443. os << llvm::formatv(sizeof(T) == 1 ? "{0:x2}"
  444. : sizeof(T) == 2 ? "{0:x4}"
  445. : sizeof(T) == 4 ? "{0:x8}"
  446. : "{0:x16}",
  447. static_cast<uint64_t>(value));
  448. // NOLINTEND(readability-avoid-nested-conditional-operator)
  449. }
// Streams a hashed integer as its hash code plus the full-width hex rendering
// of the value, for use in test failure messages.
template <typename T>
  requires std::integral<T>
auto operator<<(llvm::raw_ostream& os, HashedValue<T> hv)
    -> llvm::raw_ostream& {
  os << "hash " << hv.hash << " for value ";
  PrintFullWidthHex(os, hv.v);
  return os;
}
// Streams a hashed pair of integers, rendering each element in full-width hex,
// for use in test failure messages.
template <typename T, typename U>
  requires std::integral<T> && std::integral<U>
auto operator<<(llvm::raw_ostream& os, HashedValue<std::pair<T, U>> hv)
    -> llvm::raw_ostream& {
  os << "hash " << hv.hash << " for pair of ";
  PrintFullWidthHex(os, hv.v.first);
  os << " and ";
  PrintFullWidthHex(os, hv.v.second);
  return os;
}
// Summary of the collisions found within a bit range of a set of hash codes.
struct Collisions {
  // Total number of colliding entries across all bit patterns.
  int total;
  // Median per-entry collision count.
  int median;
  // Maximum per-entry collision count.
  int max;
};
// Analyzes a list of hashed values to find all of the hash codes which collide
// within a specific bit-range.
//
// With `BitBegin=0` and `BitEnd=64`, this is equivalent to finding full
// collisions. But when the begin and end of the bit range are narrower than the
// 64-bits of the hash code, it allows this function to analyze a specific
// window of bits within the 64-bit hash code to understand how many collisions
// emerge purely within that bit range.
//
// With narrow ranges (we often look at the first N and last N bits for small
// N), collisions are common and so this function summarizes this with the total
// number of collisions and the median number of collisions for an input value.
template <int BitBegin, int BitEnd, typename T>
auto FindBitRangeCollisions(llvm::ArrayRef<HashedValue<T>> hashes)
    -> Collisions {
  static_assert(BitBegin < BitEnd);
  constexpr int BitCount = BitEnd - BitBegin;
  // The extracted window must fit into the 32-bit `bits` field below.
  static_assert(BitCount <= 32);
  constexpr int BitShift = BitBegin;
  constexpr uint64_t BitMask = ((1ULL << BitCount) - 1) << BitShift;
  // We collect counts of collisions in a vector. Initially, we just have a zero
  // and all inputs map to that collision count. As we discover collisions,
  // we'll create a dedicated counter for it and count how many inputs collide.
  llvm::SmallVector<int> collision_counts;
  collision_counts.push_back(0);
  // The "map" for collision counts. Each input hashed value has a corresponding
  // index stored here. That index is the index of the collision count in the
  // container above. We resize this to fill it with zeros to start as the zero
  // index above has a collision count of zero.
  //
  // The result of this is that the number of collisions for `hashes[i]` is
  // `collision_counts[collision_map[i]]`.
  llvm::SmallVector<int> collision_map;
  collision_map.resize(hashes.size());
  // First, we extract the bit subsequence we want to examine from each hash and
  // store it with an index back into the hashed values (or the collision map).
  //
  // The result is that, `bits_and_indices[i].bits` has the hash bits of
  // interest from `hashes[bits_and_indices[i].index]`.
  //
  // And because `collision_map` above uses the same indices as `hashes`,
  // `collision_counts[collision_map[bits_and_indices[i].index]]` is the number
  // of collisions for `bits_and_indices[i].bits`.
  struct BitSequenceAndHashIndex {
    // The bit subsequence of a hash input, adjusted into the low bits.
    uint32_t bits;
    // The index of the hash input corresponding to this bit sequence.
    int index;
  };
  llvm::SmallVector<BitSequenceAndHashIndex> bits_and_indices;
  bits_and_indices.reserve(hashes.size());
  for (const auto& [hash, v] : hashes) {
    CARBON_DCHECK(v == hashes[bits_and_indices.size()].v);
    auto hash_bits = (static_cast<uint64_t>(hash) & BitMask) >> BitShift;
    bits_and_indices.push_back(
        {.bits = static_cast<uint32_t>(hash_bits),
         .index = static_cast<int>(bits_and_indices.size())});
  }
  // Now we sort by the extracted bit sequence so we can efficiently scan for
  // colliding bit patterns.
  std::sort(
      bits_and_indices.begin(), bits_and_indices.end(),
      [](const auto& lhs, const auto& rhs) { return lhs.bits < rhs.bits; });
  // Scan the sorted bit sequences we've extracted looking for collisions. We
  // count the total collisions, but we also track the number of individual
  // inputs that collide with each specific bit pattern.
  uint32_t prev_hash_bits = bits_and_indices[0].bits;
  int prev_index = bits_and_indices[0].index;
  bool in_collision = false;
  int total = 0;
  for (const auto& [hash_bits, hash_index] :
       llvm::ArrayRef(bits_and_indices).slice(1)) {
    // Check if we've found a new hash (and thus a new value), reset everything.
    CARBON_CHECK(hashes[prev_index].v != hashes[hash_index].v);
    if (hash_bits != prev_hash_bits) {
      CARBON_CHECK(hashes[prev_index].hash != hashes[hash_index].hash);
      prev_hash_bits = hash_bits;
      prev_index = hash_index;
      in_collision = false;
      continue;
    }
    // Otherwise, we have a colliding bit sequence.
    ++total;
    // If we've already created a collision count to track this, just increment
    // it and map this hash to it.
    if (in_collision) {
      ++collision_counts.back();
      collision_map[hash_index] = collision_counts.size() - 1;
      continue;
    }
    // If this is a new collision, create a dedicated count to track it and
    // begin counting.
    in_collision = true;
    collision_map[prev_index] = collision_counts.size();
    collision_map[hash_index] = collision_counts.size();
    collision_counts.push_back(1);
  }
  // Sort by collision count for each hash.
  std::sort(bits_and_indices.begin(), bits_and_indices.end(),
            [&](const auto& lhs, const auto& rhs) {
              return collision_counts[collision_map[lhs.index]] <
                     collision_counts[collision_map[rhs.index]];
            });
  // And compute the median and max. After the sort above, the middle element's
  // count is the median and the last element's count is the max.
  int median = collision_counts
      [collision_map[bits_and_indices[bits_and_indices.size() / 2].index]];
  int max = *std::max_element(collision_counts.begin(), collision_counts.end());
  CARBON_CHECK(max ==
               collision_counts[collision_map[bits_and_indices.back().index]]);
  return {.total = total, .median = median, .max = max};
}
  584. auto CheckNoDuplicateValues(llvm::ArrayRef<HashedString> hashes) -> void {
  585. for (int i = 0, size = hashes.size(); i < size - 1; ++i) {
  586. const auto& [_, value] = hashes[i];
  587. CARBON_CHECK(value != hashes[i + 1].v, "Duplicate value: {0}", value);
  588. }
  589. }
// Builds the list of all N-byte strings, hashes each one with `TestSeed`, and
// returns the list sorted by hash code. Also verifies the generator produced
// no duplicate inputs.
template <int N>
auto AllByteStringsHashedAndSorted() {
  static_assert(N < 5, "Can only generate all 4-byte strings or shorter.");
  llvm::SmallVector<HashedString> hashes;
  int64_t count = 1LL << (N * 8);
  for (int64_t i : llvm::seq(count)) {
    // Decompose `i` into N little-endian bytes to form the string.
    uint8_t bytes[N];
    for (int j : llvm::seq(N)) {
      bytes[j] = (static_cast<uint64_t>(i) >> (8 * j)) & 0xff;
    }
    std::string s(std::begin(bytes), std::end(bytes));
    hashes.push_back({HashValue(s, TestSeed), s});
  }
  // Sort by the 64-bit hash code so collision scans can work on neighbors.
  std::sort(hashes.begin(), hashes.end(),
            [](const HashedString& lhs, const HashedString& rhs) {
              return static_cast<uint64_t>(lhs.hash) <
                     static_cast<uint64_t>(rhs.hash);
            });
  CheckNoDuplicateValues(hashes);
  return hashes;
}
// Expects that a hash-sorted list of hashed strings contains no two entries
// with the same full 64-bit hash code, reporting any colliding inputs as hex
// bytes. Indexes `hashes[0]`, so callers must pass a non-empty list.
auto ExpectNoHashCollisions(llvm::ArrayRef<HashedString> hashes) -> void {
  HashCode prev_hash = hashes[0].hash;
  llvm::StringRef prev_s = hashes[0].v;
  for (const auto& [hash, s] : hashes.slice(1)) {
    if (hash != prev_hash) {
      // New hash value; remember it and keep scanning.
      prev_hash = hash;
      prev_s = s;
      continue;
    }
    FAIL() << "Colliding hash '" << hash << "' of strings "
           << ToHexBytes(prev_s) << " and " << ToHexBytes(s);
  }
}
  624. TEST(HashingTest, Collisions1ByteSized) {
  625. auto hashes_storage = AllByteStringsHashedAndSorted<1>();
  626. llvm::ArrayRef hashes = hashes_storage;
  627. ExpectNoHashCollisions(hashes);
  628. auto low_32bit_collisions = FindBitRangeCollisions<0, 32>(hashes);
  629. EXPECT_THAT(low_32bit_collisions.total, Eq(0));
  630. auto high_32bit_collisions = FindBitRangeCollisions<32, 64>(hashes);
  631. EXPECT_THAT(high_32bit_collisions.total, Eq(0));
  632. // We expect collisions when only looking at 7-bits of the hash. However,
  633. // modern hash table designs need to use either the low or high 7 bits as tags
  634. // for faster searching. So we add some direct testing that the median and max
  635. // collisions for any given key stay within bounds. We express the bounds in
  636. // terms of the minimum expected "perfect" rate of collisions if uniformly
  637. // distributed.
  638. int min_7bit_collisions = llvm::NextPowerOf2(hashes.size() - 1) / (1 << 7);
  639. auto low_7bit_collisions = FindBitRangeCollisions<0, 7>(hashes);
  640. EXPECT_THAT(low_7bit_collisions.median, Le(8 * min_7bit_collisions));
  641. EXPECT_THAT(low_7bit_collisions.max, Le(8 * min_7bit_collisions));
  642. auto high_7bit_collisions = FindBitRangeCollisions<64 - 7, 64>(hashes);
  643. EXPECT_THAT(high_7bit_collisions.median, Le(2 * min_7bit_collisions));
  644. EXPECT_THAT(high_7bit_collisions.max, Le(4 * min_7bit_collisions));
  645. }
  646. TEST(HashingTest, Collisions2ByteSized) {
  647. auto hashes_storage = AllByteStringsHashedAndSorted<2>();
  648. llvm::ArrayRef hashes = hashes_storage;
  649. ExpectNoHashCollisions(hashes);
  650. auto low_32bit_collisions = FindBitRangeCollisions<0, 32>(hashes);
  651. EXPECT_THAT(low_32bit_collisions.total, Eq(0));
  652. auto high_32bit_collisions = FindBitRangeCollisions<32, 64>(hashes);
  653. EXPECT_THAT(high_32bit_collisions.total, Eq(0));
  654. // Similar to 1-byte keys, we do expect a certain rate of collisions here but
  655. // bound the median and max.
  656. int min_7bit_collisions = llvm::NextPowerOf2(hashes.size() - 1) / (1 << 7);
  657. auto low_7bit_collisions = FindBitRangeCollisions<0, 7>(hashes);
  658. EXPECT_THAT(low_7bit_collisions.median, Le(2 * min_7bit_collisions));
  659. EXPECT_THAT(low_7bit_collisions.max, Le(2 * min_7bit_collisions));
  660. auto high_7bit_collisions = FindBitRangeCollisions<64 - 7, 64>(hashes);
  661. EXPECT_THAT(high_7bit_collisions.median, Le(2 * min_7bit_collisions));
  662. EXPECT_THAT(high_7bit_collisions.max, Le(2 * min_7bit_collisions));
  663. }
// Parameter ranges for `SparseHashTest` below: generate and hash all strings
// of [BeginByteCount, EndByteCount] bytes (both bounds inclusive -- the test
// iterates with `seq_inclusive`), with [BeginSetBitCount, EndSetBitCount]
// contiguous bits at each possible bit offset set to one and all other bits
// set to zero.
template <int BeginByteCount, int EndByteCount, int BeginSetBitCount,
          int EndSetBitCount>
struct SparseHashTestParamRanges {
  static_assert(BeginByteCount >= 0);
  static_assert(BeginByteCount < EndByteCount);
  static_assert(BeginSetBitCount >= 0);
  static_assert(BeginSetBitCount < EndSetBitCount);
  // Note that we intentionally allow the end-set-bit-count to result in more
  // set bits than are available -- we truncate the number of set bits to fit
  // within the byte string. The begin-set-bit-count, however, must fit within
  // the smallest string generated.
  static_assert(BeginSetBitCount <= BeginByteCount * 8);

  // The inclusive range of byte-string lengths to generate.
  struct ByteCount {
    static constexpr int Begin = BeginByteCount;
    static constexpr int End = EndByteCount;
  };

  // The inclusive range of contiguous set-bit run lengths to generate.
  struct SetBitCount {
    static constexpr int Begin = BeginSetBitCount;
    static constexpr int End = EndSetBitCount;
  };
};
// Typed test fixture for hashing "sparse" byte strings: strings that are all
// zero bits except for a single contiguous run of one bits. The string
// lengths and run lengths covered come from `ParamRanges`, an instantiation
// of `SparseHashTestParamRanges`.
template <typename ParamRanges>
struct SparseHashTest : ::testing::Test {
  using ByteCount = typename ParamRanges::ByteCount;
  using SetBitCount = typename ParamRanges::SetBitCount;

  // Builds every sparse byte string described by `ParamRanges`, hashes each
  // with `TestSeed`, and returns the collection sorted by hash value. Also
  // checks that no byte string was generated twice.
  static auto GetHashedByteStrings() {
    llvm::SmallVector<HashedString> hashes;
    for (int byte_count :
         llvm::seq_inclusive(ByteCount::Begin, ByteCount::End)) {
      int bits = byte_count * 8;
      // Clamp the run length so that it always fits within `byte_count`
      // bytes, per the note on `SparseHashTestParamRanges`.
      for (int set_bit_count : llvm::seq_inclusive(
               SetBitCount::Begin, std::min(bits, SetBitCount::End))) {
        if (set_bit_count == 0) {
          // A zero-length run has exactly one representative: the all-zero
          // string of this length.
          std::string s(byte_count, '\0');
          hashes.push_back({HashValue(s, TestSeed), std::move(s)});
          continue;
        }
        // Slide the run of set bits across every valid starting bit offset.
        for (int begin_set_bit : llvm::seq_inclusive(0, bits - set_bit_count)) {
          std::string s(byte_count, '\0');
          int begin_set_bit_byte_index = begin_set_bit / 8;
          int begin_set_bit_bit_index = begin_set_bit % 8;
          int end_set_bit_byte_index = (begin_set_bit + set_bit_count) / 8;
          int end_set_bit_bit_index = (begin_set_bit + set_bit_count) % 8;
          // We build a begin byte and end byte. We set the begin byte, set
          // subsequent bytes up to *and including* the end byte to all ones,
          // and then mask the end byte. For multi-byte runs, the mask just sets
          // the end byte and for single-byte runs the mask computes the
          // intersecting bits.
          //
          // Consider a 4-set-bit count, starting at bit 2. The begin bit index
          // is 2, and the end bit index is 6.
          //
          // Begin byte:  0b11111111 -(shl 2)-----> 0b11111100
          // End byte:    0b11111111 -(shr (8-6))-> 0b00111111
          // Masked byte:                           0b00111100
          //
          // Or a 10-set-bit-count starting at bit 2. The begin bit index is 2,
          // the end byte index is (12 / 8) or 1, and the end bit index is (12 %
          // 8) or 4.
          //
          // Begin byte:  0b11111111 -(shl 2)-----> 0b11111100 ->  6 bits
          // End byte:    0b11111111 -(shr (8-4))-> 0b00001111 ->  4 bits
          //                                                      10 total bits
          //
          // Note that when the run ends exactly on a byte boundary,
          // `end_set_bit_bit_index` is 0 and the unsigned shift by 8 below
          // leaves the end-byte mask zero.
          uint8_t begin_set_bit_byte = 0xFFU << begin_set_bit_bit_index;
          uint8_t end_set_bit_byte = 0xFFU >> (8 - end_set_bit_bit_index);
          bool has_end_byte_bits = end_set_bit_byte != 0;
          s[begin_set_bit_byte_index] = begin_set_bit_byte;
          // Fill the fully-set bytes between the begin byte and the end byte,
          // including the end byte itself when it has any bits (the mask
          // below then trims it).
          for (int i : llvm::seq(begin_set_bit_byte_index + 1,
                                 end_set_bit_byte_index + has_end_byte_bits)) {
            s[i] = '\xFF';
          }
          // If there are no bits set in the end byte, it may be past-the-end
          // and we can't even mask a zero byte safely.
          if (has_end_byte_bits) {
            s[end_set_bit_byte_index] &= end_set_bit_byte;
          }
          hashes.push_back({HashValue(s, TestSeed), std::move(s)});
        }
      }
    }
    std::sort(hashes.begin(), hashes.end(),
              [](const HashedString& lhs, const HashedString& rhs) {
                return static_cast<uint64_t>(lhs.hash) <
                       static_cast<uint64_t>(rhs.hash);
              });
    CheckNoDuplicateValues(hashes);
    return hashes;
  }
};
// The concrete parameter ranges to instantiate `SparseHashTest` with: single
// set bits across strings of up to 256 bytes, then progressively longer runs
// of set bits across progressively shorter strings.
using SparseHashTestParams = ::testing::Types<
    SparseHashTestParamRanges</*BeginByteCount=*/0, /*EndByteCount=*/256,
                              /*BeginSetBitCount=*/0, /*EndSetBitCount=*/1>,
    SparseHashTestParamRanges</*BeginByteCount=*/1, /*EndByteCount=*/128,
                              /*BeginSetBitCount=*/2, /*EndSetBitCount=*/4>,
    SparseHashTestParamRanges</*BeginByteCount=*/1, /*EndByteCount=*/64,
                              /*BeginSetBitCount=*/4, /*EndSetBitCount=*/16>>;
TYPED_TEST_SUITE(SparseHashTest, SparseHashTestParams);
  764. TYPED_TEST(SparseHashTest, Collisions) {
  765. auto hashes_storage = this->GetHashedByteStrings();
  766. llvm::ArrayRef hashes = hashes_storage;
  767. ExpectNoHashCollisions(hashes);
  768. int min_7bit_collisions = llvm::NextPowerOf2(hashes.size() - 1) / (1 << 7);
  769. auto low_7bit_collisions = FindBitRangeCollisions<0, 7>(hashes);
  770. EXPECT_THAT(low_7bit_collisions.median, Le(2 * min_7bit_collisions));
  771. EXPECT_THAT(low_7bit_collisions.max, Le(2 * min_7bit_collisions));
  772. auto high_7bit_collisions = FindBitRangeCollisions<64 - 7, 64>(hashes);
  773. EXPECT_THAT(high_7bit_collisions.median, Le(2 * min_7bit_collisions));
  774. EXPECT_THAT(high_7bit_collisions.max, Le(2 * min_7bit_collisions));
  775. }
  776. } // namespace
  777. } // namespace Carbon