source_gen.cpp 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "testing/base/source_gen.h"
  5. #include <numeric>
  6. #include "llvm/ADT/ArrayRef.h"
  7. #include "llvm/ADT/Sequence.h"
  8. #include "llvm/ADT/SmallVector.h"
  9. #include "llvm/ADT/StringExtras.h"
  10. #include "llvm/Support/FormatVariadic.h"
  11. #include "toolchain/lex/token_kind.h"
  12. namespace Carbon::Testing {
  13. auto SourceGen::Global() -> SourceGen& {
  14. static SourceGen global_gen;
  15. return global_gen;
  16. }
  17. SourceGen::SourceGen(Language language) : language_(language) {}
  18. // Heuristic numbers used in synthesizing various identifier sequences.
  19. constexpr static int MinClassNameLength = 5;
  20. constexpr static int MinMemberNameLength = 4;
  21. // The shuffled state used to generate some number of classes.
  22. //
  23. // This state encodes everything used to generate class definitions. The state
  24. // will be consumed until empty.
  25. //
  26. // Detailed comments for out-of-line methods are on their definitions.
  27. class SourceGen::ClassGenState {
  28. public:
  29. ClassGenState(SourceGen& gen, int num_classes,
  30. const ClassParams& class_params,
  31. const TypeUseParams& type_use_params);
  32. auto public_function_param_counts() -> llvm::SmallVectorImpl<int>& {
  33. return public_function_param_counts_;
  34. }
  35. auto public_method_param_counts() -> llvm::SmallVectorImpl<int>& {
  36. return public_method_param_counts_;
  37. }
  38. auto private_function_param_counts() -> llvm::SmallVectorImpl<int>& {
  39. return private_function_param_counts_;
  40. }
  41. auto private_method_param_counts() -> llvm::SmallVectorImpl<int>& {
  42. return private_method_param_counts_;
  43. }
  44. auto class_names() -> llvm::SmallVectorImpl<llvm::StringRef>& {
  45. return class_names_;
  46. }
  47. auto member_names() -> llvm::SmallVectorImpl<llvm::StringRef>& {
  48. return member_names_;
  49. }
  50. auto param_names() -> llvm::SmallVectorImpl<llvm::StringRef>& {
  51. return param_names_;
  52. }
  53. auto type_names() -> llvm::SmallVectorImpl<llvm::StringRef>& {
  54. return type_names_;
  55. }
  56. auto AddValidTypeName(llvm::StringRef type_name) -> void {
  57. valid_type_names_.Insert(type_name);
  58. }
  59. auto GetValidTypeName() -> llvm::StringRef;
  60. private:
  61. auto BuildClassAndTypeNames(SourceGen& gen, int num_classes, int num_types,
  62. const TypeUseParams& type_use_params) -> void;
  63. llvm::SmallVector<int> public_function_param_counts_;
  64. llvm::SmallVector<int> public_method_param_counts_;
  65. llvm::SmallVector<int> private_function_param_counts_;
  66. llvm::SmallVector<int> private_method_param_counts_;
  67. llvm::SmallVector<llvm::StringRef> class_names_;
  68. llvm::SmallVector<llvm::StringRef> member_names_;
  69. llvm::SmallVector<llvm::StringRef> param_names_;
  70. llvm::SmallVector<llvm::StringRef> type_names_;
  71. Set<llvm::StringRef> valid_type_names_;
  72. int last_type_name_index_ = 0;
  73. };
  74. // A helper to sum elements of a range.
  75. template <typename T>
  76. static auto Sum(const T& range) -> int {
  77. return std::accumulate(range.begin(), range.end(), 0);
  78. }
  79. // Given a number of class definitions and the params with which to generate
  80. // them, builds the state that will be used while generating that many classes.
  81. //
  82. // We build the state first and across all the class definitions that will be
  83. // generated so that we can distribute random components across all the
  84. // definitions.
  85. SourceGen::ClassGenState::ClassGenState(SourceGen& gen, int num_classes,
  86. const ClassParams& class_params,
  87. const TypeUseParams& type_use_params) {
  88. public_function_param_counts_ =
  89. gen.GetShuffledInts(num_classes * class_params.public_function_decls, 0,
  90. class_params.public_function_decl_params.max_params);
  91. public_method_param_counts_ =
  92. gen.GetShuffledInts(num_classes * class_params.public_method_decls, 0,
  93. class_params.public_method_decl_params.max_params);
  94. private_function_param_counts_ =
  95. gen.GetShuffledInts(num_classes * class_params.private_function_decls, 0,
  96. class_params.private_function_decl_params.max_params);
  97. private_method_param_counts_ =
  98. gen.GetShuffledInts(num_classes * class_params.private_method_decls, 0,
  99. class_params.private_method_decl_params.max_params);
  100. int num_members =
  101. num_classes *
  102. (class_params.public_function_decls + class_params.public_method_decls +
  103. class_params.private_function_decls + class_params.private_method_decls +
  104. class_params.private_field_decls);
  105. member_names_ = gen.GetShuffledIdentifiers(
  106. num_members, /*min_length=*/MinMemberNameLength);
  107. int num_params =
  108. Sum(public_function_param_counts_) + Sum(public_method_param_counts_) +
  109. Sum(private_function_param_counts_) + Sum(private_method_param_counts_);
  110. param_names_ = gen.GetShuffledIdentifiers(num_params);
  111. BuildClassAndTypeNames(gen, num_classes, num_members + num_params,
  112. type_use_params);
  113. }
  114. auto SourceGen::ClassGenState::GetValidTypeName() -> llvm::StringRef {
  115. // Check that we don't completely wrap the type names by tracking where we
  116. // started.
  117. int initial_last_type_name_index = last_type_name_index_;
  118. // Now search the type names, starting from the last used index, to find the
  119. // first valid name.
  120. for (;;) {
  121. if (last_type_name_index_ == 0) {
  122. last_type_name_index_ = type_names_.size();
  123. }
  124. --last_type_name_index_;
  125. llvm::StringRef& type_name = type_names_[last_type_name_index_];
  126. if (valid_type_names_.Contains(type_name)) {
  127. // Found a valid type name, swap it with the back and pop that off.
  128. std::swap(type_names_.back(), type_name);
  129. return type_names_.pop_back_val();
  130. }
  131. CARBON_CHECK(last_type_name_index_ != initial_last_type_name_index)
  132. << "Failed to find a valid type name with " << type_names_.size()
  133. << " candidates, an initial index of " << initial_last_type_name_index
  134. << ", and with " << class_names_.size() << " classes left to emit!";
  135. }
  136. }
  137. // Build both the class names this file will declare and a list of type
  138. // references to use throughout those classes.
  139. //
  140. // We combine a list of fixed types in the `type_use_params` with the list of
  141. // class names that will be defined to form the spelling of all the referenced
  142. // types. The `type_use_params` provides weights for each fixed type as well as
  143. // an overall weight for referencing class names that are being declared. We
  144. // build a set of type references so that its histogram will roughly match these
  145. // weights.
  146. //
  147. // For each of the fixed types, `type_use_params` provides a spelling for both
  148. // Carbon and C++.
  149. //
  150. // We distribute our references to declared class names evenly to the extent
  151. // possible.
  152. //
  153. // Before all the references are formed, the class names are kept their original
  154. // unshuffled order. This ensures that any uneven sampling of names is done
  155. // deterministically. At the end, we randomly shuffle the sequences of both the
  156. // declared class names and type references to provide an unpredictable order in
  157. // the generated output.
  158. auto SourceGen::ClassGenState::BuildClassAndTypeNames(
  159. SourceGen& gen, int num_classes, int num_types,
  160. const TypeUseParams& type_use_params) -> void {
  161. // Initially get the sequence of class names without shuffling so we can
  162. // compute our type name pool from them prior to any shuffling.
  163. class_names_ =
  164. gen.GetUniqueIdentifiers(num_classes, /*min_length=*/MinClassNameLength);
  165. type_names_.reserve(num_types);
  166. // Compute the sum of weights and pre-process the fixed types.
  167. int type_weight_sum = type_use_params.declared_types_weight;
  168. for (const auto& fixed_type_weight : type_use_params.fixed_type_weights) {
  169. type_weight_sum += fixed_type_weight.weight;
  170. // Add all the fixed type spellings as immediately valid.
  171. valid_type_names_.Insert(gen.IsCpp() ? fixed_type_weight.cpp_spelling
  172. : fixed_type_weight.carbon_spelling);
  173. }
  174. // Compute the number of declared types used. We expect to have a decent
  175. // number of repeated names, so we repeatedly append the entire sequence of
  176. // class names until there is some remainder of names needed.
  177. int num_declared_types =
  178. num_types * type_use_params.declared_types_weight / type_weight_sum;
  179. for ([[maybe_unused]] auto _ : llvm::seq(num_declared_types / num_classes)) {
  180. type_names_.append(class_names_.begin(), class_names_.end());
  181. }
  182. // Now append the remainder number of class names. This is where the class
  183. // names being un-shuffled is essential. We're going to have one extra
  184. // reference to some fraction of the class names and we want that to be a
  185. // stable subset.
  186. type_names_.append(class_names_.begin(),
  187. class_names_.begin() + (num_declared_types % num_classes));
  188. CARBON_CHECK(static_cast<int>(type_names_.size()) == num_declared_types);
  189. // Use each fixed type weight to append the expected number of copies of that
  190. // type. This isn't exact however, and is designed to stop short.
  191. for (const auto& fixed_type_weight : type_use_params.fixed_type_weights) {
  192. int num_fixed_type = num_types * fixed_type_weight.weight / type_weight_sum;
  193. type_names_.append(num_fixed_type, gen.IsCpp()
  194. ? fixed_type_weight.cpp_spelling
  195. : fixed_type_weight.carbon_spelling);
  196. }
  197. // If we need a tail of types to hit the exact number, simply round-robin
  198. // through the fixed types without any weighting. With reasonably large
  199. // numbers of types this won't distort the distribution in an interesting way
  200. // and is simpler than trying to scale the distribution down.
  201. while (static_cast<int>(type_names_.size()) < num_types) {
  202. for (const auto& fixed_type_weight :
  203. llvm::ArrayRef(type_use_params.fixed_type_weights)
  204. .take_front(num_types - type_names_.size())) {
  205. type_names_.push_back(gen.IsCpp() ? fixed_type_weight.cpp_spelling
  206. : fixed_type_weight.carbon_spelling);
  207. }
  208. }
  209. CARBON_CHECK(static_cast<int>(type_names_.size()) == num_types);
  210. last_type_name_index_ = num_types;
  211. // Now shuffle both the class names and the type names.
  212. std::shuffle(class_names_.begin(), class_names_.end(), gen.rng_);
  213. std::shuffle(type_names_.begin(), type_names_.end(), gen.rng_);
  214. }
  215. // Some heuristic numbers used when formatting generated code. These heuristics
  216. // are loosely based on what we expect to make Carbon code readable, and might
  217. // not fit as well in C++, but we use the same heuristics across languages for
  218. // simplicity and to make the output in different languages more directly
  219. // comparable.
  220. constexpr static int NumSingleLineFunctionParams = 3;
  221. constexpr static int NumSingleLineMethodParams = 2;
  222. constexpr static int MaxParamsPerLine = 4;
  223. static auto EstimateAvgFunctionDeclLines(SourceGen::FunctionDeclParams params)
  224. -> double {
  225. // Currently model a uniform distribution [0, max] parameters. Assume a line
  226. // break before the first parameter for >3 and after every 4th.
  227. int param_lines = 0;
  228. for (int num_params : llvm::seq_inclusive(0, params.max_params)) {
  229. if (num_params > NumSingleLineFunctionParams) {
  230. param_lines += (num_params + MaxParamsPerLine - 1) / MaxParamsPerLine;
  231. }
  232. }
  233. return 1.0 + static_cast<double>(param_lines) / (params.max_params + 1);
  234. }
  235. static auto EstimateAvgMethodDeclLines(SourceGen::MethodDeclParams params)
  236. -> double {
  237. // Currently model a uniform distribution [0, max] parameters. Assume a line
  238. // break before the first parameter for >2 and after every 4th.
  239. int param_lines = 0;
  240. for (int num_params : llvm::seq_inclusive(0, params.max_params)) {
  241. if (num_params > NumSingleLineMethodParams) {
  242. param_lines += (num_params + MaxParamsPerLine - 1) / MaxParamsPerLine;
  243. }
  244. }
  245. return 1.0 + static_cast<double>(param_lines) / (params.max_params + 1);
  246. }
  247. // Note that this should match the heuristics used when formatting.
  248. // TODO: See top-level TODO about line estimates and formatting.
  249. static auto EstimateAvgClassDefLines(SourceGen::ClassParams params) -> double {
  250. // Comment line, and class open line.
  251. double avg = 2.0;
  252. // One comment line and blank line per function, plus the function lines.
  253. avg +=
  254. (2.0 + EstimateAvgFunctionDeclLines(params.public_function_decl_params)) *
  255. params.public_function_decls;
  256. avg += (2.0 + EstimateAvgMethodDeclLines(params.public_method_decl_params)) *
  257. params.public_method_decls;
  258. avg += (2.0 +
  259. EstimateAvgFunctionDeclLines(params.private_function_decl_params)) *
  260. params.private_function_decls;
  261. avg += (2.0 + EstimateAvgMethodDeclLines(params.private_method_decl_params)) *
  262. params.private_method_decls;
  263. // A blank line and all the fields (if any).
  264. if (params.private_field_decls > 0) {
  265. avg += 1.0 + params.private_field_decls;
  266. }
  267. // No need to account for the class close line, we have an extra blank line
  268. // count for the last of the above.
  269. return avg;
  270. }
  271. auto SourceGen::GenAPIFileDenseDecls(int target_lines,
  272. const DenseDeclParams& params)
  273. -> std::string {
  274. std::string source;
  275. llvm::raw_string_ostream os(source);
  276. // Figure out how many classes fit in our target lines, each separated by a
  277. // blank line. We need to account the comment lines below to start the file.
  278. // Note that we want a blank line after our file comment block, so every class
  279. // needs a blank line.
  280. constexpr int NumFileCommentLines = 4;
  281. double avg_class_lines = EstimateAvgClassDefLines(params.class_params);
  282. CARBON_CHECK(target_lines > NumFileCommentLines + avg_class_lines)
  283. << "Not enough target lines to generate a single class!";
  284. int num_classes = static_cast<double>(target_lines - NumFileCommentLines) /
  285. (avg_class_lines + 1);
  286. int expected_lines =
  287. NumFileCommentLines + num_classes * (avg_class_lines + 1);
  288. os << "// Generated " << (!IsCpp() ? "Carbon" : "C++") << " source file.\n";
  289. os << llvm::formatv("// {0} target lines: {1} classes, {2} expected lines",
  290. target_lines, num_classes, expected_lines)
  291. << "\n";
  292. os << "//\n// Generating as an API file with dense declarations.\n";
  293. // Carbon uses an implicitly imported prelude to get builtin types, but C++
  294. // requires header files so include those.
  295. if (IsCpp()) {
  296. os << "\n";
  297. // Header for specific integer types like `std::int64_t`.
  298. os << "#include <cstdint>\n";
  299. // Header for `std::pair`.
  300. os << "#include <utility>\n";
  301. }
  302. auto class_gen_state = ClassGenState(*this, num_classes, params.class_params,
  303. params.type_use_params);
  304. for ([[maybe_unused]] int _ : llvm::seq(num_classes)) {
  305. os << "\n";
  306. GenerateClassDef(params.class_params, class_gen_state, os);
  307. }
  308. // Make sure we consumed all the state.
  309. CARBON_CHECK(class_gen_state.public_function_param_counts().empty());
  310. CARBON_CHECK(class_gen_state.public_method_param_counts().empty());
  311. CARBON_CHECK(class_gen_state.private_function_param_counts().empty());
  312. CARBON_CHECK(class_gen_state.private_method_param_counts().empty());
  313. CARBON_CHECK(class_gen_state.class_names().empty());
  314. CARBON_CHECK(class_gen_state.type_names().empty());
  315. return source;
  316. }
  317. auto SourceGen::GetShuffledIdentifiers(int number, int min_length,
  318. int max_length, bool uniform)
  319. -> llvm::SmallVector<llvm::StringRef> {
  320. llvm::SmallVector<llvm::StringRef> idents =
  321. GetIdentifiers(number, min_length, max_length, uniform);
  322. std::shuffle(idents.begin(), idents.end(), rng_);
  323. return idents;
  324. }
  325. auto SourceGen::GetShuffledUniqueIdentifiers(int number, int min_length,
  326. int max_length, bool uniform)
  327. -> llvm::SmallVector<llvm::StringRef> {
  328. CARBON_CHECK(min_length >= 4)
  329. << "Cannot trivially guarantee enough distinct, unique identifiers for "
  330. "lengths <= 3";
  331. llvm::SmallVector<llvm::StringRef> idents =
  332. GetUniqueIdentifiers(number, min_length, max_length, uniform);
  333. std::shuffle(idents.begin(), idents.end(), rng_);
  334. return idents;
  335. }
  336. auto SourceGen::GetIdentifiers(int number, int min_length, int max_length,
  337. bool uniform)
  338. -> llvm::SmallVector<llvm::StringRef> {
  339. llvm::SmallVector<llvm::StringRef> idents = GetIdentifiersImpl(
  340. number, min_length, max_length, uniform,
  341. [this](int length, int length_count,
  342. llvm::SmallVectorImpl<llvm::StringRef>& dest) {
  343. auto length_idents = GetSingleLengthIdentifiers(length, length_count);
  344. dest.append(length_idents.begin(), length_idents.end());
  345. });
  346. return idents;
  347. }
  348. auto SourceGen::GetUniqueIdentifiers(int number, int min_length, int max_length,
  349. bool uniform)
  350. -> llvm::SmallVector<llvm::StringRef> {
  351. CARBON_CHECK(min_length >= 4)
  352. << "Cannot trivially guarantee enough distinct, unique identifiers for "
  353. "lengths <= 3";
  354. llvm::SmallVector<llvm::StringRef> idents =
  355. GetIdentifiersImpl(number, min_length, max_length, uniform,
  356. [this](int length, int length_count,
  357. llvm::SmallVectorImpl<llvm::StringRef>& dest) {
  358. AppendUniqueIdentifiers(length, length_count, dest);
  359. });
  360. return idents;
  361. }
  362. auto SourceGen::GetSingleLengthIdentifiers(int length, int number)
  363. -> llvm::ArrayRef<llvm::StringRef> {
  364. llvm::SmallVector<llvm::StringRef>& idents =
  365. identifiers_by_length_.Insert(length, {}).value();
  366. if (static_cast<int>(idents.size()) < number) {
  367. idents.reserve(number);
  368. for ([[maybe_unused]] int _ : llvm::seq<int>(idents.size(), number)) {
  369. auto ident_storage =
  370. llvm::MutableArrayRef(reinterpret_cast<char*>(storage_.Allocate(
  371. /*Size=*/length, /*Alignment=*/1)),
  372. length);
  373. GenerateRandomIdentifier(ident_storage);
  374. llvm::StringRef new_id(ident_storage.data(), length);
  375. idents.push_back(new_id);
  376. }
  377. CARBON_CHECK(static_cast<int>(idents.size()) == number);
  378. }
  379. return llvm::ArrayRef(idents).slice(0, number);
  380. }
  381. static auto IdentifierStartChars() -> llvm::ArrayRef<char> {
  382. static llvm::SmallVector<char> chars = [] {
  383. llvm::SmallVector<char> chars;
  384. for (char c : llvm::seq_inclusive('A', 'Z')) {
  385. chars.push_back(c);
  386. }
  387. for (char c : llvm::seq_inclusive('a', 'z')) {
  388. chars.push_back(c);
  389. }
  390. return chars;
  391. }();
  392. return chars;
  393. }
  394. static auto IdentifierChars() -> llvm::ArrayRef<char> {
  395. static llvm::SmallVector<char> chars = [] {
  396. llvm::ArrayRef<char> start_chars = IdentifierStartChars();
  397. llvm::SmallVector<char> chars(start_chars.begin(), start_chars.end());
  398. chars.push_back('_');
  399. for (char c : llvm::seq_inclusive('0', '9')) {
  400. chars.push_back(c);
  401. }
  402. return chars;
  403. }();
  404. return chars;
  405. }
  406. constexpr static llvm::StringRef NonCarbonCppKeywords[] = {
  407. "asm", "do", "double", "float", "int", "long", "new", "signed",
  408. "std", "try", "unix", "unsigned", "xor", "NAN", "M_E", "M_PI",
  409. };
  410. // Returns a random identifier string of the specified length.
  411. //
  412. // Ensures this is a valid identifier, avoiding any overlapping syntaxes or
  413. // keywords both in Carbon and C++.
  414. //
  415. // This routine is somewhat expensive and so is useful to cache and reduce the
  416. // frequency of calls. However, each time it is called it computes a completely
  417. // new random identifier and so can be useful to eventually find a distinct
  418. // identifier when needed.
  419. auto SourceGen::GenerateRandomIdentifier(
  420. llvm::MutableArrayRef<char> dest_storage) -> void {
  421. llvm::ArrayRef<char> start_chars = IdentifierStartChars();
  422. llvm::ArrayRef<char> chars = IdentifierChars();
  423. llvm::StringRef ident(dest_storage.data(), dest_storage.size());
  424. do {
  425. dest_storage[0] =
  426. start_chars[absl::Uniform<int>(rng_, 0, start_chars.size())];
  427. for (int i : llvm::seq<int>(1, dest_storage.size())) {
  428. dest_storage[i] = chars[absl::Uniform<int>(rng_, 0, chars.size())];
  429. }
  430. } while (
  431. // TODO: Clean up and simplify this code. With some small refactorings and
  432. // post-processing we should be able to make this both easier to read and
  433. // less inefficient.
  434. llvm::any_of(
  435. Lex::TokenKind::KeywordTokens,
  436. [ident](auto token) { return ident == token.fixed_spelling(); }) ||
  437. llvm::is_contained(NonCarbonCppKeywords, ident) ||
  438. ident.ends_with("_t") || ident.ends_with("_MIN") ||
  439. ident.ends_with("_MAX") || ident.ends_with("_C") ||
  440. (llvm::is_contained({'i', 'u', 'f'}, ident[0]) &&
  441. llvm::all_of(ident.substr(1),
  442. [](const char c) { return llvm::isDigit(c); })));
  443. }
  444. // Appends a number of unique, random identifiers with a particular length to
  445. // the provided destination vector.
  446. //
  447. // Uses, and when necessary grows, a cached sequence of random identifiers with
  448. // the specified length. Because these are cached, this is efficient to call
  449. // repeatedly, but will not produce a different sequence of identifiers.
  450. auto SourceGen::AppendUniqueIdentifiers(
  451. int length, int number, llvm::SmallVectorImpl<llvm::StringRef>& dest)
  452. -> void {
  453. auto& [count, unique_idents] =
  454. unique_identifiers_by_length_.Insert(length, {}).value();
  455. // See if we need to grow our pool of unique identifiers with the requested
  456. // length.
  457. if (count < number) {
  458. // We'll need to insert exactly the requested new unique identifiers. All
  459. // our other inserts will find an existing entry.
  460. unique_idents.GrowForInsertCount(count - number);
  461. // Generate the needed number of identifiers.
  462. for ([[maybe_unused]] int _ : llvm::seq<int>(count, number)) {
  463. // Allocate stable storage for the identifier so we can form stable
  464. // `StringRef`s to it.
  465. auto ident_storage =
  466. llvm::MutableArrayRef(reinterpret_cast<char*>(storage_.Allocate(
  467. /*Size=*/length, /*Alignment=*/1)),
  468. length);
  469. // Repeatedly generate novel identifiers of this length until we find a
  470. // new unique one.
  471. for (;;) {
  472. GenerateRandomIdentifier(ident_storage);
  473. auto result =
  474. unique_idents.Insert(llvm::StringRef(ident_storage.data(), length));
  475. if (result.is_inserted()) {
  476. break;
  477. }
  478. }
  479. }
  480. count = number;
  481. }
  482. // Append all the identifiers directly out of the set. We make no guarantees
  483. // about the relative order so we just use the non-deterministic order of the
  484. // set and avoid additional storage.
  485. //
  486. // TODO: It's awkward the `ForEach` here can't early-exit. This just walks the
  487. // whole set which is harmless if inefficient. We should add early exiting
  488. // the loop support to `Set` and update this code.
  489. unique_idents.ForEach([&](llvm::StringRef ident) {
  490. if (number > 0) {
  491. dest.push_back(ident);
  492. --number;
  493. }
  494. });
  495. CARBON_CHECK(number == 0);
  496. }
  497. // An array of the counts that should be used for each identifier length to
  498. // produce our desired distribution.
  499. //
  500. // Note that the zero-based index corresponds to a 1-based length, so the count
  501. // for identifiers of length 1 is at index 0.
  502. static constexpr std::array<int, 64> IdentifierLengthCounts = [] {
  503. std::array<int, 64> ident_length_counts;
  504. // For non-uniform distribution, we simulate a distribution roughly based on
  505. // the observed histogram of identifier lengths, but smoothed a bit and
  506. // reduced to small counts so that we cycle through all the lengths
  507. // reasonably quickly. We want sampling of even 10% of NumTokens from this
  508. // in a round-robin form to not be skewed overly much. This still inherently
  509. // compresses the long tail as we'd rather have coverage even though it
  510. // distorts the distribution a bit.
  511. //
  512. // The distribution here comes from a script that analyzes source code run
  513. // over a few directories of LLVM. The script renders a visual ascii-art
  514. // histogram along with the data for each bucket, and that output is
  515. // included in comments above each bucket size below to help visualize the
  516. // rough shape we're aiming for.
  517. //
  518. // 1 characters [3976] ███████████████████████████████▊
  519. ident_length_counts[0] = 40;
  520. // 2 characters [3724] █████████████████████████████▊
  521. ident_length_counts[1] = 40;
  522. // 3 characters [4173] █████████████████████████████████▍
  523. ident_length_counts[2] = 40;
  524. // 4 characters [5000] ████████████████████████████████████████
  525. ident_length_counts[3] = 50;
  526. // 5 characters [1568] ████████████▌
  527. ident_length_counts[4] = 20;
  528. // 6 characters [2226] █████████████████▊
  529. ident_length_counts[5] = 20;
  530. // 7 characters [2380] ███████████████████
  531. ident_length_counts[6] = 20;
  532. // 8 characters [1786] ██████████████▎
  533. ident_length_counts[7] = 18;
  534. // 9 characters [1397] ███████████▏
  535. ident_length_counts[8] = 12;
  536. // 10 characters [ 739] █████▉
  537. ident_length_counts[9] = 12;
  538. // 11 characters [ 779] ██████▎
  539. ident_length_counts[10] = 12;
  540. // 12 characters [1344] ██████████▊
  541. ident_length_counts[11] = 12;
  542. // 13 characters [ 498] ████
  543. ident_length_counts[12] = 5;
  544. // 14 characters [ 284] ██▎
  545. ident_length_counts[13] = 3;
  546. // 15 characters [ 172] █▍
  547. // 16 characters [ 278] ██▎
  548. // 17 characters [ 191] █▌
  549. // 18 characters [ 207] █▋
  550. for (int i = 14; i < 18; ++i) {
  551. ident_length_counts[i] = 2;
  552. }
  553. // 19 - 63 characters are all <100 but non-zero, and we map them to 1 for
  554. // coverage despite slightly over weighting the tail.
  555. for (int i = 18; i < 64; ++i) {
  556. ident_length_counts[i] = 1;
  557. }
  558. return ident_length_counts;
  559. }();
  560. // A template function that implements the common logic of `GetIdentifiers` and
  561. // `GetUniqueIdentifiers`. Most parameters correspond to the parameters of those
  562. // functions. Additionally, an `AppendFunc` callable is provided to implement
  563. // the appending operation.
  564. //
  565. // The main functionality provided here is collecting the correct number of
  566. // identifiers from each of the lengths in the range [min_length, max_length]
  567. // and either in our default representative distribution or a uniform
  568. // distribution.
  569. auto SourceGen::GetIdentifiersImpl(int number, int min_length, int max_length,
  570. bool uniform,
  571. llvm::function_ref<AppendFn> append)
  572. -> llvm::SmallVector<llvm::StringRef> {
  573. CARBON_CHECK(min_length <= max_length);
  574. CARBON_CHECK(uniform || max_length <= 64)
  575. << "Cannot produce a meaningful non-uniform distribution of lengths "
  576. "longer than 64 as those are exceedingly rare in our observed data "
  577. "sets.";
  578. llvm::SmallVector<llvm::StringRef> idents;
  579. idents.reserve(number);
  580. // First, compute the total weight of the distribution so we know how many
  581. // identifiers we'll get each time we collect from it.
  582. int num_lengths = max_length - min_length + 1;
  583. auto length_counts =
  584. llvm::ArrayRef(IdentifierLengthCounts).slice(min_length - 1, num_lengths);
  585. int count_sum = uniform ? num_lengths : Sum(length_counts);
  586. CARBON_CHECK(count_sum >= 1);
  587. int number_rem = number % count_sum;
  588. // Finally, walk through each length in the distribution.
  589. for (int length : llvm::seq_inclusive(min_length, max_length)) {
  590. // Scale how many identifiers we want of this length if computing a
  591. // non-uniform distribution. For uniform, we always take one.
  592. int scale = uniform ? 1 : IdentifierLengthCounts[length - 1];
  593. // Now we can compute how many identifiers of this length to request.
  594. int length_count = (number / count_sum) * scale;
  595. if (number_rem > 0) {
  596. int rem_adjustment = std::min(scale, number_rem);
  597. length_count += rem_adjustment;
  598. number_rem -= rem_adjustment;
  599. }
  600. append(length, length_count, idents);
  601. }
  602. CARBON_CHECK(number_rem == 0)
  603. << "Unexpected number remaining: " << number_rem;
  604. CARBON_CHECK(static_cast<int>(idents.size()) == number)
  605. << "Ended up with " << idents.size()
  606. << " identifiers instead of the requested " << number;
  607. return idents;
  608. }
  609. // Returns a shuffled sequence of integers in the range [min, max].
  610. //
  611. // The order of the returned integers is random, but each integer in the range
  612. // appears the same number of times in the result, with the number of
  613. // appearances rounded up for lower numbers and rounded down for higher numbers
  614. // in order to exactly produce `number` results.
  615. auto SourceGen::GetShuffledInts(int number, int min, int max)
  616. -> llvm::SmallVector<int> {
  617. llvm::SmallVector<int> ints;
  618. ints.reserve(number);
  619. // Evenly distribute to each value between min and max.
  620. int num_values = max - min + 1;
  621. for (int i : llvm::seq_inclusive(min, max)) {
  622. int i_count = number / num_values;
  623. i_count += i < (min + (number % num_values));
  624. ints.append(i_count, i);
  625. }
  626. CARBON_CHECK(static_cast<int>(ints.size()) == number);
  627. std::shuffle(ints.begin(), ints.end(), rng_);
  628. return ints;
  629. }
  630. // A helper to pop series of unique identifiers off a sequence of random
  631. // identifiers that may have duplicates.
  632. //
  633. // This is particularly designed to work with the sequences of non-unique
  634. // identifiers produced by `GetShuffledIdentifiers` with the important property
  635. // that while popping off unique identifiers found in the shuffled list, we
  636. // don't change the distribution of identifier lengths.
  637. //
  638. // The uniqueness is only per-instance of the class, and so an instance can be
  639. // used to extract a series of names that share a scope.
  640. //
  641. // It works by scanning the sequence to extract each unique identifier found,
  642. // swapping it to the back and popping it off the list. This does shuffle the
  643. // order, but it isn't expected to do so in an interesting way.
  644. //
  645. // It also provides a fallback path in case there are no unique identifiers left
  646. // which computes fresh, random identifiers with the same length as the next one
  647. // in the sequence until a unique one is found.
  648. //
  649. // For simplicity of the fallback path, the lifetime of the identifiers produced
  650. // is bound to the lifetime of the popper instance, and not the generator as a
  651. // whole. If this is ever a problematic constraint, we can start copying
  652. // fallback identifiers into the generator's storage.
  653. class SourceGen::UniqueIdentifierPopper {
  654. public:
  655. explicit UniqueIdentifierPopper(SourceGen& gen,
  656. llvm::SmallVectorImpl<llvm::StringRef>& data)
  657. : gen_(&gen), data_(&data), it_(data_->rbegin()) {}
  658. // Pop the next unique identifier that can be found in the data, or synthesize
  659. // one with a valid length. Always consumes exactly one identifier from the
  660. // data.
  661. //
  662. // Note that the lifetime of the underlying identifier is that of the popper
  663. // and not the underlying data.
  664. auto Pop() -> llvm::StringRef {
  665. for (auto end = data_->rend(); it_ != end; ++it_) {
  666. auto insert = set_.Insert(*it_);
  667. if (!insert.is_inserted()) {
  668. continue;
  669. }
  670. if (it_ != data_->rbegin()) {
  671. std::swap(*data_->rbegin(), *it_);
  672. }
  673. CARBON_CHECK(insert.key() == data_->back());
  674. return data_->pop_back_val();
  675. }
  676. // Out of unique elements. Overwrite the back, preserving its length,
  677. // generating a new identifiers until we find a unique one and return that.
  678. // This ensures we continue to consume the structure and produce the same
  679. // size identifiers even in the fallback.
  680. int length = data_->pop_back_val().size();
  681. auto fallback_ident_storage =
  682. llvm::MutableArrayRef(reinterpret_cast<char*>(gen_->storage_.Allocate(
  683. /*Size=*/length, /*Alignment=*/1)),
  684. length);
  685. for (;;) {
  686. gen_->GenerateRandomIdentifier(fallback_ident_storage);
  687. auto fallback_id = llvm::StringRef(fallback_ident_storage.data(), length);
  688. if (set_.Insert(fallback_id).is_inserted()) {
  689. return fallback_id;
  690. }
  691. }
  692. }
  693. private:
  694. SourceGen* gen_;
  695. llvm::SmallVectorImpl<llvm::StringRef>* data_;
  696. llvm::SmallVectorImpl<llvm::StringRef>::reverse_iterator it_;
  697. Set<llvm::StringRef> set_;
  698. };
  699. // Generates a function declaration and writes it to the provided stream.
  700. //
  701. // The declaration can be configured with a function name, private modifier,
  702. // whether it is a method, the parameter count, an how indented it is.
  703. //
  704. // This is also provided a collection of identifiers to consume as parameter
  705. // names -- it will use a unique popper to extract unique parameter names from
  706. // this collection.
  707. auto SourceGen::GenerateFunctionDecl(
  708. llvm::StringRef name, bool is_private, bool is_method, int param_count,
  709. llvm::StringRef indent, llvm::SmallVectorImpl<llvm::StringRef>& param_names,
  710. llvm::function_ref<auto()->llvm::StringRef> get_type_name,
  711. llvm::raw_ostream& os) -> void {
  712. os << indent << "// TODO: make better comment text\n";
  713. if (!IsCpp()) {
  714. os << indent << (is_private ? "private " : "") << "fn " << name;
  715. if (is_method) {
  716. os << "[self: Self]";
  717. }
  718. } else {
  719. os << indent;
  720. if (!is_method) {
  721. os << "static ";
  722. }
  723. os << "auto " << name;
  724. }
  725. os << "(";
  726. if (param_count >
  727. (is_method ? NumSingleLineMethodParams : NumSingleLineFunctionParams)) {
  728. os << "\n" << indent << " ";
  729. }
  730. UniqueIdentifierPopper unique_param_names(*this, param_names);
  731. for (int i : llvm::seq(param_count)) {
  732. if (i > 0) {
  733. if ((i % MaxParamsPerLine) == 0) {
  734. os << ",\n" << indent << " ";
  735. } else {
  736. os << ", ";
  737. }
  738. }
  739. if (!IsCpp()) {
  740. os << unique_param_names.Pop() << ": " << get_type_name();
  741. } else {
  742. os << get_type_name() << " " << unique_param_names.Pop();
  743. }
  744. }
  745. os << ")";
  746. os << " -> " << get_type_name();
  747. os << ";\n";
  748. }
  749. // Generate a class definition and write it to the provided stream.
  750. //
  751. // The structure of the definition is guided by the `params` provided, and it
  752. // consumes the provided state.
  753. auto SourceGen::GenerateClassDef(const ClassParams& params,
  754. ClassGenState& state, llvm::raw_ostream& os)
  755. -> void {
  756. llvm::StringRef name = state.class_names().pop_back_val();
  757. os << "// TODO: make better comment text\n";
  758. os << "class " << name << " {\n";
  759. if (IsCpp()) {
  760. os << " public:\n";
  761. }
  762. // Field types can't be the class we're currently declaring. We enforce this
  763. // by collecting them before inserting that type into the valid set.
  764. llvm::SmallVector<llvm::StringRef> field_type_names;
  765. field_type_names.reserve(params.private_field_decls);
  766. for ([[maybe_unused]] int _ : llvm::seq(params.private_field_decls)) {
  767. field_type_names.push_back(state.GetValidTypeName());
  768. }
  769. // Mark this class as now a valid type now that field type names have been
  770. // collected. We can reference this class from functions and methods within
  771. // the definition.
  772. state.AddValidTypeName(name);
  773. UniqueIdentifierPopper unique_member_names(*this, state.member_names());
  774. llvm::ListSeparator line_sep("\n");
  775. for ([[maybe_unused]] int _ : llvm::seq(params.public_function_decls)) {
  776. os << line_sep;
  777. GenerateFunctionDecl(
  778. unique_member_names.Pop(), /*is_private=*/false,
  779. /*is_method=*/false,
  780. state.public_function_param_counts().pop_back_val(),
  781. /*indent=*/" ", state.param_names(),
  782. [&] { return state.GetValidTypeName(); }, os);
  783. }
  784. for ([[maybe_unused]] int _ : llvm::seq(params.public_method_decls)) {
  785. os << line_sep;
  786. GenerateFunctionDecl(
  787. unique_member_names.Pop(), /*is_private=*/false,
  788. /*is_method=*/true, state.public_method_param_counts().pop_back_val(),
  789. /*indent=*/" ", state.param_names(),
  790. [&] { return state.GetValidTypeName(); }, os);
  791. }
  792. if (IsCpp()) {
  793. os << "\n private:\n";
  794. // Reset the separator.
  795. line_sep = llvm::ListSeparator("\n");
  796. }
  797. for ([[maybe_unused]] int _ : llvm::seq(params.private_function_decls)) {
  798. os << line_sep;
  799. GenerateFunctionDecl(
  800. unique_member_names.Pop(), /*is_private=*/true,
  801. /*is_method=*/false,
  802. state.private_function_param_counts().pop_back_val(),
  803. /*indent=*/" ", state.param_names(),
  804. [&] { return state.GetValidTypeName(); }, os);
  805. }
  806. for ([[maybe_unused]] int _ : llvm::seq(params.private_method_decls)) {
  807. os << line_sep;
  808. GenerateFunctionDecl(
  809. unique_member_names.Pop(), /*is_private=*/true,
  810. /*is_method=*/true, state.private_method_param_counts().pop_back_val(),
  811. /*indent=*/" ", state.param_names(),
  812. [&] { return state.GetValidTypeName(); }, os);
  813. }
  814. os << line_sep;
  815. for (llvm::StringRef type_name : field_type_names) {
  816. if (!IsCpp()) {
  817. os << " private var " << unique_member_names.Pop() << ": " << type_name
  818. << ";\n";
  819. } else {
  820. os << " " << type_name << " " << unique_member_names.Pop() << ";\n";
  821. }
  822. }
  823. os << "}" << (IsCpp() ? ";" : "") << "\n";
  824. }
  825. } // namespace Carbon::Testing