source_gen.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #ifndef CARBON_TESTING_BASE_SOURCE_GEN_H_
  5. #define CARBON_TESTING_BASE_SOURCE_GEN_H_
  6. #include <string>
  7. #include "absl/random/random.h"
  8. #include "common/map.h"
  9. #include "common/set.h"
  10. #include "llvm/ADT/ArrayRef.h"
  11. #include "llvm/ADT/StringRef.h"
  12. #include "llvm/Support/Allocator.h"
  13. namespace Carbon::Testing {
  14. // Provides source code generation facilities.
  15. //
  16. // This class works to generate valid but random & meaningless source code in
  17. // interesting patterns for benchmarking. It is very incomplete. A high level
  18. // set of long-term goals:
  19. //
  20. // - Generate interesting patterns and structures of code that have emerged as
  21. // toolchain performance bottlenecks in practice in C++ codebases.
  22. // - Generate code that includes most Carbon language features (and whatever
  23. // reasonable C++ analogs could be used for comparative purposes):
  24. // - Functions
  25. // - Classes with class functions, methods, and fields
  26. // - Interfaces
  27. // - Checked generics and templates
  28. // - Nested and unnested impls
  29. // - Nested classes
  30. // - Inline and out-of-line function and method definitions
  31. // - Imports and exports
  32. // - API files and impl files.
  33. // - Be random but deterministic. The goal is benchmarking and so while this
  34. // code should strive for not producing trivially predictable patterns, it
  35. // should also strive to be consistent and suitable for benchmarking. Wherever
  36. // possible, it should permute the order and content without randomizing the
  37. // total count, size, or complexity.
  38. //
  39. // Note that the default and primary generation target is interesting Carbon
  40. // source code. We have a best-effort to alternatively generate comparable C++
  41. // constructs to the Carbon ones for comparative benchmarking, but there is no
  42. // goal to cover all the interesting C++ patterns we might want to benchmark,
  43. // and we don't aim for perfectly synthesizing C++ analogs. We can always drop
  44. // fidelity for the C++ code path if needed for simplicity.
  45. //
  46. // TODO: There are numerous places where we hard code a fixed quantity. Instead,
  47. // we should build a rich but general system to easily encode a discrete
  48. // distribution that is sampled. We have a specialized version of this for
  49. // identifiers that should be generalized.
  50. class SourceGen {
  51. public:
  52. enum class Language {
  53. Carbon,
  54. Cpp,
  55. };
  56. struct FunctionDeclParams {
  57. // TODD: Arbitrary default, should switch to a distribution from data.
  58. int max_params = 4;
  59. };
  60. struct MethodDeclParams {
  61. // TODD: Arbitrary default, should switch to a distribution from data.
  62. int max_params = 4;
  63. };
  64. // Parameters used to generate a class in a generated file.
  65. //
  66. // Currently, this uses a fixed number of each kind of declaration, with
  67. // arbitrary defaults chosen. The defaults currently skew towards large
  68. // classes with lots of nested declarations.
  69. // TODO: Switch these to distributions based on data.
  70. //
  71. // TODO: Add support for generating definitions and parameters to control
  72. // them.
  73. struct ClassParams {
  74. int public_function_decls = 4;
  75. FunctionDeclParams public_function_decl_params = {.max_params = 8};
  76. int public_method_decls = 10;
  77. MethodDeclParams public_method_decl_params;
  78. int private_function_decls = 2;
  79. FunctionDeclParams private_function_decl_params = {.max_params = 6};
  80. int private_method_decls = 8;
  81. MethodDeclParams private_method_decl_params = {.max_params = 6};
  82. int private_field_decls = 6;
  83. };
  84. // Parameters used to generate a file with dense declarations.
  85. struct DenseDeclParams {
  86. // TODO: Add more parameters to control generating top-level constructs
  87. // other than class definitions.
  88. // Parameters used when generating class definitions.
  89. ClassParams class_params = {};
  90. };
  91. // Access a global instance of this type to generate Carbon code for
  92. // benchmarks, tests, or other places where sharing a common instance is
  93. // useful. Note that there is nothing thread safe about this instance or type.
  94. static auto Global() -> SourceGen&;
  95. // Construct a source generator for the provided language, by default Carbon.
  96. explicit SourceGen(Language language = Language::Carbon);
  97. // Generate an API file with dense classes containing function forward
  98. // declarations.
  99. //
  100. // Accepts a number of `target_lines` for the resulting source code. This is a
  101. // rough approximation used to scale all the other constructs up and down
  102. // accordingly. For C++ source generation, we work to generate the same number
  103. // of constructs as Carbon would for the given line count over keeping the
  104. // actual line count close to the target.
  105. //
  106. // TODO: Currently, the formatting and line breaks of generating code are
  107. // extremely rough still, and those are a large factor in adherence to
  108. // `target_lines`. Long term, the goal is to get as close as we can to any
  109. // automatically formatted code while still keeping the stability of
  110. // benchmarking.
  111. auto GenAPIFileDenseDecls(int target_lines, DenseDeclParams params)
  112. -> std::string;
  113. // Get some number of randomly shuffled identifiers.
  114. //
  115. // The identifiers start with a character [A-Za-z], other characters may also
  116. // include [0-9_]. Both Carbon and C++ keywords are excluded along with any
  117. // other non-identifier syntaxes that overlap to ensure all of these can be
  118. // used as identifiers.
  119. //
  120. // The order will be different for each call to this function, but the
  121. // specific identifiers may remain the same in order to reduce the cost of
  122. // repeated calls. However, the sum of the identifier sizes returned is
  123. // guaranteed to be the same for every call with the same number of
  124. // identifiers so that benchmarking all of these identifiers has predictable
  125. // and stable cost.
  126. //
  127. // Optionally, callers can request a minimum and maximum length. By default,
  128. // the length distribution used across the identifiers will mirror the
  129. // observed distribution of identifiers in C++ source code and our expectation
  130. // of them in Carbon source code. The maximum length in this default
  131. // distribution cannot be more than 64.
  132. //
  133. // Callers can request a uniform distribution across [min_length, max_length],
  134. // and when it is requested there is no limit on `max_length`.
  135. auto GetShuffledIdentifiers(int number, int min_length = 1,
  136. int max_length = 64, bool uniform = false)
  137. -> llvm::SmallVector<llvm::StringRef>;
  138. // Same as `GetShuffledIdentifiers`, but ensures there are no collisions.
  139. auto GetShuffledUniqueIdentifiers(int number, int min_length = 4,
  140. int max_length = 64, bool uniform = false)
  141. -> llvm::SmallVector<llvm::StringRef>;
  142. // Returns a collection of un-shuffled identifiers, otherwise the same as
  143. // `GetShuffledIdentifiers`.
  144. //
  145. // Usually, benchmarks should use the shuffled version. However, this is
  146. // useful when there is already a post-processing step to shuffle things as it
  147. // is *dramatically* more efficient, especially in debug builds.
  148. auto GetIdentifiers(int number, int min_length = 1, int max_length = 64,
  149. bool uniform = false)
  150. -> llvm::SmallVector<llvm::StringRef>;
  151. // Returns a collection of un-shuffled unique identifiers, otherwise the same
  152. // as `GetShuffledUniqueIdentifiers`.
  153. //
  154. // Usually, benchmarks should use the shuffled version. However, this is
  155. // useful when there is already a post-processing step to shuffle things.
  156. auto GetUniqueIdentifiers(int number, int min_length = 1, int max_length = 64,
  157. bool uniform = false)
  158. -> llvm::SmallVector<llvm::StringRef>;
  159. // Returns a shared collection of random identifiers of a specific length.
  160. //
  161. // For a single, exact length, we have an even cheaper routine to return
  162. // access to a shared collection of identifiers. The order of these is a
  163. // single fixed random order for a given execution. The returned array
  164. // reference is only valid until the next call any method on this generator.
  165. auto GetSingleLengthIdentifiers(int length, int number)
  166. -> llvm::ArrayRef<llvm::StringRef>;
  167. private:
  168. // The shuffled state used to generate some number of classes.
  169. //
  170. // This state encodes all the shuffled entropy used for generating a number of
  171. // class definitions. While generating definitions, the state here will be
  172. // consumed until empty.
  173. struct ClassGenState {
  174. llvm::SmallVector<int> public_function_param_counts;
  175. llvm::SmallVector<int> public_method_param_counts;
  176. llvm::SmallVector<int> private_function_param_counts;
  177. llvm::SmallVector<int> private_method_param_counts;
  178. llvm::SmallVector<llvm::StringRef> class_names;
  179. llvm::SmallVector<llvm::StringRef> member_names;
  180. llvm::SmallVector<llvm::StringRef> param_names;
  181. };
  182. class UniqueIdentifierPopper;
  183. friend UniqueIdentifierPopper;
  184. using AppendFn = auto(int length, int number,
  185. llvm::SmallVectorImpl<llvm::StringRef>& dest) -> void;
  186. auto IsCpp() -> bool { return language_ == Language::Cpp; }
  187. auto GenerateRandomIdentifier(llvm::MutableArrayRef<char> dest_storage)
  188. -> void;
  189. auto AppendUniqueIdentifiers(int length, int number,
  190. llvm::SmallVectorImpl<llvm::StringRef>& dest)
  191. -> void;
  192. auto GetIdentifiersImpl(int number, int min_length, int max_length,
  193. bool uniform, llvm::function_ref<AppendFn> append)
  194. -> llvm::SmallVector<llvm::StringRef>;
  195. auto GetShuffledInts(int number, int min, int max) -> llvm::SmallVector<int>;
  196. auto GetClassGenState(int number, ClassParams params) -> ClassGenState;
  197. auto GenerateFunctionDecl(llvm::StringRef name, bool is_private,
  198. bool is_method, int param_count,
  199. llvm::StringRef indent,
  200. llvm::SmallVectorImpl<llvm::StringRef>& param_names,
  201. llvm::raw_ostream& os) -> void;
  202. auto GenerateClassDef(const ClassParams& params, ClassGenState& state,
  203. llvm::raw_ostream& os) -> void;
  204. absl::BitGen rng_;
  205. llvm::BumpPtrAllocator storage_;
  206. Map<int, llvm::SmallVector<llvm::StringRef>> identifiers_by_length_;
  207. Map<int, std::pair<int, Set<llvm::StringRef>>> unique_identifiers_by_length_;
  208. Language language_;
  209. };
  210. } // namespace Carbon::Testing
  211. #endif // CARBON_TESTING_BASE_SOURCE_GEN_H_