// Part of the Carbon Language project, under the Apache License v2.0 with LLVM // Exceptions. See /LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "testing/base/source_gen.h" #include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/FormatVariadic.h" #include "toolchain/lex/token_kind.h" namespace Carbon::Testing { auto SourceGen::Global() -> SourceGen& { static SourceGen global_gen; return global_gen; } SourceGen::SourceGen(Language language) : language_(language) {} // Some heuristic numbers used when formatting generated code. These heuristics // are loosely based on what we expect to make Carbon code readable, and might // not fit as well in C++, but we use the same heuristics across languages for // simplicity and to make the output in different languages more directly // comparable. constexpr static int NumSingleLineFunctionParams = 3; constexpr static int NumSingleLineMethodParams = 2; constexpr static int MaxParamsPerLine = 4; static auto EstimateAvgFunctionDeclLines(SourceGen::FunctionDeclParams params) -> double { // Currently model a uniform distribution [0, max] parameters. Assume a line // break before the first parameter for >3 and after every 4th. int param_lines = 0; for (int num_params : llvm::seq_inclusive(0, params.max_params)) { if (num_params > NumSingleLineFunctionParams) { param_lines += (num_params + MaxParamsPerLine - 1) / MaxParamsPerLine; } } return 1.0 + static_cast(param_lines) / (params.max_params + 1); } static auto EstimateAvgMethodDeclLines(SourceGen::MethodDeclParams params) -> double { // Currently model a uniform distribution [0, max] parameters. Assume a line // break before the first parameter for >2 and after every 4th. int param_lines = 0; for (int num_params : llvm::seq_inclusive(0, params.max_params)) { if (num_params > NumSingleLineMethodParams) { param_lines += (num_params + MaxParamsPerLine - 1) / MaxParamsPerLine; } } return 1.0 + static_cast(param_lines) / (params.max_params + 1); } // Note that this should match the heuristics used when formatting. // TODO: See top-level TODO about line estimates and formatting. static auto EstimateAvgClassDefLines(SourceGen::ClassParams params) -> double { // Comment line, and class open line. double avg = 2.0; // One comment line and blank line per function, plus the function lines. avg += (2.0 + EstimateAvgFunctionDeclLines(params.public_function_decl_params)) * params.public_function_decls; avg += (2.0 + EstimateAvgMethodDeclLines(params.public_method_decl_params)) * params.public_method_decls; avg += (2.0 + EstimateAvgFunctionDeclLines(params.private_function_decl_params)) * params.private_function_decls; avg += (2.0 + EstimateAvgMethodDeclLines(params.private_method_decl_params)) * params.private_method_decls; // A blank line and all the fields (if any). if (params.private_field_decls > 0) { avg += 1.0 + params.private_field_decls; } // No need to account for the class close line, we have an extra blank line // count for the last of the above. return avg; } auto SourceGen::GenAPIFileDenseDecls(int target_lines, DenseDeclParams params) -> std::string { std::string source; llvm::raw_string_ostream os(source); // Figure out how many classes fit in our target lines, each separated by a // blank line. We need to account the comment lines below to start the file. // Note that we want a blank line after our file comment block, so every class // needs a blank line. constexpr int NumFileCommentLines = 4; double avg_class_lines = EstimateAvgClassDefLines(params.class_params); CARBON_CHECK(target_lines > NumFileCommentLines + avg_class_lines) << "Not enough target lines to generate a single class!"; int num_classes = static_cast(target_lines - NumFileCommentLines) / (avg_class_lines + 1); int expected_lines = NumFileCommentLines + num_classes * (avg_class_lines + 1); os << "// Generated " << (!IsCpp() ? "Carbon" : "C++") << " source file.\n"; os << llvm::formatv("// {0} target lines: {1} classes, {2} expected lines", target_lines, num_classes, expected_lines) << "\n"; os << "//\n// Generating as an API file with dense declarations.\n"; auto class_gen_state = GetClassGenState(num_classes, params.class_params); for ([[maybe_unused]] int _ : llvm::seq(num_classes)) { os << "\n"; GenerateClassDef(params.class_params, class_gen_state, os); } // Make sure we consumed all the state. CARBON_CHECK(class_gen_state.public_function_param_counts.empty()); CARBON_CHECK(class_gen_state.public_method_param_counts.empty()); CARBON_CHECK(class_gen_state.private_function_param_counts.empty()); CARBON_CHECK(class_gen_state.private_method_param_counts.empty()); CARBON_CHECK(class_gen_state.class_names.empty()); return source; } auto SourceGen::GetShuffledIdentifiers(int number, int min_length, int max_length, bool uniform) -> llvm::SmallVector { llvm::SmallVector idents = GetIdentifiers(number, min_length, max_length, uniform); std::shuffle(idents.begin(), idents.end(), rng_); return idents; } auto SourceGen::GetShuffledUniqueIdentifiers(int number, int min_length, int max_length, bool uniform) -> llvm::SmallVector { CARBON_CHECK(min_length >= 4) << "Cannot trivially guarantee enough distinct, unique identifiers for " "lengths <= 3"; llvm::SmallVector idents = GetUniqueIdentifiers(number, min_length, max_length, uniform); std::shuffle(idents.begin(), idents.end(), rng_); return idents; } auto SourceGen::GetIdentifiers(int number, int min_length, int max_length, bool uniform) -> llvm::SmallVector { llvm::SmallVector idents = GetIdentifiersImpl( number, min_length, max_length, uniform, [this](int length, int length_count, llvm::SmallVectorImpl& dest) { auto length_idents = GetSingleLengthIdentifiers(length, length_count); dest.append(length_idents.begin(), length_idents.end()); }); return idents; } auto SourceGen::GetUniqueIdentifiers(int number, int min_length, int max_length, bool uniform) -> llvm::SmallVector { CARBON_CHECK(min_length >= 4) << "Cannot trivially guarantee enough distinct, unique identifiers for " "lengths <= 3"; llvm::SmallVector idents = GetIdentifiersImpl(number, min_length, max_length, uniform, [this](int length, int length_count, llvm::SmallVectorImpl& dest) { AppendUniqueIdentifiers(length, length_count, dest); }); return idents; } auto SourceGen::GetSingleLengthIdentifiers(int length, int number) -> llvm::ArrayRef { llvm::SmallVector& idents = identifiers_by_length_.Insert(length, {}).value(); if (static_cast(idents.size()) < number) { idents.reserve(number); for ([[maybe_unused]] int _ : llvm::seq(idents.size(), number)) { auto ident_storage = llvm::MutableArrayRef(reinterpret_cast(storage_.Allocate( /*Size=*/length, /*Alignment=*/1)), length); GenerateRandomIdentifier(ident_storage); llvm::StringRef new_id(ident_storage.data(), length); idents.push_back(new_id); } CARBON_CHECK(static_cast(idents.size()) == number); } return llvm::ArrayRef(idents).slice(0, number); } static auto IdentifierStartChars() -> llvm::ArrayRef { static llvm::SmallVector chars = [] { llvm::SmallVector chars; for (char c : llvm::seq_inclusive('A', 'Z')) { chars.push_back(c); } for (char c : llvm::seq_inclusive('a', 'z')) { chars.push_back(c); } return chars; }(); return chars; } static auto IdentifierChars() -> llvm::ArrayRef { static llvm::SmallVector chars = [] { llvm::ArrayRef start_chars = IdentifierStartChars(); llvm::SmallVector chars(start_chars.begin(), start_chars.end()); chars.push_back('_'); for (char c : llvm::seq_inclusive('0', '9')) { chars.push_back(c); } return chars; }(); return chars; } constexpr static llvm::StringRef NonCarbonCppKeywords[] = { "asm", "do", "double", "float", "int", "long", "new", "signed", "try", "unix", "unsigned", "xor", }; // Returns a random identifier string of the specified length. // // Ensures this is a valid identifier, avoiding any overlapping syntaxes or // keywords both in Carbon and C++. // // This routine is somewhat expensive and so is useful to cache and reduce the // frequency of calls. However, each time it is called it computes a completely // new random identifier and so can be useful to eventually find a distinct // identifier when needed. auto SourceGen::GenerateRandomIdentifier( llvm::MutableArrayRef dest_storage) -> void { llvm::ArrayRef start_chars = IdentifierStartChars(); llvm::ArrayRef chars = IdentifierChars(); auto ident = llvm::StringRef(dest_storage.data(), dest_storage.size()); do { dest_storage[0] = start_chars[absl::Uniform(rng_, 0, start_chars.size())]; for (int i : llvm::seq(1, dest_storage.size())) { dest_storage[i] = chars[absl::Uniform(rng_, 0, chars.size())]; } } while ( // TODO: Clean up and simplify this code. With some small refactorings and // post-processing we should be able to make this both easier to read and // less inefficient. llvm::any_of( Lex::TokenKind::KeywordTokens, [ident](auto token) { return ident == token.fixed_spelling(); }) || llvm::is_contained(NonCarbonCppKeywords, ident) || (llvm::is_contained({'i', 'u', 'f'}, ident[0]) && llvm::all_of(ident.substr(1), [](const char c) { return llvm::isDigit(c); }))); } // Appends a number of unique, random identifiers with a particular length to // the provided destination vector. // // Uses, and when necessary grows, a cached sequence of random identifiers with // the specified length. Because these are cached, this is efficient to call // repeatedly, but will not produce a different sequence of identifiers. auto SourceGen::AppendUniqueIdentifiers( int length, int number, llvm::SmallVectorImpl& dest) -> void { auto& [count, unique_idents] = unique_identifiers_by_length_.Insert(length, {}).value(); // See if we need to grow our pool of unique identifiers with the requested // length. if (count < number) { // We'll need to insert exactly the requested new unique identifiers. All // our other inserts will find an existing entry. unique_idents.GrowForInsertCount(count - number); // Generate the needed number of identifiers. for ([[maybe_unused]] int _ : llvm::seq(count, number)) { // Allocate stable storage for the identifier so we can form stable // `StringRef`s to it. auto ident_storage = llvm::MutableArrayRef(reinterpret_cast(storage_.Allocate( /*Size=*/length, /*Alignment=*/1)), length); // Repeatedly generate novel identifiers of this length until we find a // new unique one. for (;;) { GenerateRandomIdentifier(ident_storage); auto result = unique_idents.Insert(llvm::StringRef(ident_storage.data(), length)); if (result.is_inserted()) { break; } } } count = number; } // Append all the identifiers directly out of the set. We make no guarantees // about the relative order so we just use the non-deterministic order of the // set and avoid additional storage. // // TODO: It's awkward the `ForEach` here can't early-exit. This just walks the // whole set which is harmless if inefficient. We should add early exiting // the loop support to `Set` and update this code. unique_idents.ForEach([&](llvm::StringRef ident) { if (number > 0) { dest.push_back(ident); --number; } }); CARBON_CHECK(number == 0); } // An array of the counts that should be used for each identifier length to // produce our desired distribution. // // Note that the zero-based index corresponds to a 1-based length, so the count // for identifiers of length 1 is at index 0. static constexpr std::array IdentifierLengthCounts = [] { std::array ident_length_counts; // For non-uniform distribution, we simulate a distribution roughly based on // the observed histogram of identifier lengths, but smoothed a bit and // reduced to small counts so that we cycle through all the lengths // reasonably quickly. We want sampling of even 10% of NumTokens from this // in a round-robin form to not be skewed overly much. This still inherently // compresses the long tail as we'd rather have coverage even though it // distorts the distribution a bit. // // The distribution here comes from a script that analyzes source code run // over a few directories of LLVM. The script renders a visual ascii-art // histogram along with the data for each bucket, and that output is // included in comments above each bucket size below to help visualize the // rough shape we're aiming for. // // 1 characters [3976] ███████████████████████████████▊ ident_length_counts[0] = 40; // 2 characters [3724] █████████████████████████████▊ ident_length_counts[1] = 40; // 3 characters [4173] █████████████████████████████████▍ ident_length_counts[2] = 40; // 4 characters [5000] ████████████████████████████████████████ ident_length_counts[3] = 50; // 5 characters [1568] ████████████▌ ident_length_counts[4] = 20; // 6 characters [2226] █████████████████▊ ident_length_counts[5] = 20; // 7 characters [2380] ███████████████████ ident_length_counts[6] = 20; // 8 characters [1786] ██████████████▎ ident_length_counts[7] = 18; // 9 characters [1397] ███████████▏ ident_length_counts[8] = 12; // 10 characters [ 739] █████▉ ident_length_counts[9] = 12; // 11 characters [ 779] ██████▎ ident_length_counts[10] = 12; // 12 characters [1344] ██████████▊ ident_length_counts[11] = 12; // 13 characters [ 498] ████ ident_length_counts[12] = 5; // 14 characters [ 284] ██▎ ident_length_counts[13] = 3; // 15 characters [ 172] █▍ // 16 characters [ 278] ██▎ // 17 characters [ 191] █▌ // 18 characters [ 207] █▋ for (int i = 14; i < 18; ++i) { ident_length_counts[i] = 2; } // 19 - 63 characters are all <100 but non-zero, and we map them to 1 for // coverage despite slightly over weighting the tail. for (int i = 18; i < 64; ++i) { ident_length_counts[i] = 1; } return ident_length_counts; }(); // A helper to sum elements of a range. template static auto Sum(const T& range) -> int { return std::accumulate(range.begin(), range.end(), 0); } // A template function that implements the common logic of `GetIdentifiers` and // `GetUniqueIdentifiers`. Most parameters correspond to the parameters of those // functions. Additionally, an `AppendFunc` callable is provided to implement // the appending operation. // // The main functionality provided here is collecting the correct number of // identifiers from each of the lengths in the range [min_length, max_length] // and either in our default representative distribution or a uniform // distribution. auto SourceGen::GetIdentifiersImpl(int number, int min_length, int max_length, bool uniform, llvm::function_ref append) -> llvm::SmallVector { CARBON_CHECK(min_length <= max_length); CARBON_CHECK(uniform || max_length <= 64) << "Cannot produce a meaningful non-uniform distribution of lengths " "longer than 64 as those are exceedingly rare in our observed data " "sets."; llvm::SmallVector idents; idents.reserve(number); // First, compute the total weight of the distribution so we know how many // identifiers we'll get each time we collect from it. int num_lengths = max_length - min_length + 1; auto length_counts = llvm::ArrayRef(IdentifierLengthCounts).slice(min_length - 1, num_lengths); int count_sum = uniform ? num_lengths : Sum(length_counts); CARBON_CHECK(count_sum >= 1); int number_rem = number % count_sum; // Finally, walk through each length in the distribution. for (int length : llvm::seq_inclusive(min_length, max_length)) { // Scale how many identifiers we want of this length if computing a // non-uniform distribution. For uniform, we always take one. int scale = uniform ? 1 : IdentifierLengthCounts[length - 1]; // Now we can compute how many identifiers of this length to request. int length_count = (number / count_sum) * scale; if (number_rem > 0) { int rem_adjustment = std::min(scale, number_rem); length_count += rem_adjustment; number_rem -= rem_adjustment; } append(length, length_count, idents); } CARBON_CHECK(number_rem == 0) << "Unexpected number remaining: " << number_rem; CARBON_CHECK(static_cast(idents.size()) == number) << "Ended up with " << idents.size() << " identifiers instead of the requested " << number; return idents; } // Returns a shuffled sequence of integers in the range [min, max]. // // The order of the returned integers is random, but each integer in the range // appears the same number of times in the result, with the number of // appearances rounded up for lower numbers and rounded down for higher numbers // in order to exactly produce `number` results. auto SourceGen::GetShuffledInts(int number, int min, int max) -> llvm::SmallVector { llvm::SmallVector ints; ints.reserve(number); // Evenly distribute to each value between min and max. int num_values = max - min + 1; for (int i : llvm::seq_inclusive(min, max)) { int i_count = number / num_values; i_count += i < (min + (number % num_values)); ints.append(i_count, i); } CARBON_CHECK(static_cast(ints.size()) == number); std::shuffle(ints.begin(), ints.end(), rng_); return ints; } // Given a number of class definitions and the params with which to generate // them, builds the state that will be used while generating that many classes. // // We build the state first and across all the class definitions that will be // generated so that we can distribute random components across all the // definitions. auto SourceGen::GetClassGenState(int number, ClassParams params) -> ClassGenState { ClassGenState state; state.public_function_param_counts = GetShuffledInts(number * params.public_function_decls, 0, params.public_function_decl_params.max_params); state.public_method_param_counts = GetShuffledInts(number * params.public_method_decls, 0, params.public_method_decl_params.max_params); state.private_function_param_counts = GetShuffledInts(number * params.private_function_decls, 0, params.private_function_decl_params.max_params); state.private_method_param_counts = GetShuffledInts(number * params.private_method_decls, 0, params.private_method_decl_params.max_params); state.class_names = GetShuffledUniqueIdentifiers(number, /*min_length=*/5); int num_members = number * (params.public_function_decls + params.public_method_decls + params.private_function_decls + params.private_method_decls + params.private_field_decls); state.member_names = GetShuffledIdentifiers(num_members, /*min_length=*/4); int num_params = Sum(state.public_function_param_counts) + Sum(state.public_method_param_counts) + Sum(state.private_function_param_counts) + Sum(state.private_method_param_counts); state.param_names = GetShuffledIdentifiers(num_params); return state; } // A helper to pop series of unique identifiers off a sequence of random // identifiers that may have duplicates. // // This is particularly designed to work with the sequences of non-unique // identifiers produced by `GetShuffledIdentifiers` with the important property // that while popping off unique identifiers found in the shuffled list, we // don't change the distribution of identifier lengths. // // The uniqueness is only per-instance of the class, and so an instance can be // used to extract a series of names that share a scope. // // It works by scanning the sequence to extract each unique identifier found, // swapping it to the back and popping it off the list. This does shuffle the // order, but it isn't expected to do so in an interesting way. // // It also provides a fallback path in case there are no unique identifiers left // which computes fresh, random identifiers with the same length as the next one // in the sequence until a unique one is found. // // For simplicity of the fallback path, the lifetime of the identifiers produced // is bound to the lifetime of the popper instance, and not the generator as a // whole. If this is ever a problematic constraint, we can start copying // fallback identifiers into the generator's storage. class SourceGen::UniqueIdentifierPopper { public: explicit UniqueIdentifierPopper(SourceGen& gen, llvm::SmallVectorImpl& data) : gen_(&gen), data_(&data), it_(data_->rbegin()) {} // Pop the next unique identifier that can be found in the data, or synthesize // one with a valid length. Always consumes exactly one identifier from the // data. // // Note that the lifetime of the underlying identifier is that of the popper // and not the underlying data. auto Pop() -> llvm::StringRef { for (auto end = data_->rend(); it_ != end; ++it_) { auto insert = set_.Insert(*it_); if (!insert.is_inserted()) { continue; } if (it_ != data_->rbegin()) { std::swap(*data_->rbegin(), *it_); } CARBON_CHECK(insert.key() == data_->back()); return data_->pop_back_val(); } // Out of unique elements. Overwrite the back, preserving its length, // generating a new identifiers until we find a unique one and return that. // This ensures we continue to consume the structure and produce the same // size identifiers even in the fallback. int length = data_->pop_back_val().size(); auto fallback_ident_storage = llvm::MutableArrayRef(reinterpret_cast(gen_->storage_.Allocate( /*Size=*/length, /*Alignment=*/1)), length); for (;;) { gen_->GenerateRandomIdentifier(fallback_ident_storage); auto fallback_id = llvm::StringRef(fallback_ident_storage.data(), length); if (set_.Insert(fallback_id).is_inserted()) { return fallback_id; } } } private: SourceGen* gen_; llvm::SmallVectorImpl* data_; llvm::SmallVectorImpl::reverse_iterator it_; Set set_; }; // Generates a function declaration and writes it to the provided stream. // // The declaration can be configured with a function name, private modifier, // whether it is a method, the parameter count, an how indented it is. // // This is also provided a collection of identifiers to consume as parameter // names -- it will use a unique popper to extract unique parameter names from // this collection. auto SourceGen::GenerateFunctionDecl( llvm::StringRef name, bool is_private, bool is_method, int param_count, llvm::StringRef indent, llvm::SmallVectorImpl& param_names, llvm::raw_ostream& os) -> void { os << indent << "// TODO: make better comment text\n"; if (!IsCpp()) { os << indent << (is_private ? "private " : "") << "fn " << name; if (is_method) { os << "[self: Self]"; } } else { os << indent; if (!is_method) { os << "static "; } os << "auto " << name; } os << "("; if (param_count > (is_method ? NumSingleLineMethodParams : NumSingleLineFunctionParams)) { os << "\n" << indent << " "; } UniqueIdentifierPopper unique_param_names(*this, param_names); for (int i : llvm::seq(param_count)) { if (i > 0) { if ((i % MaxParamsPerLine) == 0) { os << ",\n" << indent << " "; } else { os << ", "; } } if (!IsCpp()) { os << unique_param_names.Pop() << ": i32"; } else { os << "int " << unique_param_names.Pop(); } } os << ")" << (IsCpp() ? " -> void" : "") << ";\n"; } // Generate a class definition and write it to the provided stream. // // The structure of the definition is guided by the `params` provided, and it // consumes the provided state. auto SourceGen::GenerateClassDef(const ClassParams& params, ClassGenState& state, llvm::raw_ostream& os) -> void { os << "// TODO: make better comment text\n"; os << "class " << state.class_names.pop_back_val() << " {\n"; if (IsCpp()) { os << " public:\n"; } UniqueIdentifierPopper unique_member_names(*this, state.member_names); llvm::ListSeparator line_sep("\n"); for ([[maybe_unused]] int _ : llvm::seq(params.public_function_decls)) { os << line_sep; GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/false, /*is_method=*/false, state.public_function_param_counts.pop_back_val(), /*indent=*/" ", state.param_names, os); } for ([[maybe_unused]] int _ : llvm::seq(params.public_method_decls)) { os << line_sep; GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/false, /*is_method=*/true, state.public_method_param_counts.pop_back_val(), /*indent=*/" ", state.param_names, os); } if (IsCpp()) { os << "\n private:\n"; // Reset the separator. line_sep = llvm::ListSeparator("\n"); } for ([[maybe_unused]] int _ : llvm::seq(params.private_function_decls)) { os << line_sep; GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/true, /*is_method=*/false, state.private_function_param_counts.pop_back_val(), /*indent=*/" ", state.param_names, os); } for ([[maybe_unused]] int _ : llvm::seq(params.private_method_decls)) { os << line_sep; GenerateFunctionDecl(unique_member_names.Pop(), /*is_private=*/true, /*is_method=*/true, state.private_method_param_counts.pop_back_val(), /*indent=*/" ", state.param_names, os); } os << line_sep; for ([[maybe_unused]] int _ : llvm::seq(params.private_field_decls)) { if (!IsCpp()) { os << " private var " << unique_member_names.Pop() << ": i32;\n"; } else { os << " int " << unique_member_names.Pop() << ";\n"; } } os << "}" << (IsCpp() ? ";" : "") << "\n"; } } // namespace Carbon::Testing