string_helpers.cpp 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "common/string_helpers.h"
  5. #include <string.h>
  6. #include <sys/types.h>
  7. #include <algorithm>
  8. #include <array>
  9. #include <optional>
  10. #include <string>
  11. #include "common/check.h"
  12. #include "llvm/ADT/ArrayRef.h"
  13. #include "llvm/ADT/STLExtras.h"
  14. #include "llvm/ADT/SmallVector.h"
  15. #include "llvm/ADT/StringExtras.h"
  16. #include "llvm/ADT/StringRef.h"
  17. #include "llvm/Support/ConvertUTF.h"
  18. namespace Carbon {
  19. static constexpr llvm::StringRef TripleQuotes = "'''";
  20. static constexpr llvm::StringRef HorizontalWhitespaceChars = " \t";
  // Converts one hexadecimal digit character to its numeric value (0-15).
  //
  // Carbon only takes uppercase hex input, so only '0'-'9' and 'A'-'F' are
  // accepted; every other character (including 'a'-'f') yields std::nullopt.
  static auto FromHex(char c) -> std::optional<char> {
    const bool is_decimal_digit = '0' <= c && c <= '9';
    const bool is_upper_hex_letter = 'A' <= c && c <= 'F';
    if (is_decimal_digit) {
      return static_cast<char>(c - '0');
    }
    if (is_upper_hex_letter) {
      return static_cast<char>(c - 'A' + 10);
    }
    return std::nullopt;
  }
  31. auto UnescapeStringLiteral(llvm::StringRef source, const int hashtag_num,
  32. bool is_block_string) -> std::optional<std::string> {
  33. std::string ret;
  34. ret.reserve(source.size());
  35. std::string escape = "\\";
  36. escape.resize(hashtag_num + 1, '#');
  37. size_t i = 0;
  38. while (i < source.size()) {
  39. char c = source[i];
  40. if (i + hashtag_num < source.size() &&
  41. source.slice(i, i + hashtag_num + 1) == escape) {
  42. i += hashtag_num + 1;
  43. if (i == source.size()) {
  44. return std::nullopt;
  45. }
  46. switch (source[i]) {
  47. case 'n':
  48. ret.push_back('\n');
  49. break;
  50. case 'r':
  51. ret.push_back('\r');
  52. break;
  53. case 't':
  54. ret.push_back('\t');
  55. break;
  56. case '0':
  57. if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
  58. // \0[0-9] is reserved.
  59. return std::nullopt;
  60. }
  61. ret.push_back('\0');
  62. break;
  63. case '"':
  64. ret.push_back('"');
  65. break;
  66. case '\'':
  67. ret.push_back('\'');
  68. break;
  69. case '\\':
  70. ret.push_back('\\');
  71. break;
  72. case 'x': {
  73. i += 2;
  74. if (i >= source.size()) {
  75. return std::nullopt;
  76. }
  77. std::optional<char> c1 = FromHex(source[i - 1]);
  78. std::optional<char> c2 = FromHex(source[i]);
  79. if (c1 == std::nullopt || c2 == std::nullopt) {
  80. return std::nullopt;
  81. }
  82. ret.push_back(16 * *c1 + *c2);
  83. break;
  84. }
  85. case 'u': {
  86. ++i;
  87. if (i >= source.size() || source[i] != '{') {
  88. return std::nullopt;
  89. }
  90. unsigned int unicode_int = 0;
  91. ++i;
  92. int original_i = i;
  93. while (i < source.size() && source[i] != '}') {
  94. std::optional<char> hex_val = FromHex(source[i]);
  95. if (hex_val == std::nullopt) {
  96. return std::nullopt;
  97. }
  98. unicode_int = unicode_int << 4;
  99. unicode_int += hex_val.value();
  100. ++i;
  101. if (i - original_i > 8) {
  102. return std::nullopt;
  103. }
  104. }
  105. if (i >= source.size()) {
  106. return std::nullopt;
  107. }
  108. if (i - original_i == 0) {
  109. return std::nullopt;
  110. }
  111. char utf8_buf[4];
  112. char* utf8_end = &utf8_buf[0];
  113. if (!llvm::ConvertCodePointToUTF8(unicode_int, utf8_end)) {
  114. return std::nullopt;
  115. }
  116. ret.append(utf8_buf, utf8_end - utf8_buf);
  117. break;
  118. }
  119. case '\n':
  120. if (!is_block_string) {
  121. return std::nullopt;
  122. }
  123. break;
  124. default:
  125. // Unsupported.
  126. return std::nullopt;
  127. }
  128. } else if (c == '\t') {
  129. // Disallow non-` ` horizontal whitespace:
  130. // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/whitespace.md
  131. // TODO: This doesn't handle unicode whitespace.
  132. return std::nullopt;
  133. } else {
  134. ret.push_back(c);
  135. }
  136. ++i;
  137. }
  138. return ret;
  139. }
  140. auto ParseBlockStringLiteral(llvm::StringRef source, const int hashtag_num)
  141. -> ErrorOr<std::string> {
  142. llvm::SmallVector<llvm::StringRef> lines;
  143. source.split(lines, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/true);
  144. if (lines.size() < 2) {
  145. return Error("Too few lines");
  146. }
  147. llvm::StringRef first = lines[0];
  148. if (!first.consume_front(TripleQuotes)) {
  149. return Error("Should start with triple quotes: " + first);
  150. }
  151. first = first.rtrim(HorizontalWhitespaceChars);
  152. // Remaining chars, if any, are a file type indicator.
  153. if (first.find_first_of("\"#") != llvm::StringRef::npos ||
  154. first.find_first_of(HorizontalWhitespaceChars) != llvm::StringRef::npos) {
  155. return Error("Invalid characters in file type indicator: " + first);
  156. }
  157. llvm::StringRef last = lines[lines.size() - 1];
  158. const size_t last_length = last.size();
  159. last = last.ltrim(HorizontalWhitespaceChars);
  160. const size_t indent = last_length - last.size();
  161. if (last != TripleQuotes) {
  162. return Error("Should end with triple quotes: " + last);
  163. }
  164. std::string parsed;
  165. for (size_t i = 1; i < lines.size() - 1; ++i) {
  166. llvm::StringRef line = lines[i];
  167. const size_t first_non_ws =
  168. line.find_first_not_of(HorizontalWhitespaceChars);
  169. if (first_non_ws == llvm::StringRef::npos) {
  170. // Empty or whitespace-only line.
  171. line = "";
  172. } else {
  173. if (first_non_ws < indent) {
  174. return Error("Wrong indent for line: " + line + ", expected " +
  175. llvm::Twine(indent));
  176. }
  177. line = line.drop_front(indent).rtrim(HorizontalWhitespaceChars);
  178. }
  179. // Unescaping with \n appended to handle things like \\<newline>.
  180. llvm::SmallVector<char> buffer;
  181. std::optional<std::string> unescaped =
  182. UnescapeStringLiteral((line + "\n").toStringRef(buffer), hashtag_num,
  183. /*is_block_string=*/true);
  184. if (!unescaped.has_value()) {
  185. return Error("Invalid escaping in " + line);
  186. }
  187. // A \<newline> string collapses into nothing.
  188. if (!unescaped->empty()) {
  189. parsed.append(*unescaped);
  190. }
  191. }
  192. return parsed;
  193. }
  194. auto StringRefContainsPointer(llvm::StringRef ref, const char* ptr) -> bool {
  195. auto le = std::less_equal<>();
  196. return le(ref.begin(), ptr) && le(ptr, ref.end());
  197. }
  198. auto BuildCStrArgs(llvm::StringRef tool_path,
  199. llvm::ArrayRef<llvm::StringRef> args,
  200. llvm::OwningArrayRef<char>& cstr_arg_storage)
  201. -> llvm::SmallVector<const char*, 64> {
  202. return BuildCStrArgs(tool_path, /*prefix_args=*/{}, args, cstr_arg_storage);
  203. }
  204. auto BuildCStrArgs(llvm::StringRef tool_path,
  205. llvm::ArrayRef<std::string> prefix_args,
  206. llvm::ArrayRef<llvm::StringRef> args,
  207. llvm::OwningArrayRef<char>& cstr_arg_storage)
  208. -> llvm::SmallVector<const char*, 64> {
  209. // Render the arguments into null-terminated C-strings. Command lines can get
  210. // quite long in build systems so this tries to minimize the memory allocation
  211. // overhead.
  212. // Precompute the total C-string data size needed.
  213. int total_size = tool_path.size() + 1;
  214. for (llvm::StringRef arg : args) {
  215. // Accumulate both the string size and a null terminator byte.
  216. total_size += arg.size() + 1;
  217. }
  218. // Allocate one chunk of storage for the actual C-strings, and reserve a
  219. // vector of pointers into the storage.
  220. cstr_arg_storage = llvm::OwningArrayRef<char>(total_size);
  221. ssize_t i = 0;
  222. auto make_cstr = [&](llvm::StringRef arg) {
  223. char* cstr = &cstr_arg_storage[i];
  224. memcpy(cstr, arg.data(), arg.size());
  225. cstr[arg.size()] = '\0';
  226. i += arg.size() + 1;
  227. return cstr;
  228. };
  229. llvm::SmallVector<const char*, 64> cstr_args;
  230. cstr_args.reserve(1 + prefix_args.size() + args.size());
  231. cstr_args.push_back(make_cstr(tool_path));
  232. for (const std::string& prefix_arg : prefix_args) {
  233. cstr_args.push_back(prefix_arg.c_str());
  234. }
  235. for (llvm::StringRef arg : args) {
  236. cstr_args.push_back(make_cstr(arg));
  237. }
  238. return cstr_args;
  239. }
  240. } // namespace Carbon