string_helpers.cpp 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "common/string_helpers.h"
  5. #include <algorithm>
  6. #include <optional>
  7. #include "common/check.h"
  8. #include "llvm/ADT/StringExtras.h"
  9. #include "llvm/ADT/StringRef.h"
  10. #include "llvm/Support/ConvertUTF.h"
  11. namespace Carbon {
  12. static constexpr llvm::StringRef TripleQuotes = "'''";
  13. static constexpr llvm::StringRef HorizontalWhitespaceChars = " \t";
  // Converts one hex digit to its numeric value (0-15).
  //
  // Carbon only takes uppercase hex input, so only `0`-`9` and `A`-`F` are
  // recognized; every other character yields `std::nullopt`.
  static auto FromHex(char c) -> std::optional<char> {
    const bool is_decimal_digit = '0' <= c && c <= '9';
    const bool is_upper_hex_letter = 'A' <= c && c <= 'F';
    if (!is_decimal_digit && !is_upper_hex_letter) {
      return std::nullopt;
    }
    const int value = is_decimal_digit ? c - '0' : 10 + c - 'A';
    return static_cast<char>(value);
  }
  24. auto UnescapeStringLiteral(llvm::StringRef source, const int hashtag_num,
  25. bool is_block_string) -> std::optional<std::string> {
  26. std::string ret;
  27. ret.reserve(source.size());
  28. std::string escape = "\\";
  29. escape.resize(hashtag_num + 1, '#');
  30. size_t i = 0;
  31. while (i < source.size()) {
  32. char c = source[i];
  33. if (i + hashtag_num < source.size() &&
  34. source.slice(i, i + hashtag_num + 1).equals(escape)) {
  35. i += hashtag_num + 1;
  36. if (i == source.size()) {
  37. return std::nullopt;
  38. }
  39. switch (source[i]) {
  40. case 'n':
  41. ret.push_back('\n');
  42. break;
  43. case 'r':
  44. ret.push_back('\r');
  45. break;
  46. case 't':
  47. ret.push_back('\t');
  48. break;
  49. case '0':
  50. if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
  51. // \0[0-9] is reserved.
  52. return std::nullopt;
  53. }
  54. ret.push_back('\0');
  55. break;
  56. case '"':
  57. ret.push_back('"');
  58. break;
  59. case '\'':
  60. ret.push_back('\'');
  61. break;
  62. case '\\':
  63. ret.push_back('\\');
  64. break;
  65. case 'x': {
  66. i += 2;
  67. if (i >= source.size()) {
  68. return std::nullopt;
  69. }
  70. std::optional<char> c1 = FromHex(source[i - 1]);
  71. std::optional<char> c2 = FromHex(source[i]);
  72. if (c1 == std::nullopt || c2 == std::nullopt) {
  73. return std::nullopt;
  74. }
  75. ret.push_back(16 * *c1 + *c2);
  76. break;
  77. }
  78. case 'u': {
  79. ++i;
  80. if (i >= source.size() || source[i] != '{') {
  81. return std::nullopt;
  82. }
  83. unsigned int unicode_int = 0;
  84. ++i;
  85. int original_i = i;
  86. while (i < source.size() && source[i] != '}') {
  87. std::optional<char> hex_val = FromHex(source[i]);
  88. if (hex_val == std::nullopt) {
  89. return std::nullopt;
  90. }
  91. unicode_int = unicode_int << 4;
  92. unicode_int += hex_val.value();
  93. ++i;
  94. if (i - original_i > 8) {
  95. return std::nullopt;
  96. }
  97. }
  98. if (i >= source.size()) {
  99. return std::nullopt;
  100. }
  101. if (i - original_i == 0) {
  102. return std::nullopt;
  103. }
  104. char utf8_buf[4];
  105. char* utf8_end = &utf8_buf[0];
  106. if (!llvm::ConvertCodePointToUTF8(unicode_int, utf8_end)) {
  107. return std::nullopt;
  108. }
  109. ret.append(utf8_buf, utf8_end - utf8_buf);
  110. break;
  111. }
  112. case '\n':
  113. if (!is_block_string) {
  114. return std::nullopt;
  115. }
  116. break;
  117. default:
  118. // Unsupported.
  119. return std::nullopt;
  120. }
  121. } else if (c == '\t') {
  122. // Disallow non-` ` horizontal whitespace:
  123. // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/whitespace.md
  124. // TODO: This doesn't handle unicode whitespace.
  125. return std::nullopt;
  126. } else {
  127. ret.push_back(c);
  128. }
  129. ++i;
  130. }
  131. return ret;
  132. }
  133. auto ParseBlockStringLiteral(llvm::StringRef source, const int hashtag_num)
  134. -> ErrorOr<std::string> {
  135. llvm::SmallVector<llvm::StringRef> lines;
  136. source.split(lines, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/true);
  137. if (lines.size() < 2) {
  138. return Error("Too few lines");
  139. }
  140. llvm::StringRef first = lines[0];
  141. if (!first.consume_front(TripleQuotes)) {
  142. return Error("Should start with triple quotes: " + first);
  143. }
  144. first = first.rtrim(HorizontalWhitespaceChars);
  145. // Remaining chars, if any, are a file type indicator.
  146. if (first.find_first_of("\"#") != llvm::StringRef::npos ||
  147. first.find_first_of(HorizontalWhitespaceChars) != llvm::StringRef::npos) {
  148. return Error("Invalid characters in file type indicator: " + first);
  149. }
  150. llvm::StringRef last = lines[lines.size() - 1];
  151. const size_t last_length = last.size();
  152. last = last.ltrim(HorizontalWhitespaceChars);
  153. const size_t indent = last_length - last.size();
  154. if (last != TripleQuotes) {
  155. return Error("Should end with triple quotes: " + last);
  156. }
  157. std::string parsed;
  158. for (size_t i = 1; i < lines.size() - 1; ++i) {
  159. llvm::StringRef line = lines[i];
  160. const size_t first_non_ws =
  161. line.find_first_not_of(HorizontalWhitespaceChars);
  162. if (first_non_ws == llvm::StringRef::npos) {
  163. // Empty or whitespace-only line.
  164. line = "";
  165. } else {
  166. if (first_non_ws < indent) {
  167. return Error("Wrong indent for line: " + line + ", expected " +
  168. llvm::Twine(indent));
  169. }
  170. line = line.drop_front(indent).rtrim(HorizontalWhitespaceChars);
  171. }
  172. // Unescaping with \n appended to handle things like \\<newline>.
  173. llvm::SmallVector<char> buffer;
  174. std::optional<std::string> unescaped =
  175. UnescapeStringLiteral((line + "\n").toStringRef(buffer), hashtag_num,
  176. /*is_block_string=*/true);
  177. if (!unescaped.has_value()) {
  178. return Error("Invalid escaping in " + line);
  179. }
  180. // A \<newline> string collapses into nothing.
  181. if (!unescaped->empty()) {
  182. parsed.append(*unescaped);
  183. }
  184. }
  185. return parsed;
  186. }
  187. auto StringRefContainsPointer(llvm::StringRef ref, const char* ptr) -> bool {
  188. auto le = std::less_equal<>();
  189. return le(ref.begin(), ptr) && le(ptr, ref.end());
  190. }
  191. } // namespace Carbon