string_helpers.cpp 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "common/string_helpers.h"
  5. #include <algorithm>
  6. #include <optional>
  7. #include "common/check.h"
  8. #include "llvm/ADT/StringExtras.h"
  9. #include "llvm/ADT/StringRef.h"
  10. namespace Carbon {
  11. static constexpr llvm::StringRef TripleQuotes = R"(""")";
  12. static constexpr llvm::StringRef HorizontalWhitespaceChars = " \t";
  13. // Carbon only takes uppercase hex input.
  14. static auto FromHex(char c) -> std::optional<char> {
  15. if (c >= '0' && c <= '9') {
  16. return c - '0';
  17. }
  18. if (c >= 'A' && c <= 'F') {
  19. return 10 + c - 'A';
  20. }
  21. return std::nullopt;
  22. }
  23. auto UnescapeStringLiteral(llvm::StringRef source, const int hashtag_num,
  24. bool is_block_string) -> std::optional<std::string> {
  25. std::string ret;
  26. ret.reserve(source.size());
  27. std::string escape = "\\";
  28. escape.resize(hashtag_num + 1, '#');
  29. size_t i = 0;
  30. while (i < source.size()) {
  31. char c = source[i];
  32. if (i + hashtag_num < source.size() &&
  33. source.slice(i, i + hashtag_num + 1).equals(escape)) {
  34. i += hashtag_num + 1;
  35. if (i == source.size()) {
  36. return std::nullopt;
  37. }
  38. switch (source[i]) {
  39. case 'n':
  40. ret.push_back('\n');
  41. break;
  42. case 'r':
  43. ret.push_back('\r');
  44. break;
  45. case 't':
  46. ret.push_back('\t');
  47. break;
  48. case '0':
  49. if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
  50. // \0[0-9] is reserved.
  51. return std::nullopt;
  52. }
  53. ret.push_back('\0');
  54. break;
  55. case '"':
  56. ret.push_back('"');
  57. break;
  58. case '\'':
  59. ret.push_back('\'');
  60. break;
  61. case '\\':
  62. ret.push_back('\\');
  63. break;
  64. case 'x': {
  65. i += 2;
  66. if (i >= source.size()) {
  67. return std::nullopt;
  68. }
  69. std::optional<char> c1 = FromHex(source[i - 1]);
  70. std::optional<char> c2 = FromHex(source[i]);
  71. if (c1 == std::nullopt || c2 == std::nullopt) {
  72. return std::nullopt;
  73. }
  74. ret.push_back(16 * *c1 + *c2);
  75. break;
  76. }
  77. case 'u':
  78. CARBON_FATAL() << "\\u is not yet supported in string literals";
  79. case '\n':
  80. if (!is_block_string) {
  81. return std::nullopt;
  82. }
  83. break;
  84. default:
  85. // Unsupported.
  86. return std::nullopt;
  87. }
  88. } else if (c == '\t') {
  89. // Disallow non-` ` horizontal whitespace:
  90. // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/whitespace.md
  91. // TODO: This doesn't handle unicode whitespace.
  92. return std::nullopt;
  93. } else {
  94. ret.push_back(c);
  95. }
  96. ++i;
  97. }
  98. return ret;
  99. }
  100. auto ParseBlockStringLiteral(llvm::StringRef source, const int hashtag_num)
  101. -> ErrorOr<std::string> {
  102. llvm::SmallVector<llvm::StringRef> lines;
  103. source.split(lines, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/true);
  104. if (lines.size() < 2) {
  105. return Error("Too few lines");
  106. }
  107. llvm::StringRef first = lines[0];
  108. if (!first.consume_front(TripleQuotes)) {
  109. return Error("Should start with triple quotes: " + first);
  110. }
  111. first = first.rtrim(HorizontalWhitespaceChars);
  112. // Remaining chars, if any, are a file type indicator.
  113. if (first.find_first_of("\"#") != llvm::StringRef::npos ||
  114. first.find_first_of(HorizontalWhitespaceChars) != llvm::StringRef::npos) {
  115. return Error("Invalid characters in file type indicator: " + first);
  116. }
  117. llvm::StringRef last = lines[lines.size() - 1];
  118. const size_t last_length = last.size();
  119. last = last.ltrim(HorizontalWhitespaceChars);
  120. const size_t indent = last_length - last.size();
  121. if (last != TripleQuotes) {
  122. return Error("Should end with triple quotes: " + last);
  123. }
  124. std::string parsed;
  125. for (size_t i = 1; i < lines.size() - 1; ++i) {
  126. llvm::StringRef line = lines[i];
  127. const size_t first_non_ws =
  128. line.find_first_not_of(HorizontalWhitespaceChars);
  129. if (first_non_ws == llvm::StringRef::npos) {
  130. // Empty or whitespace-only line.
  131. line = "";
  132. } else {
  133. if (first_non_ws < indent) {
  134. return Error("Wrong indent for line: " + line + ", expected " +
  135. llvm::Twine(indent));
  136. }
  137. line = line.drop_front(indent).rtrim(HorizontalWhitespaceChars);
  138. }
  139. // Unescaping with \n appended to handle things like \\<newline>.
  140. llvm::SmallVector<char> buffer;
  141. std::optional<std::string> unescaped =
  142. UnescapeStringLiteral((line + "\n").toStringRef(buffer), hashtag_num,
  143. /*is_block_string=*/true);
  144. if (!unescaped.has_value()) {
  145. return Error("Invalid escaping in " + line);
  146. }
  147. // A \<newline> string collapses into nothing.
  148. if (!unescaped->empty()) {
  149. parsed.append(*unescaped);
  150. }
  151. }
  152. return parsed;
  153. }
  154. auto StringRefContainsPointer(llvm::StringRef ref, const char* ptr) -> bool {
  155. auto le = std::less_equal<const char*>();
  156. return le(ref.begin(), ptr) && le(ptr, ref.end());
  157. }
  158. } // namespace Carbon