string_helpers.cpp 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "common/string_helpers.h"
  5. #include <algorithm>
  6. #include <optional>
  7. #include "common/check.h"
  8. #include "llvm/ADT/StringExtras.h"
  9. #include "llvm/ADT/StringRef.h"
  10. namespace Carbon {
  11. namespace {
  12. constexpr llvm::StringRef TripleQuotes = "\"\"\"";
  13. constexpr llvm::StringRef HorizontalWhitespaceChars = " \t";
  14. // Carbon only takes uppercase hex input.
  15. auto FromHex(char c) -> std::optional<char> {
  16. if (c >= '0' && c <= '9') {
  17. return c - '0';
  18. }
  19. if (c >= 'A' && c <= 'F') {
  20. return 10 + c - 'A';
  21. }
  22. return std::nullopt;
  23. }
  24. // Creates an error instance with the specified `message`.
  25. llvm::Expected<std::string> MakeError(llvm::Twine message) {
  26. return llvm::createStringError(llvm::inconvertibleErrorCode(), message);
  27. }
  28. } // namespace
  29. auto UnescapeStringLiteral(llvm::StringRef source, bool is_block_string)
  30. -> std::optional<std::string> {
  31. std::string ret;
  32. ret.reserve(source.size());
  33. size_t i = 0;
  34. while (i < source.size()) {
  35. char c = source[i];
  36. switch (c) {
  37. case '\\':
  38. ++i;
  39. if (i == source.size()) {
  40. return std::nullopt;
  41. }
  42. switch (source[i]) {
  43. case 'n':
  44. ret.push_back('\n');
  45. break;
  46. case 'r':
  47. ret.push_back('\r');
  48. break;
  49. case 't':
  50. ret.push_back('\t');
  51. break;
  52. case '0':
  53. if (i + 1 < source.size() && llvm::isDigit(source[i + 1])) {
  54. // \0[0-9] is reserved.
  55. return std::nullopt;
  56. }
  57. ret.push_back('\0');
  58. break;
  59. case '"':
  60. ret.push_back('"');
  61. break;
  62. case '\'':
  63. ret.push_back('\'');
  64. break;
  65. case '\\':
  66. ret.push_back('\\');
  67. break;
  68. case 'x': {
  69. i += 2;
  70. if (i >= source.size()) {
  71. return std::nullopt;
  72. }
  73. std::optional<char> c1 = FromHex(source[i - 1]);
  74. std::optional<char> c2 = FromHex(source[i]);
  75. if (c1 == std::nullopt || c2 == std::nullopt) {
  76. return std::nullopt;
  77. }
  78. ret.push_back(16 * *c1 + *c2);
  79. break;
  80. }
  81. case 'u':
  82. FATAL() << "\\u is not yet supported in string literals";
  83. case '\n':
  84. if (!is_block_string) {
  85. return std::nullopt;
  86. }
  87. break;
  88. default:
  89. // Unsupported.
  90. return std::nullopt;
  91. }
  92. break;
  93. case '\t':
  94. // Disallow non-` ` horizontal whitespace:
  95. // https://github.com/carbon-language/carbon-lang/blob/trunk/docs/design/lexical_conventions/whitespace.md
  96. // TODO: This doesn't handle unicode whitespace.
  97. return std::nullopt;
  98. default:
  99. ret.push_back(c);
  100. break;
  101. }
  102. ++i;
  103. }
  104. return ret;
  105. }
  106. auto ParseBlockStringLiteral(llvm::StringRef source)
  107. -> llvm::Expected<std::string> {
  108. llvm::SmallVector<llvm::StringRef> lines;
  109. source.split(lines, '\n', /*MaxSplit=*/-1, /*KeepEmpty=*/true);
  110. if (lines.size() < 2) {
  111. return MakeError("Too few lines");
  112. }
  113. llvm::StringRef first = lines[0];
  114. if (!first.consume_front(TripleQuotes)) {
  115. return MakeError("Should start with triple quotes: " + first);
  116. }
  117. first = first.rtrim(HorizontalWhitespaceChars);
  118. // Remaining chars, if any, are a file type indicator.
  119. if (first.find_first_of("\"#") != llvm::StringRef::npos ||
  120. first.find_first_of(HorizontalWhitespaceChars) != llvm::StringRef::npos) {
  121. return MakeError("Invalid characters in file type indicator: " + first);
  122. }
  123. llvm::StringRef last = lines[lines.size() - 1];
  124. const size_t last_length = last.size();
  125. last = last.ltrim(HorizontalWhitespaceChars);
  126. const size_t indent = last_length - last.size();
  127. if (last != TripleQuotes) {
  128. return MakeError("Should end with triple quotes: " + last);
  129. }
  130. std::string parsed;
  131. for (size_t i = 1; i < lines.size() - 1; ++i) {
  132. llvm::StringRef line = lines[i];
  133. const size_t first_non_ws =
  134. line.find_first_not_of(HorizontalWhitespaceChars);
  135. if (first_non_ws == llvm::StringRef::npos) {
  136. // Empty or whitespace-only line.
  137. line = "";
  138. } else {
  139. if (first_non_ws < indent) {
  140. return MakeError("Wrong indent for line: " + line + ", expected " +
  141. llvm::Twine(indent));
  142. }
  143. line = line.drop_front(indent).rtrim(HorizontalWhitespaceChars);
  144. }
  145. // Unescaping with \n appended to handle things like \\<newline>.
  146. llvm::SmallVector<char> buffer;
  147. std::optional<std::string> unescaped = UnescapeStringLiteral(
  148. (line + "\n").toStringRef(buffer), /*is_block_string=*/true);
  149. if (!unescaped.has_value()) {
  150. return MakeError("Invalid escaping in " + line);
  151. }
  152. // A \<newline> string collapses into nothing.
  153. if (!unescaped->empty()) {
  154. parsed.append(*unescaped);
  155. }
  156. }
  157. return parsed;
  158. }
  159. } // namespace Carbon