numeric_literal.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "lexer/numeric_literal.h"
  5. #include <bitset>
  6. #include "llvm/ADT/StringExtras.h"
  7. #include "llvm/Support/FormatVariadic.h"
  8. namespace Carbon {
  9. namespace {
  10. struct EmptyDigitSequence : SimpleDiagnostic<EmptyDigitSequence> {
  11. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  12. static constexpr llvm::StringLiteral Message =
  13. "Empty digit sequence in numeric literal.";
  14. };
  15. struct InvalidDigit {
  16. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  17. struct Substitutions {
  18. char digit;
  19. int radix;
  20. };
  21. static auto Format(const Substitutions& subst) -> std::string {
  22. return llvm::formatv("Invalid digit '{0}' in {1} numeric literal.",
  23. subst.digit,
  24. (subst.radix == 2 ? "binary"
  25. : subst.radix == 16 ? "hexadecimal"
  26. : "decimal"))
  27. .str();
  28. }
  29. };
  30. struct InvalidDigitSeparator : SimpleDiagnostic<InvalidDigitSeparator> {
  31. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  32. static constexpr llvm::StringLiteral Message =
  33. "Misplaced digit separator in numeric literal.";
  34. };
  35. struct IrregularDigitSeparators {
  36. static constexpr llvm::StringLiteral ShortName =
  37. "syntax-irregular-digit-separators";
  38. struct Substitutions {
  39. int radix;
  40. };
  41. static auto Format(const Substitutions& subst) -> std::string {
  42. assert((subst.radix == 10 || subst.radix == 16) && "unexpected radix");
  43. return llvm::formatv(
  44. "Digit separators in {0} number should appear every {1} "
  45. "characters from the right.",
  46. (subst.radix == 10 ? "decimal" : "hexadecimal"),
  47. (subst.radix == 10 ? "3" : "4"))
  48. .str();
  49. }
  50. };
  51. struct UnknownBaseSpecifier : SimpleDiagnostic<UnknownBaseSpecifier> {
  52. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  53. static constexpr llvm::StringLiteral Message =
  54. "Unknown base specifier in numeric literal.";
  55. };
  56. struct BinaryRealLiteral : SimpleDiagnostic<BinaryRealLiteral> {
  57. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  58. static constexpr llvm::StringLiteral Message =
  59. "Binary real number literals are not supported.";
  60. };
  61. struct WrongRealLiteralExponent {
  62. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  63. struct Substitutions {
  64. char expected;
  65. };
  66. static auto Format(const Substitutions& subst) -> std::string {
  67. return llvm::formatv("Expected '{0}' to introduce exponent.",
  68. subst.expected)
  69. .str();
  70. }
  71. };
  72. } // namespace
  73. static bool isLower(char c) { return 'a' <= c && c <= 'z'; }
  74. auto NumericLiteralToken::Lex(llvm::StringRef source_text)
  75. -> llvm::Optional<NumericLiteralToken> {
  76. NumericLiteralToken result;
  77. if (source_text.empty() || !llvm::isDigit(source_text.front())) {
  78. return llvm::None;
  79. }
  80. bool seen_plus_minus = false;
  81. bool seen_radix_point = false;
  82. bool seen_potential_exponent = false;
  83. // Greedily consume all following characters that might be part of a numeric
  84. // literal. This allows us to produce better diagnostics on invalid literals.
  85. //
  86. // TODO(zygoloid): Update lexical rules to specify that a numeric literal
  87. // cannot be immediately followed by an alphanumeric character.
  88. int i = 1, n = source_text.size();
  89. for (; i != n; ++i) {
  90. char c = source_text[i];
  91. if (llvm::isAlnum(c) || c == '_') {
  92. if (isLower(c) && seen_radix_point && !seen_plus_minus) {
  93. result.exponent = i;
  94. seen_potential_exponent = true;
  95. }
  96. continue;
  97. }
  98. // Exactly one `.` can be part of the literal, but only if it's followed by
  99. // an alphanumeric character.
  100. if (c == '.' && i + 1 != n && llvm::isAlnum(source_text[i + 1]) &&
  101. !seen_radix_point) {
  102. result.radix_point = i;
  103. seen_radix_point = true;
  104. continue;
  105. }
  106. // A `+` or `-` continues the literal only if it's preceded by a lowercase
  107. // letter (which will be 'e' or 'p' or part of an invalid literal) and
  108. // followed by an alphanumeric character. This '+' or '-' cannot be an
  109. // operator because a literal cannot end in a lowercase letter.
  110. if ((c == '+' || c == '-') && seen_potential_exponent &&
  111. result.exponent == i - 1 && i + 1 != n &&
  112. llvm::isAlnum(source_text[i + 1])) {
  113. // This is not possible because we don't update result.exponent after we
  114. // see a '+' or '-'.
  115. assert(!seen_plus_minus && "should only consume one + or -");
  116. seen_plus_minus = true;
  117. continue;
  118. }
  119. break;
  120. }
  121. result.text = source_text.substr(0, i);
  122. if (!seen_radix_point) {
  123. result.radix_point = i;
  124. }
  125. if (!seen_potential_exponent) {
  126. result.exponent = i;
  127. }
  128. return result;
  129. }
  130. NumericLiteralToken::Parser::Parser(DiagnosticEmitter& emitter,
  131. NumericLiteralToken literal)
  132. : emitter(emitter), literal(literal) {
  133. int_part = literal.text.substr(0, literal.radix_point);
  134. if (int_part.consume_front("0x")) {
  135. radix = 16;
  136. } else if (int_part.consume_front("0b")) {
  137. radix = 2;
  138. }
  139. fract_part = literal.text.substr(literal.radix_point + 1,
  140. literal.exponent - literal.radix_point - 1);
  141. exponent_part = literal.text.substr(literal.exponent + 1);
  142. if (!exponent_part.consume_front("+")) {
  143. exponent_is_negative = exponent_part.consume_front("-");
  144. }
  145. }
  146. // Check that the numeric literal token is syntactically valid and meaningful,
  147. // and diagnose if not.
  148. auto NumericLiteralToken::Parser::Check() -> CheckResult {
  149. if (!CheckLeadingZero() || !CheckIntPart() || !CheckFractionalPart() ||
  150. !CheckExponentPart()) {
  151. return UnrecoverableError;
  152. }
  153. return recovered_from_error ? RecoverableError : Valid;
  154. }
  155. // Parse a string that is known to be a valid base-radix integer into an
  156. // APInt. If needs_cleaning is true, the string may additionally contain '_'
  157. // and '.' characters that should be ignored.
  158. //
  159. // Ignoring '.' is used when parsing a real literal. For example, when
  160. // parsing 123.456e7, we want to decompose it into an integer mantissa
  161. // (123456) and an exponent (7 - 3 = 2), and this routine is given the
  162. // "123.456" to parse as the mantissa.
  163. static auto ParseInteger(llvm::StringRef digits, int radix, bool needs_cleaning)
  164. -> llvm::APInt {
  165. llvm::SmallString<32> cleaned;
  166. if (needs_cleaning) {
  167. cleaned.reserve(digits.size());
  168. std::remove_copy_if(digits.begin(), digits.end(),
  169. std::back_inserter(cleaned),
  170. [](char c) { return c == '_' || c == '.'; });
  171. digits = cleaned;
  172. }
  173. llvm::APInt value;
  174. if (digits.getAsInteger(radix, value)) {
  175. llvm_unreachable("should never fail");
  176. }
  177. return value;
  178. }
  179. auto NumericLiteralToken::Parser::GetMantissa() -> llvm::APInt {
  180. const char* end = IsInteger() ? int_part.end() : fract_part.end();
  181. llvm::StringRef digits(int_part.begin(), end - int_part.begin());
  182. return ParseInteger(digits, radix, mantissa_needs_cleaning);
  183. }
  184. auto NumericLiteralToken::Parser::GetExponent() -> llvm::APInt {
  185. // Compute the effective exponent from the specified exponent, if any,
  186. // and the position of the radix point.
  187. llvm::APInt exponent(64, 0);
  188. if (!exponent_part.empty()) {
  189. exponent = ParseInteger(exponent_part, 10, exponent_needs_cleaning);
  190. // The exponent is a signed integer, and the number we just parsed is
  191. // non-negative, so ensure we have a wide enough representation to
  192. // include a sign bit. Also make sure the exponent isn't too narrow so
  193. // the calculation below can't lose information through overflow.
  194. if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
  195. exponent = exponent.zext(std::max(64u, exponent.getBitWidth() + 1));
  196. }
  197. if (exponent_is_negative) {
  198. exponent.negate();
  199. }
  200. }
  201. // Each character after the decimal point reduces the effective exponent.
  202. int excess_exponent = fract_part.size();
  203. if (radix == 16) {
  204. excess_exponent *= 4;
  205. }
  206. exponent -= excess_exponent;
  207. if (exponent_is_negative && !exponent.isNegative()) {
  208. // We overflowed. Note that we can only overflow by a little, and only
  209. // from negative to positive, because exponent is at least 64 bits wide
  210. // and excess_exponent is bounded above by four times the size of the
  211. // input buffer, which we assume fits into 32 bits.
  212. exponent = exponent.zext(exponent.getBitWidth() + 1);
  213. exponent.setSignBit();
  214. }
  215. return exponent;
  216. }
  217. // Check that a digit sequence is valid: that it contains one or more digits,
  218. // contains only digits in the specified base, and that any digit separators
  219. // are present and correctly positioned.
  220. auto NumericLiteralToken::Parser::CheckDigitSequence(
  221. llvm::StringRef text, int radix, bool allow_digit_separators)
  222. -> CheckDigitSequenceResult {
  223. assert((radix == 2 || radix == 10 || radix == 16) && "unknown radix");
  224. std::bitset<256> valid_digits;
  225. if (radix == 2) {
  226. for (char c : "01") {
  227. valid_digits[static_cast<unsigned char>(c)] = true;
  228. }
  229. } else if (radix == 10) {
  230. for (char c : "0123456789") {
  231. valid_digits[static_cast<unsigned char>(c)] = true;
  232. }
  233. } else {
  234. for (char c : "0123456789ABCDEF") {
  235. valid_digits[static_cast<unsigned char>(c)] = true;
  236. }
  237. }
  238. int num_digit_separators = 0;
  239. for (int i = 0, n = text.size(); i != n; ++i) {
  240. char c = text[i];
  241. if (valid_digits[static_cast<unsigned char>(c)]) {
  242. continue;
  243. }
  244. if (c == '_') {
  245. // A digit separator cannot appear at the start of a digit sequence,
  246. // next to another digit separator, or at the end.
  247. if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
  248. i + 1 == n) {
  249. emitter.EmitError<InvalidDigitSeparator>();
  250. recovered_from_error = true;
  251. }
  252. ++num_digit_separators;
  253. continue;
  254. }
  255. emitter.EmitError<InvalidDigit>({.digit = c, .radix = radix});
  256. return {.ok = false};
  257. }
  258. if (num_digit_separators == static_cast<int>(text.size())) {
  259. emitter.EmitError<EmptyDigitSequence>();
  260. return {.ok = false};
  261. }
  262. // Check that digit separators occur in exactly the expected positions.
  263. if (num_digit_separators) {
  264. CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
  265. }
  266. return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
  267. }
  268. // Given a number with digit separators, check that the digit separators are
  269. // correctly positioned.
  270. auto NumericLiteralToken::Parser::CheckDigitSeparatorPlacement(
  271. llvm::StringRef text, int radix, int num_digit_separators) -> void {
  272. assert(std::count(text.begin(), text.end(), '_') == num_digit_separators &&
  273. "given wrong number of digit separators");
  274. if (radix == 2) {
  275. // There are no restrictions on digit separator placement for binary
  276. // literals.
  277. return;
  278. }
  279. assert((radix == 10 || radix == 16) &&
  280. "unexpected radix for digit separator checks");
  281. auto diagnose_irregular_digit_separators = [&] {
  282. emitter.EmitError<IrregularDigitSeparators>({.radix = radix});
  283. recovered_from_error = true;
  284. };
  285. // For decimal and hexadecimal digit sequences, digit separators must form
  286. // groups of 3 or 4 digits (4 or 5 characters), respectively.
  287. int stride = (radix == 10 ? 4 : 5);
  288. int remaining_digit_separators = num_digit_separators;
  289. auto pos = text.end();
  290. while (pos - text.begin() >= stride) {
  291. pos -= stride;
  292. if (*pos != '_') {
  293. diagnose_irregular_digit_separators();
  294. return;
  295. }
  296. --remaining_digit_separators;
  297. }
  298. // Check there weren't any other digit separators.
  299. if (remaining_digit_separators) {
  300. diagnose_irregular_digit_separators();
  301. }
  302. };
  303. // Check that we don't have a '0' prefix on a non-zero decimal integer.
  304. auto NumericLiteralToken::Parser::CheckLeadingZero() -> bool {
  305. if (radix == 10 && int_part.startswith("0") && int_part != "0") {
  306. emitter.EmitError<UnknownBaseSpecifier>();
  307. return false;
  308. }
  309. return true;
  310. }
  311. // Check the integer part (before the '.', if any) is valid.
  312. auto NumericLiteralToken::Parser::CheckIntPart() -> bool {
  313. auto int_result = CheckDigitSequence(int_part, radix);
  314. mantissa_needs_cleaning |= int_result.has_digit_separators;
  315. return int_result.ok;
  316. }
  317. // Check the fractional part (after the '.' and before the exponent, if any)
  318. // is valid.
  319. auto NumericLiteralToken::Parser::CheckFractionalPart() -> bool {
  320. if (IsInteger()) {
  321. return true;
  322. }
  323. if (radix == 2) {
  324. emitter.EmitError<BinaryRealLiteral>();
  325. recovered_from_error = true;
  326. // Carry on and parse the binary real literal anyway.
  327. }
  328. // We need to remove a '.' from the mantissa.
  329. mantissa_needs_cleaning = true;
  330. return CheckDigitSequence(fract_part, radix,
  331. /*allow_digit_separators=*/false)
  332. .ok;
  333. }
  334. // Check the exponent part (if any) is valid.
  335. auto NumericLiteralToken::Parser::CheckExponentPart() -> bool {
  336. if (literal.exponent == static_cast<int>(literal.text.size())) {
  337. return true;
  338. }
  339. char expected_exponent_kind = (radix == 10 ? 'e' : 'p');
  340. if (literal.text[literal.exponent] != expected_exponent_kind) {
  341. emitter.EmitError<WrongRealLiteralExponent>(
  342. {.expected = expected_exponent_kind});
  343. return false;
  344. }
  345. auto exponent_result = CheckDigitSequence(exponent_part, 10);
  346. exponent_needs_cleaning = exponent_result.has_digit_separators;
  347. return exponent_result.ok;
  348. }
  349. } // namespace Carbon