numeric_literal.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lexer/numeric_literal.h"
  5. #include <bitset>
  6. #include "common/check.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "llvm/Support/FormatVariadic.h"
  9. #include "toolchain/lexer/character_set.h"
  10. namespace Carbon {
  11. namespace {
  12. struct EmptyDigitSequence : DiagnosticBase<EmptyDigitSequence> {
  13. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  14. static constexpr llvm::StringLiteral Message =
  15. "Empty digit sequence in numeric literal.";
  16. };
  17. struct InvalidDigit : DiagnosticBase<InvalidDigit> {
  18. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  19. auto Format() -> std::string {
  20. return llvm::formatv(
  21. "Invalid digit '{0}' in {1} numeric literal.", digit,
  22. (radix == 2 ? "binary"
  23. : (radix == 16 ? "hexadecimal" : "decimal")))
  24. .str();
  25. }
  26. char digit;
  27. int radix;
  28. };
  29. struct InvalidDigitSeparator : DiagnosticBase<InvalidDigitSeparator> {
  30. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  31. static constexpr llvm::StringLiteral Message =
  32. "Misplaced digit separator in numeric literal.";
  33. };
  34. struct IrregularDigitSeparators : DiagnosticBase<IrregularDigitSeparators> {
  35. static constexpr llvm::StringLiteral ShortName =
  36. "syntax-irregular-digit-separators";
  37. auto Format() -> std::string {
  38. CHECK((radix == 10 || radix == 16)) << "unexpected radix: " << radix;
  39. return llvm::formatv(
  40. "Digit separators in {0} number should appear every {1} "
  41. "characters from the right.",
  42. (radix == 10 ? "decimal" : "hexadecimal"),
  43. (radix == 10 ? "3" : "4"))
  44. .str();
  45. }
  46. int radix;
  47. };
  48. struct UnknownBaseSpecifier : DiagnosticBase<UnknownBaseSpecifier> {
  49. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  50. static constexpr llvm::StringLiteral Message =
  51. "Unknown base specifier in numeric literal.";
  52. };
  53. struct BinaryRealLiteral : DiagnosticBase<BinaryRealLiteral> {
  54. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  55. static constexpr llvm::StringLiteral Message =
  56. "Binary real number literals are not supported.";
  57. };
  58. struct WrongRealLiteralExponent : DiagnosticBase<WrongRealLiteralExponent> {
  59. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  60. auto Format() -> std::string {
  61. return llvm::formatv("Expected '{0}' to introduce exponent.", expected)
  62. .str();
  63. }
  64. char expected;
  65. };
  66. struct TooManyDigits : DiagnosticBase<TooManyDigits> {
  67. static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
  68. auto Format() -> std::string {
  69. return llvm::formatv(
  70. "Found a sequence of {0} digits, which is greater than the "
  71. "limit of {1}.",
  72. count, limit)
  73. .str();
  74. }
  75. size_t count;
  76. size_t limit;
  77. };
  78. } // namespace
  79. auto LexedNumericLiteral::Lex(llvm::StringRef source_text)
  80. -> llvm::Optional<LexedNumericLiteral> {
  81. LexedNumericLiteral result;
  82. if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
  83. return llvm::None;
  84. }
  85. bool seen_plus_minus = false;
  86. bool seen_radix_point = false;
  87. bool seen_potential_exponent = false;
  88. // Greedily consume all following characters that might be part of a numeric
  89. // literal. This allows us to produce better diagnostics on invalid literals.
  90. //
  91. // TODO(zygoloid): Update lexical rules to specify that a numeric literal
  92. // cannot be immediately followed by an alphanumeric character.
  93. int i = 1, n = source_text.size();
  94. for (; i != n; ++i) {
  95. char c = source_text[i];
  96. if (IsAlnum(c) || c == '_') {
  97. if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
  98. result.exponent_ = i;
  99. seen_potential_exponent = true;
  100. }
  101. continue;
  102. }
  103. // Exactly one `.` can be part of the literal, but only if it's followed by
  104. // an alphanumeric character.
  105. if (c == '.' && i + 1 != n && IsAlnum(source_text[i + 1]) &&
  106. !seen_radix_point) {
  107. result.radix_point_ = i;
  108. seen_radix_point = true;
  109. continue;
  110. }
  111. // A `+` or `-` continues the literal only if it's preceded by a lowercase
  112. // letter (which will be 'e' or 'p' or part of an invalid literal) and
  113. // followed by an alphanumeric character. This '+' or '-' cannot be an
  114. // operator because a literal cannot end in a lowercase letter.
  115. if ((c == '+' || c == '-') && seen_potential_exponent &&
  116. result.exponent_ == i - 1 && i + 1 != n &&
  117. IsAlnum(source_text[i + 1])) {
  118. // This is not possible because we don't update result.exponent after we
  119. // see a '+' or '-'.
  120. CHECK(!seen_plus_minus) << "should only consume one + or -";
  121. seen_plus_minus = true;
  122. continue;
  123. }
  124. break;
  125. }
  126. result.text_ = source_text.substr(0, i);
  127. if (!seen_radix_point) {
  128. result.radix_point_ = i;
  129. }
  130. if (!seen_potential_exponent) {
  131. result.exponent_ = i;
  132. }
  133. return result;
  134. }
  135. // Parser for numeric literal tokens.
  136. //
  137. // Responsible for checking that a numeric literal is valid and meaningful and
  138. // either diagnosing or extracting its meaning.
  139. class LexedNumericLiteral::Parser {
  140. public:
  141. Parser(DiagnosticEmitter<const char*>& emitter, LexedNumericLiteral literal);
  142. auto IsInteger() -> bool {
  143. return literal_.radix_point_ == static_cast<int>(literal_.text_.size());
  144. }
  145. // Check that the numeric literal token is syntactically valid and
  146. // meaningful, and diagnose if not. Returns `true` if the token was
  147. // sufficiently valid that we could determine its meaning. If `false` is
  148. // returned, a diagnostic has already been issued.
  149. auto Check() -> bool;
  150. // Get the radix of this token. One of 2, 10, or 16.
  151. auto GetRadix() -> int { return radix_; }
  152. // Get the mantissa of this token's value.
  153. auto GetMantissa() -> llvm::APInt;
  154. // Get the exponent of this token's value. This is always zero for an integer
  155. // literal.
  156. auto GetExponent() -> llvm::APInt;
  157. private:
  158. struct CheckDigitSequenceResult {
  159. bool ok;
  160. bool has_digit_separators = false;
  161. };
  162. auto CheckDigitSequence(llvm::StringRef text, int radix,
  163. bool allow_digit_separators = true)
  164. -> CheckDigitSequenceResult;
  165. auto CheckDigitSeparatorPlacement(llvm::StringRef text, int radix,
  166. int num_digit_separators) -> void;
  167. auto CheckLeadingZero() -> bool;
  168. auto CheckIntPart() -> bool;
  169. auto CheckFractionalPart() -> bool;
  170. auto CheckExponentPart() -> bool;
  171. DiagnosticEmitter<const char*>& emitter_;
  172. LexedNumericLiteral literal_;
  173. // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
  174. // or '0x', respectively.
  175. int radix_ = 10;
  176. // The various components of a numeric literal:
  177. //
  178. // [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
  179. llvm::StringRef int_part_;
  180. llvm::StringRef fract_part_;
  181. llvm::StringRef exponent_part_;
  182. // Do we need to remove any special characters (digit separator or radix
  183. // point) before interpreting the mantissa or exponent as an integer?
  184. bool mantissa_needs_cleaning_ = false;
  185. bool exponent_needs_cleaning_ = false;
  186. // True if we found a `-` before `exponent_part`.
  187. bool exponent_is_negative_ = false;
  188. };
  189. LexedNumericLiteral::Parser::Parser(DiagnosticEmitter<const char*>& emitter,
  190. LexedNumericLiteral literal)
  191. : emitter_(emitter), literal_(literal) {
  192. int_part_ = literal.text_.substr(0, literal.radix_point_);
  193. if (int_part_.consume_front("0x")) {
  194. radix_ = 16;
  195. } else if (int_part_.consume_front("0b")) {
  196. radix_ = 2;
  197. }
  198. fract_part_ = literal.text_.substr(
  199. literal.radix_point_ + 1, literal.exponent_ - literal.radix_point_ - 1);
  200. exponent_part_ = literal.text_.substr(literal.exponent_ + 1);
  201. if (!exponent_part_.consume_front("+")) {
  202. exponent_is_negative_ = exponent_part_.consume_front("-");
  203. }
  204. }
  205. // Check that the numeric literal token is syntactically valid and meaningful,
  206. // and diagnose if not.
  207. auto LexedNumericLiteral::Parser::Check() -> bool {
  208. return CheckLeadingZero() && CheckIntPart() && CheckFractionalPart() &&
  209. CheckExponentPart();
  210. }
  211. // Parse a string that is known to be a valid base-radix integer into an
  212. // APInt. If needs_cleaning is true, the string may additionally contain '_'
  213. // and '.' characters that should be ignored.
  214. //
  215. // Ignoring '.' is used when parsing a real literal. For example, when
  216. // parsing 123.456e7, we want to decompose it into an integer mantissa
  217. // (123456) and an exponent (7 - 3 = 2), and this routine is given the
  218. // "123.456" to parse as the mantissa.
  219. static auto ParseInteger(llvm::StringRef digits, int radix, bool needs_cleaning)
  220. -> llvm::APInt {
  221. llvm::SmallString<32> cleaned;
  222. if (needs_cleaning) {
  223. cleaned.reserve(digits.size());
  224. std::remove_copy_if(digits.begin(), digits.end(),
  225. std::back_inserter(cleaned),
  226. [](char c) { return c == '_' || c == '.'; });
  227. digits = cleaned;
  228. }
  229. llvm::APInt value;
  230. if (digits.getAsInteger(radix, value)) {
  231. llvm_unreachable("should never fail");
  232. }
  233. return value;
  234. }
  235. auto LexedNumericLiteral::Parser::GetMantissa() -> llvm::APInt {
  236. const char* end = IsInteger() ? int_part_.end() : fract_part_.end();
  237. llvm::StringRef digits(int_part_.begin(), end - int_part_.begin());
  238. return ParseInteger(digits, radix_, mantissa_needs_cleaning_);
  239. }
  240. auto LexedNumericLiteral::Parser::GetExponent() -> llvm::APInt {
  241. // Compute the effective exponent from the specified exponent, if any,
  242. // and the position of the radix point.
  243. llvm::APInt exponent(64, 0);
  244. if (!exponent_part_.empty()) {
  245. exponent = ParseInteger(exponent_part_, 10, exponent_needs_cleaning_);
  246. // The exponent is a signed integer, and the number we just parsed is
  247. // non-negative, so ensure we have a wide enough representation to
  248. // include a sign bit. Also make sure the exponent isn't too narrow so
  249. // the calculation below can't lose information through overflow.
  250. if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
  251. exponent = exponent.zext(std::max(64u, exponent.getBitWidth() + 1));
  252. }
  253. if (exponent_is_negative_) {
  254. exponent.negate();
  255. }
  256. }
  257. // Each character after the decimal point reduces the effective exponent.
  258. int excess_exponent = fract_part_.size();
  259. if (radix_ == 16) {
  260. excess_exponent *= 4;
  261. }
  262. exponent -= excess_exponent;
  263. if (exponent_is_negative_ && !exponent.isNegative()) {
  264. // We overflowed. Note that we can only overflow by a little, and only
  265. // from negative to positive, because exponent is at least 64 bits wide
  266. // and excess_exponent is bounded above by four times the size of the
  267. // input buffer, which we assume fits into 32 bits.
  268. exponent = exponent.zext(exponent.getBitWidth() + 1);
  269. exponent.setSignBit();
  270. }
  271. return exponent;
  272. }
  273. // Check that a digit sequence is valid: that it contains one or more digits,
  274. // contains only digits in the specified base, and that any digit separators
  275. // are present and correctly positioned.
  276. auto LexedNumericLiteral::Parser::CheckDigitSequence(
  277. llvm::StringRef text, int radix, bool allow_digit_separators)
  278. -> CheckDigitSequenceResult {
  279. CHECK((radix == 2 || radix == 10 || radix == 16))
  280. << "unknown radix: " << radix;
  281. std::bitset<256> valid_digits;
  282. if (radix == 2) {
  283. for (char c : "01") {
  284. valid_digits[static_cast<unsigned char>(c)] = true;
  285. }
  286. } else if (radix == 10) {
  287. for (char c : "0123456789") {
  288. valid_digits[static_cast<unsigned char>(c)] = true;
  289. }
  290. } else {
  291. for (char c : "0123456789ABCDEF") {
  292. valid_digits[static_cast<unsigned char>(c)] = true;
  293. }
  294. }
  295. int num_digit_separators = 0;
  296. for (int i = 0, n = text.size(); i != n; ++i) {
  297. char c = text[i];
  298. if (valid_digits[static_cast<unsigned char>(c)]) {
  299. continue;
  300. }
  301. if (c == '_') {
  302. // A digit separator cannot appear at the start of a digit sequence,
  303. // next to another digit separator, or at the end.
  304. if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
  305. i + 1 == n) {
  306. emitter_.EmitError<InvalidDigitSeparator>(text.begin() + i);
  307. }
  308. ++num_digit_separators;
  309. continue;
  310. }
  311. emitter_.EmitError<InvalidDigit>(text.begin() + i,
  312. {.digit = c, .radix = radix});
  313. return {.ok = false};
  314. }
  315. if (num_digit_separators == static_cast<int>(text.size())) {
  316. emitter_.EmitError<EmptyDigitSequence>(text.begin());
  317. return {.ok = false};
  318. }
  319. // Check that digit separators occur in exactly the expected positions.
  320. if (num_digit_separators) {
  321. CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
  322. }
  323. // llvm::getAsInteger is used for parsing, but it's quadratic and visibly slow
  324. // on large integer values. This limit exists to avoid hitting those limits.
  325. // Per https://github.com/carbon-language/carbon-lang/issues/980, it may be
  326. // feasible to optimize integer parsing in order to address performance if
  327. // this limit becomes an issue.
  328. //
  329. // 2^128 would be 39 decimal digits or 128 binary. In either case, this limit
  330. // is far above the threshold for normal integers.
  331. constexpr size_t DigitLimit = 1000;
  332. if (text.size() > DigitLimit) {
  333. emitter_.EmitError<TooManyDigits>(
  334. text.begin(), {.count = text.size(), .limit = DigitLimit});
  335. return {.ok = false};
  336. }
  337. return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
  338. }
  339. // Given a number with digit separators, check that the digit separators are
  340. // correctly positioned.
  341. auto LexedNumericLiteral::Parser::CheckDigitSeparatorPlacement(
  342. llvm::StringRef text, int radix, int num_digit_separators) -> void {
  343. CHECK(std::count(text.begin(), text.end(), '_') == num_digit_separators)
  344. << "given wrong number of digit separators: " << num_digit_separators;
  345. if (radix == 2) {
  346. // There are no restrictions on digit separator placement for binary
  347. // literals.
  348. return;
  349. }
  350. CHECK((radix == 10 || radix == 16))
  351. << "unexpected radix " << radix << " for digit separator checks";
  352. auto diagnose_irregular_digit_separators = [&]() {
  353. emitter_.EmitError<IrregularDigitSeparators>(text.begin(),
  354. {.radix = radix});
  355. };
  356. // For decimal and hexadecimal digit sequences, digit separators must form
  357. // groups of 3 or 4 digits (4 or 5 characters), respectively.
  358. int stride = (radix == 10 ? 4 : 5);
  359. int remaining_digit_separators = num_digit_separators;
  360. auto pos = text.end();
  361. while (pos - text.begin() >= stride) {
  362. pos -= stride;
  363. if (*pos != '_') {
  364. diagnose_irregular_digit_separators();
  365. return;
  366. }
  367. --remaining_digit_separators;
  368. }
  369. // Check there weren't any other digit separators.
  370. if (remaining_digit_separators) {
  371. diagnose_irregular_digit_separators();
  372. }
  373. };
  374. // Check that we don't have a '0' prefix on a non-zero decimal integer.
  375. auto LexedNumericLiteral::Parser::CheckLeadingZero() -> bool {
  376. if (radix_ == 10 && int_part_.startswith("0") && int_part_ != "0") {
  377. emitter_.EmitError<UnknownBaseSpecifier>(int_part_.begin());
  378. return false;
  379. }
  380. return true;
  381. }
  382. // Check the integer part (before the '.', if any) is valid.
  383. auto LexedNumericLiteral::Parser::CheckIntPart() -> bool {
  384. auto int_result = CheckDigitSequence(int_part_, radix_);
  385. mantissa_needs_cleaning_ |= int_result.has_digit_separators;
  386. return int_result.ok;
  387. }
  388. // Check the fractional part (after the '.' and before the exponent, if any)
  389. // is valid.
  390. auto LexedNumericLiteral::Parser::CheckFractionalPart() -> bool {
  391. if (IsInteger()) {
  392. return true;
  393. }
  394. if (radix_ == 2) {
  395. emitter_.EmitError<BinaryRealLiteral>(literal_.text_.begin() +
  396. literal_.radix_point_);
  397. // Carry on and parse the binary real literal anyway.
  398. }
  399. // We need to remove a '.' from the mantissa.
  400. mantissa_needs_cleaning_ = true;
  401. return CheckDigitSequence(fract_part_, radix_,
  402. /*allow_digit_separators=*/false)
  403. .ok;
  404. }
  405. // Check the exponent part (if any) is valid.
  406. auto LexedNumericLiteral::Parser::CheckExponentPart() -> bool {
  407. if (literal_.exponent_ == static_cast<int>(literal_.text_.size())) {
  408. return true;
  409. }
  410. char expected_exponent_kind = (radix_ == 10 ? 'e' : 'p');
  411. if (literal_.text_[literal_.exponent_] != expected_exponent_kind) {
  412. emitter_.EmitError<WrongRealLiteralExponent>(
  413. literal_.text_.begin() + literal_.exponent_,
  414. {.expected = expected_exponent_kind});
  415. return false;
  416. }
  417. auto exponent_result = CheckDigitSequence(exponent_part_, 10);
  418. exponent_needs_cleaning_ = exponent_result.has_digit_separators;
  419. return exponent_result.ok;
  420. }
  421. // Parse the token and compute its value.
  422. auto LexedNumericLiteral::ComputeValue(
  423. DiagnosticEmitter<const char*>& emitter) const -> Value {
  424. Parser parser(emitter, *this);
  425. if (!parser.Check()) {
  426. return UnrecoverableError();
  427. }
  428. if (parser.IsInteger()) {
  429. return IntegerValue{.value = parser.GetMantissa()};
  430. }
  431. return RealValue{.radix = (parser.GetRadix() == 10 ? 10 : 2),
  432. .mantissa = parser.GetMantissa(),
  433. .exponent = parser.GetExponent()};
  434. }
  435. } // namespace Carbon