numeric_literal.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/numeric_literal.h"
  5. #include <algorithm>
  6. #include <bitset>
  7. #include <iterator>
  8. #include <optional>
  9. #include "common/check.h"
  10. #include "llvm/ADT/StringExtras.h"
  11. #include "llvm/Support/FormatVariadicDetails.h"
  12. #include "toolchain/base/int.h"
  13. #include "toolchain/diagnostics/format_providers.h"
  14. #include "toolchain/lex/character_set.h"
  15. #include "toolchain/lex/helpers.h"
  16. namespace Carbon::Lex {
  17. auto NumericLiteral::Lex(llvm::StringRef source_text,
  18. bool can_form_real_literal)
  19. -> std::optional<NumericLiteral> {
  20. NumericLiteral result;
  21. if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
  22. return std::nullopt;
  23. }
  24. bool seen_plus_minus = false;
  25. bool seen_radix_point = false;
  26. bool seen_potential_exponent = false;
  27. // Greedily consume all following characters that might be part of a numeric
  28. // literal. This allows us to produce better diagnostics on invalid literals.
  29. //
  30. // TODO(zygoloid): Update lexical rules to specify that a numeric literal
  31. // cannot be immediately followed by an alphanumeric character.
  32. int i = 1;
  33. int n = source_text.size();
  34. for (; i != n; ++i) {
  35. char c = source_text[i];
  36. if (IsAlnum(c) || c == '_') {
  37. if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
  38. result.exponent_ = i;
  39. seen_potential_exponent = true;
  40. }
  41. continue;
  42. }
  43. // Exactly one `.` can be part of the literal, but only if it's followed by
  44. // an alphanumeric character.
  45. if (c == '.' && can_form_real_literal && i + 1 != n &&
  46. IsAlnum(source_text[i + 1]) && !seen_radix_point) {
  47. result.radix_point_ = i;
  48. seen_radix_point = true;
  49. continue;
  50. }
  51. // A `+` or `-` continues the literal only if it's preceded by a lowercase
  52. // letter (which will be 'e' or 'p' or part of an invalid literal) and
  53. // followed by an alphanumeric character. This '+' or '-' cannot be an
  54. // operator because a literal cannot end in a lowercase letter.
  55. if ((c == '+' || c == '-') && seen_potential_exponent &&
  56. result.exponent_ == i - 1 && i + 1 != n &&
  57. IsAlnum(source_text[i + 1])) {
  58. // This is not possible because we don't update result.exponent after we
  59. // see a '+' or '-'.
  60. CARBON_CHECK(!seen_plus_minus, "should only consume one + or -");
  61. seen_plus_minus = true;
  62. continue;
  63. }
  64. break;
  65. }
  66. result.text_ = source_text.substr(0, i);
  67. if (!seen_radix_point) {
  68. result.radix_point_ = i;
  69. }
  70. if (!seen_potential_exponent) {
  71. result.exponent_ = i;
  72. }
  73. return result;
  74. }
  75. // Parser for numeric literal tokens.
  76. //
  77. // Responsible for checking that a numeric literal is valid and meaningful and
  78. // either diagnosing or extracting its meaning.
  79. class NumericLiteral::Parser {
  80. public:
  81. Parser(Diagnostics::Emitter<const char*>& emitter, NumericLiteral literal);
  82. auto IsInt() -> bool {
  83. return literal_.radix_point_ == static_cast<int>(literal_.text_.size());
  84. }
  85. // Check that the numeric literal token is syntactically valid and
  86. // meaningful, and diagnose if not. Returns `true` if the token was
  87. // sufficiently valid that we could determine its meaning. If `false` is
  88. // returned, a diagnostic has already been issued.
  89. auto Check() -> bool;
  90. // Get the radix of this token. One of 2, 10, or 16.
  91. auto GetRadix() -> Radix { return radix_; }
  92. // Get the mantissa of this token's value.
  93. auto GetMantissa() -> llvm::APInt;
  94. // Get the exponent of this token's value. This is always zero for an integer
  95. // literal.
  96. auto GetExponent() -> llvm::APInt;
  97. private:
  98. struct CheckDigitSequenceResult {
  99. bool ok;
  100. bool has_digit_separators = false;
  101. };
  102. auto CheckDigitSequence(llvm::StringRef text, Radix radix,
  103. bool allow_digit_separators = true)
  104. -> CheckDigitSequenceResult;
  105. auto CheckLeadingZero() -> bool;
  106. auto CheckIntPart() -> bool;
  107. auto CheckFractionalPart() -> bool;
  108. auto CheckExponentPart() -> bool;
  109. Diagnostics::Emitter<const char*>& emitter_;
  110. NumericLiteral literal_;
  111. // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
  112. // or '0x', respectively.
  113. Radix radix_ = Radix::Decimal;
  114. // The various components of a numeric literal:
  115. //
  116. // [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
  117. llvm::StringRef int_part_;
  118. llvm::StringRef fract_part_;
  119. llvm::StringRef exponent_part_;
  120. // Do we need to remove any special characters (digit separator or radix
  121. // point) before interpreting the mantissa or exponent as an integer?
  122. bool mantissa_needs_cleaning_ = false;
  123. bool exponent_needs_cleaning_ = false;
  124. // True if we found a `-` before `exponent_part`.
  125. bool exponent_is_negative_ = false;
  126. };
  127. NumericLiteral::Parser::Parser(Diagnostics::Emitter<const char*>& emitter,
  128. NumericLiteral literal)
  129. : emitter_(emitter), literal_(literal) {
  130. int_part_ = literal.text_.substr(0, literal.radix_point_);
  131. if (int_part_.consume_front("0x")) {
  132. radix_ = Radix::Hexadecimal;
  133. } else if (int_part_.consume_front("0b")) {
  134. radix_ = Radix::Binary;
  135. } else if (int_part_.consume_front("0o")) {
  136. radix_ = Radix::Octal;
  137. }
  138. fract_part_ = literal.text_.substr(
  139. literal.radix_point_ + 1, literal.exponent_ - literal.radix_point_ - 1);
  140. exponent_part_ = literal.text_.substr(literal.exponent_ + 1);
  141. if (!exponent_part_.consume_front("+")) {
  142. exponent_is_negative_ = exponent_part_.consume_front("-");
  143. }
  144. }
  145. // Check that the numeric literal token is syntactically valid and meaningful,
  146. // and diagnose if not.
  147. auto NumericLiteral::Parser::Check() -> bool {
  148. return CheckLeadingZero() && CheckIntPart() && CheckFractionalPart() &&
  149. CheckExponentPart();
  150. }
  151. // Parses a binary integer literal.
  152. static auto ParseBinary(llvm::StringRef digits, bool is_signed) -> llvm::APInt {
  153. llvm::APInt value(
  154. std::max<int>(IntStore::MinAPWidth, digits.size() + is_signed), 0);
  155. int cursor = digits.size() - 1;
  156. for (char c : digits) {
  157. if (c == '1') {
  158. value.setBit(cursor);
  159. }
  160. --cursor;
  161. }
  162. return value;
  163. }
  164. // Parses an octal or hexadecimal integer literal.
  165. template <NumericLiteral::Radix Radix>
  166. requires(Radix == NumericLiteral::Radix::Hexadecimal ||
  167. Radix == NumericLiteral::Radix::Octal)
  168. static auto ParseOctalOrHexadecimal(llvm::StringRef digits, bool is_signed)
  169. -> llvm::APInt {
  170. constexpr int BitsPerDigit =
  171. Radix == NumericLiteral::Radix::Hexadecimal ? 4 : 3;
  172. llvm::APInt value(std::max<int>(IntStore::MinAPWidth,
  173. digits.size() * BitsPerDigit + is_signed),
  174. 0);
  175. int cursor = digits.size() * BitsPerDigit - 1;
  176. for (char c : digits) {
  177. uint8_t digit;
  178. if constexpr (Radix == NumericLiteral::Radix::Octal) {
  179. digit = c - '0';
  180. } else {
  181. digit = c <= '9' ? (c - '0') : (c - 'A' + 10);
  182. if (digit & 0x8) {
  183. value.setBit(cursor);
  184. }
  185. --cursor;
  186. }
  187. if (digit & 0x4) {
  188. value.setBit(cursor);
  189. }
  190. --cursor;
  191. if (digit & 0x2) {
  192. value.setBit(cursor);
  193. }
  194. --cursor;
  195. if (digit & 0x1) {
  196. value.setBit(cursor);
  197. }
  198. --cursor;
  199. }
  200. return value;
  201. }
  202. // Parses a single chunk of up to 19 decimal digits.
  203. static auto ParseDecimalChunk(llvm::StringRef digits) -> uint64_t {
  204. uint64_t chunk_val = 0;
  205. for (char c : digits) {
  206. chunk_val = chunk_val * 10 + (c - '0');
  207. }
  208. return chunk_val;
  209. }
  210. // Parsing decimals is complex because they're not a power of 2. We process it
  211. // 19 digits at a time because that's the most that fit into uint64_t, which
  212. // is APInt's internal unit for storage; chunking this way minimizes
  213. // cross-unit arithmetic.
  214. static auto ParseDecimal(llvm::StringRef digits, bool is_signed)
  215. -> llvm::APInt {
  216. // APInt performance scales based on the number of bits, so be precise.
  217. // TODO: Check if this can be `constexpr` when C++26 is in use.
  218. static const double bits_per_digit = std::log2(10);
  219. llvm::APInt value(
  220. std::max<int>(IntStore::MinAPWidth,
  221. std::ceil(digits.size() * bits_per_digit) + is_signed),
  222. 0);
  223. static constexpr int DigitsPerChunk = 19;
  224. // If there's only a few digits, we don't need the multiplication logic.
  225. if (digits.size() <= DigitsPerChunk) {
  226. value = ParseDecimalChunk(digits);
  227. return value;
  228. }
  229. // For the first chunk, we set it up so that all remaining chunks will
  230. // cause equivalent multiplications when adding in. This lets us only
  231. // compute the multiplier once.
  232. int first_chunk_size = digits.size() % DigitsPerChunk;
  233. if (first_chunk_size == 0) {
  234. first_chunk_size = DigitsPerChunk;
  235. }
  236. value = ParseDecimalChunk(digits.take_front(first_chunk_size));
  237. digits = digits.drop_front(first_chunk_size);
  238. // For each remaining chunk, multiply the value by 10^19 and add the
  239. // chunk value.
  240. static constexpr uint64_t Mult = 10'000'000'000'000'000'000ULL;
  241. for (; !digits.empty(); digits = digits.drop_front(DigitsPerChunk)) {
  242. value *= Mult;
  243. value += ParseDecimalChunk(digits.take_front(DigitsPerChunk));
  244. }
  245. return value;
  246. }
  247. // Parse a string that is known to be a valid base-radix integer into an
  248. // APInt. If `needs_cleaning` is true, the string may additionally contain '_'
  249. // and '.' characters that should be ignored. If `is_signed` is true, a bit is
  250. // kept unused for the sign.
  251. //
  252. // Ignoring '.' is used when parsing a real literal. For example, when
  253. // parsing 123.456e7, we want to decompose it into an integer mantissa
  254. // (123456) and an exponent (7 - 3 = 4), and this routine is given the
  255. // "123.456" to parse as the mantissa.
  256. static auto ParseInt(llvm::StringRef digits, NumericLiteral::Radix radix,
  257. bool needs_cleaning, bool is_signed) -> llvm::APInt {
  258. llvm::SmallString<32> cleaned;
  259. if (needs_cleaning) {
  260. cleaned.reserve(digits.size());
  261. llvm::copy_if(digits, std::back_inserter(cleaned),
  262. [](char c) { return c != '_' && c != '.'; });
  263. digits = cleaned;
  264. }
  265. digits = digits.ltrim('0');
  266. // We don't use LLVM's `getAsInteger` because it has poor performance.
  267. // Instead, we implement our own.
  268. switch (radix) {
  269. case NumericLiteral::Radix::Binary:
  270. return ParseBinary(digits, is_signed);
  271. case NumericLiteral::Radix::Octal:
  272. return ParseOctalOrHexadecimal<NumericLiteral::Radix::Octal>(digits,
  273. is_signed);
  274. case NumericLiteral::Radix::Decimal:
  275. return ParseDecimal(digits, is_signed);
  276. case NumericLiteral::Radix::Hexadecimal:
  277. return ParseOctalOrHexadecimal<NumericLiteral::Radix::Hexadecimal>(
  278. digits, is_signed);
  279. }
  280. }
  281. auto NumericLiteral::Parser::GetMantissa() -> llvm::APInt {
  282. const char* end = IsInt() ? int_part_.end() : fract_part_.end();
  283. llvm::StringRef digits(int_part_.begin(), end - int_part_.begin());
  284. return ParseInt(digits, radix_, mantissa_needs_cleaning_,
  285. /*is_signed=*/false);
  286. }
  287. auto NumericLiteral::Parser::GetExponent() -> llvm::APInt {
  288. // Compute the effective exponent from the specified exponent, if any,
  289. // and the position of the radix point.
  290. llvm::APInt exponent(IntStore::MinAPWidth, 0);
  291. if (!exponent_part_.empty()) {
  292. exponent = ParseInt(exponent_part_, Radix::Decimal,
  293. exponent_needs_cleaning_, /*is_signed=*/true);
  294. if (exponent_is_negative_) {
  295. exponent.negate();
  296. }
  297. }
  298. // Each character after the decimal point reduces the effective exponent.
  299. int excess_exponent = fract_part_.size();
  300. if (radix_ == Radix::Hexadecimal) {
  301. excess_exponent *= 4;
  302. } else if (radix_ == Radix::Octal) {
  303. excess_exponent *= 3;
  304. }
  305. exponent -= excess_exponent;
  306. CARBON_CHECK(exponent.getBitWidth() >= 64, "overflow requires high width");
  307. if (exponent_is_negative_ && !exponent.isNegative()) {
  308. // We overflowed. Note that we can only overflow by a little, and only
  309. // from negative to positive, because exponent is at least 64 bits wide
  310. // and excess_exponent is bounded above by four times the size of the
  311. // input buffer, which we assume fits into 32 bits.
  312. exponent = exponent.zext(exponent.getBitWidth() + 1);
  313. exponent.setSignBit();
  314. }
  315. return exponent;
  316. }
  317. // Check that a digit sequence is valid: that it contains one or more digits,
  318. // contains only digits in the specified base, and that any digit separators
  319. // are present and correctly positioned.
  320. auto NumericLiteral::Parser::CheckDigitSequence(llvm::StringRef text,
  321. Radix radix,
  322. bool allow_digit_separators)
  323. -> CheckDigitSequenceResult {
  324. std::bitset<256> valid_digits;
  325. static constexpr llvm::StringLiteral Digits = "0123456789ABCDEF";
  326. for (char c : Digits.take_front(static_cast<int>(radix))) {
  327. valid_digits[static_cast<unsigned char>(c)] = true;
  328. }
  329. int num_digit_separators = 0;
  330. for (int i = 0, n = text.size(); i != n; ++i) {
  331. char c = text[i];
  332. if (valid_digits[static_cast<unsigned char>(c)]) {
  333. continue;
  334. }
  335. if (c == '_') {
  336. // A digit separator cannot appear at the start of a digit sequence,
  337. // next to another digit separator, or at the end.
  338. if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
  339. i + 1 == n) {
  340. CARBON_DIAGNOSTIC(InvalidDigitSeparator, Error,
  341. "misplaced digit separator in numeric literal");
  342. emitter_.Emit(text.begin() + i, InvalidDigitSeparator);
  343. }
  344. ++num_digit_separators;
  345. continue;
  346. }
  347. CARBON_DIAGNOSTIC(InvalidDigit, Error,
  348. "invalid digit '{0}' in "
  349. "{1:=2:binary|=8:octal|=10:decimal|=16:hexadecimal} "
  350. "numeric literal",
  351. char, Diagnostics::IntAsSelect);
  352. emitter_.Emit(text.begin() + i, InvalidDigit, c, static_cast<int>(radix));
  353. return {.ok = false};
  354. }
  355. if (num_digit_separators == static_cast<int>(text.size())) {
  356. CARBON_DIAGNOSTIC(EmptyDigitSequence, Error,
  357. "empty digit sequence in numeric literal");
  358. emitter_.Emit(text.begin(), EmptyDigitSequence);
  359. return {.ok = false};
  360. }
  361. if (!CanLexInt(emitter_, text)) {
  362. return {.ok = false};
  363. }
  364. return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
  365. }
  366. // Check that we don't have a '0' prefix on a non-zero decimal integer.
  367. auto NumericLiteral::Parser::CheckLeadingZero() -> bool {
  368. if (radix_ == Radix::Decimal && int_part_.starts_with("0") &&
  369. int_part_ != "0") {
  370. CARBON_DIAGNOSTIC(UnknownBaseSpecifier, Error,
  371. "unknown base specifier in numeric literal");
  372. emitter_.Emit(int_part_.begin(), UnknownBaseSpecifier);
  373. return false;
  374. }
  375. return true;
  376. }
  377. // Check the integer part (before the '.', if any) is valid.
  378. auto NumericLiteral::Parser::CheckIntPart() -> bool {
  379. auto int_result = CheckDigitSequence(int_part_, radix_);
  380. mantissa_needs_cleaning_ |= int_result.has_digit_separators;
  381. return int_result.ok;
  382. }
  383. // Check the fractional part (after the '.' and before the exponent, if any)
  384. // is valid.
  385. auto NumericLiteral::Parser::CheckFractionalPart() -> bool {
  386. if (IsInt()) {
  387. return true;
  388. }
  389. if (radix_ == Radix::Binary || radix_ == Radix::Octal) {
  390. CARBON_DIAGNOSTIC(
  391. InvalidRealLiteralRadix, Error,
  392. "{0:=2:binary|=8:octal} real number literals are not supported",
  393. Diagnostics::IntAsSelect);
  394. emitter_.Emit(literal_.text_.begin() + literal_.radix_point_,
  395. InvalidRealLiteralRadix, static_cast<int>(radix_));
  396. // Carry on and parse the real literal anyway.
  397. }
  398. // We need to remove a '.' from the mantissa.
  399. mantissa_needs_cleaning_ = true;
  400. return CheckDigitSequence(fract_part_, radix_,
  401. /*allow_digit_separators=*/false)
  402. .ok;
  403. }
  404. // Check the exponent part (if any) is valid.
  405. auto NumericLiteral::Parser::CheckExponentPart() -> bool {
  406. if (literal_.exponent_ == static_cast<int>(literal_.text_.size())) {
  407. return true;
  408. }
  409. char expected_exponent_kind = (radix_ == Radix::Decimal ? 'e' : 'p');
  410. if (literal_.text_[literal_.exponent_] != expected_exponent_kind) {
  411. CARBON_DIAGNOSTIC(WrongRealLiteralExponent, Error,
  412. "expected '{0}' to introduce exponent", char);
  413. emitter_.Emit(literal_.text_.begin() + literal_.exponent_,
  414. WrongRealLiteralExponent, expected_exponent_kind);
  415. return false;
  416. }
  417. auto exponent_result = CheckDigitSequence(exponent_part_, Radix::Decimal);
  418. exponent_needs_cleaning_ = exponent_result.has_digit_separators;
  419. return exponent_result.ok;
  420. }
  421. // Parse the token and compute its value.
  422. auto NumericLiteral::ComputeValue(
  423. Diagnostics::Emitter<const char*>& emitter) const -> Value {
  424. Parser parser(emitter, *this);
  425. if (!parser.Check()) {
  426. return UnrecoverableError();
  427. }
  428. if (parser.IsInt()) {
  429. return IntValue{.value = parser.GetMantissa()};
  430. }
  431. return RealValue{
  432. .radix = (parser.GetRadix() == Radix::Decimal ? Radix::Decimal
  433. : Radix::Binary),
  434. .mantissa = parser.GetMantissa(),
  435. .exponent = parser.GetExponent()};
  436. }
  437. } // namespace Carbon::Lex