numeric_literal.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/numeric_literal.h"
  5. #include <algorithm>
  6. #include <bitset>
  7. #include <cmath>
  8. #include <iterator>
  9. #include <optional>
  10. #include "common/check.h"
  11. #include "llvm/ADT/StringExtras.h"
  12. #include "llvm/Support/FormatVariadicDetails.h"
  13. #include "toolchain/base/int.h"
  14. #include "toolchain/diagnostics/format_providers.h"
  15. #include "toolchain/lex/character_set.h"
  16. #include "toolchain/lex/helpers.h"
  17. namespace Carbon::Lex {
  18. auto NumericLiteral::Lex(llvm::StringRef source_text,
  19. bool can_form_real_literal)
  20. -> std::optional<NumericLiteral> {
  21. NumericLiteral result;
  22. if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
  23. return std::nullopt;
  24. }
  25. bool seen_plus_minus = false;
  26. bool seen_radix_point = false;
  27. bool seen_potential_exponent = false;
  28. // Greedily consume all following characters that might be part of a numeric
  29. // literal. This allows us to produce better diagnostics on invalid literals.
  30. //
  31. // TODO(zygoloid): Update lexical rules to specify that a numeric literal
  32. // cannot be immediately followed by an alphanumeric character.
  33. int i = 1;
  34. int n = source_text.size();
  35. for (; i != n; ++i) {
  36. char c = source_text[i];
  37. if (IsAlnum(c) || c == '_') {
  38. if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
  39. result.exponent_ = i;
  40. seen_potential_exponent = true;
  41. }
  42. continue;
  43. }
  44. // Exactly one `.` can be part of the literal, but only if it's followed by
  45. // an alphanumeric character.
  46. if (c == '.' && can_form_real_literal && i + 1 != n &&
  47. IsAlnum(source_text[i + 1]) && !seen_radix_point) {
  48. result.radix_point_ = i;
  49. seen_radix_point = true;
  50. continue;
  51. }
  52. // A `+` or `-` continues the literal only if it's preceded by a lowercase
  53. // letter (which will be 'e' or 'p' or part of an invalid literal) and
  54. // followed by an alphanumeric character. This '+' or '-' cannot be an
  55. // operator because a literal cannot end in a lowercase letter.
  56. if ((c == '+' || c == '-') && seen_potential_exponent &&
  57. result.exponent_ == i - 1 && i + 1 != n &&
  58. IsAlnum(source_text[i + 1])) {
  59. // This is not possible because we don't update result.exponent after we
  60. // see a '+' or '-'.
  61. CARBON_CHECK(!seen_plus_minus, "should only consume one + or -");
  62. seen_plus_minus = true;
  63. continue;
  64. }
  65. break;
  66. }
  67. result.text_ = source_text.substr(0, i);
  68. if (!seen_radix_point) {
  69. result.radix_point_ = i;
  70. }
  71. if (!seen_potential_exponent) {
  72. result.exponent_ = i;
  73. }
  74. return result;
  75. }
  76. // Parser for numeric literal tokens.
  77. //
  78. // Responsible for checking that a numeric literal is valid and meaningful and
  79. // either diagnosing or extracting its meaning.
  80. class NumericLiteral::Parser {
  81. public:
  82. Parser(Diagnostics::Emitter<const char*>& emitter, NumericLiteral literal);
  83. auto IsInt() -> bool {
  84. return literal_.radix_point_ == static_cast<int>(literal_.text_.size());
  85. }
  86. // Check that the numeric literal token is syntactically valid and
  87. // meaningful, and diagnose if not. Returns `true` if the token was
  88. // sufficiently valid that we could determine its meaning. If `false` is
  89. // returned, a diagnostic has already been issued.
  90. auto Check() -> bool;
  91. // Get the radix of this token. One of 2, 10, or 16.
  92. auto GetRadix() -> Radix { return radix_; }
  93. // Get the mantissa of this token's value.
  94. auto GetMantissa() -> llvm::APInt;
  95. // Get the exponent of this token's value. This is always zero for an integer
  96. // literal.
  97. auto GetExponent() -> llvm::APInt;
  98. private:
  99. struct CheckDigitSequenceResult {
  100. bool ok;
  101. bool has_digit_separators = false;
  102. };
  103. auto CheckDigitSequence(llvm::StringRef text, Radix radix,
  104. bool allow_digit_separators = true)
  105. -> CheckDigitSequenceResult;
  106. auto CheckLeadingZero() -> bool;
  107. auto CheckIntPart() -> bool;
  108. auto CheckFractionalPart() -> bool;
  109. auto CheckExponentPart() -> bool;
  110. Diagnostics::Emitter<const char*>& emitter_;
  111. NumericLiteral literal_;
  112. // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
  113. // or '0x', respectively.
  114. Radix radix_ = Radix::Decimal;
  115. // The various components of a numeric literal:
  116. //
  117. // [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
  118. llvm::StringRef int_part_;
  119. llvm::StringRef fract_part_;
  120. llvm::StringRef exponent_part_;
  121. // Do we need to remove any special characters (digit separator or radix
  122. // point) before interpreting the mantissa or exponent as an integer?
  123. bool mantissa_needs_cleaning_ = false;
  124. bool exponent_needs_cleaning_ = false;
  125. // True if we found a `-` before `exponent_part`.
  126. bool exponent_is_negative_ = false;
  127. };
  128. NumericLiteral::Parser::Parser(Diagnostics::Emitter<const char*>& emitter,
  129. NumericLiteral literal)
  130. : emitter_(emitter), literal_(literal) {
  131. int_part_ = literal.text_.substr(0, literal.radix_point_);
  132. if (int_part_.consume_front("0x")) {
  133. radix_ = Radix::Hexadecimal;
  134. } else if (int_part_.consume_front("0b")) {
  135. radix_ = Radix::Binary;
  136. } else if (int_part_.consume_front("0o")) {
  137. radix_ = Radix::Octal;
  138. }
  139. fract_part_ = literal.text_.substr(
  140. literal.radix_point_ + 1, literal.exponent_ - literal.radix_point_ - 1);
  141. exponent_part_ = literal.text_.substr(literal.exponent_ + 1);
  142. if (!exponent_part_.consume_front("+")) {
  143. exponent_is_negative_ = exponent_part_.consume_front("-");
  144. }
  145. }
  146. // Check that the numeric literal token is syntactically valid and meaningful,
  147. // and diagnose if not.
  148. auto NumericLiteral::Parser::Check() -> bool {
  149. return CheckLeadingZero() && CheckIntPart() && CheckFractionalPart() &&
  150. CheckExponentPart();
  151. }
  152. // Parses a binary integer literal.
  153. static auto ParseBinary(llvm::StringRef digits, bool is_signed) -> llvm::APInt {
  154. llvm::APInt value(
  155. std::max<int>(IntStore::MinAPWidth, digits.size() + is_signed), 0);
  156. int cursor = digits.size() - 1;
  157. for (char c : digits) {
  158. if (c == '1') {
  159. value.setBit(cursor);
  160. }
  161. --cursor;
  162. }
  163. return value;
  164. }
  165. // Parses an octal or hexadecimal integer literal.
  166. template <NumericLiteral::Radix Radix>
  167. requires(Radix == NumericLiteral::Radix::Hexadecimal ||
  168. Radix == NumericLiteral::Radix::Octal)
  169. static auto ParseOctalOrHexadecimal(llvm::StringRef digits, bool is_signed)
  170. -> llvm::APInt {
  171. constexpr int BitsPerDigit =
  172. Radix == NumericLiteral::Radix::Hexadecimal ? 4 : 3;
  173. llvm::APInt value(std::max<int>(IntStore::MinAPWidth,
  174. digits.size() * BitsPerDigit + is_signed),
  175. 0);
  176. int cursor = digits.size() * BitsPerDigit - 1;
  177. for (char c : digits) {
  178. uint8_t digit;
  179. if constexpr (Radix == NumericLiteral::Radix::Octal) {
  180. digit = c - '0';
  181. } else {
  182. digit = c <= '9' ? (c - '0') : (c - 'A' + 10);
  183. if (digit & 0x8) {
  184. value.setBit(cursor);
  185. }
  186. --cursor;
  187. }
  188. if (digit & 0x4) {
  189. value.setBit(cursor);
  190. }
  191. --cursor;
  192. if (digit & 0x2) {
  193. value.setBit(cursor);
  194. }
  195. --cursor;
  196. if (digit & 0x1) {
  197. value.setBit(cursor);
  198. }
  199. --cursor;
  200. }
  201. return value;
  202. }
  203. // Parses a single chunk of up to 19 decimal digits.
  204. static auto ParseDecimalChunk(llvm::StringRef digits) -> uint64_t {
  205. uint64_t chunk_val = 0;
  206. for (char c : digits) {
  207. chunk_val = chunk_val * 10 + (c - '0');
  208. }
  209. return chunk_val;
  210. }
  211. // Parsing decimals is complex because they're not a power of 2. We process it
  212. // 19 digits at a time because that's the most that fit into uint64_t, which
  213. // is APInt's internal unit for storage; chunking this way minimizes
  214. // cross-unit arithmetic.
  215. static auto ParseDecimal(llvm::StringRef digits, bool is_signed)
  216. -> llvm::APInt {
  217. // APInt performance scales based on the number of bits, so be precise.
  218. // TODO: Check if this can be `constexpr` when C++26 is in use.
  219. static const double bits_per_digit = std::log2(10);
  220. llvm::APInt value(
  221. std::max<int>(IntStore::MinAPWidth,
  222. std::ceil(digits.size() * bits_per_digit) + is_signed),
  223. 0);
  224. static constexpr int DigitsPerChunk = 19;
  225. // If there's only a few digits, we don't need the multiplication logic.
  226. if (digits.size() <= DigitsPerChunk) {
  227. value = ParseDecimalChunk(digits);
  228. return value;
  229. }
  230. // For the first chunk, we set it up so that all remaining chunks will
  231. // cause equivalent multiplications when adding in. This lets us only
  232. // compute the multiplier once.
  233. int first_chunk_size = digits.size() % DigitsPerChunk;
  234. if (first_chunk_size == 0) {
  235. first_chunk_size = DigitsPerChunk;
  236. }
  237. value = ParseDecimalChunk(digits.take_front(first_chunk_size));
  238. digits = digits.drop_front(first_chunk_size);
  239. // For each remaining chunk, multiply the value by 10^19 and add the
  240. // chunk value.
  241. static constexpr uint64_t Mult = 10'000'000'000'000'000'000ULL;
  242. for (; !digits.empty(); digits = digits.drop_front(DigitsPerChunk)) {
  243. value *= Mult;
  244. value += ParseDecimalChunk(digits.take_front(DigitsPerChunk));
  245. }
  246. return value;
  247. }
  248. // Parse a string that is known to be a valid base-radix integer into an
  249. // APInt. If `needs_cleaning` is true, the string may additionally contain '_'
  250. // and '.' characters that should be ignored. If `is_signed` is true, a bit is
  251. // kept unused for the sign.
  252. //
  253. // Ignoring '.' is used when parsing a real literal. For example, when
  254. // parsing 123.456e7, we want to decompose it into an integer mantissa
  255. // (123456) and an exponent (7 - 3 = 4), and this routine is given the
  256. // "123.456" to parse as the mantissa.
  257. static auto ParseInt(llvm::StringRef digits, NumericLiteral::Radix radix,
  258. bool needs_cleaning, bool is_signed) -> llvm::APInt {
  259. llvm::SmallString<32> cleaned;
  260. if (needs_cleaning) {
  261. cleaned.reserve(digits.size());
  262. llvm::copy_if(digits, std::back_inserter(cleaned),
  263. [](char c) { return c != '_' && c != '.'; });
  264. digits = cleaned;
  265. }
  266. digits = digits.ltrim('0');
  267. // We don't use LLVM's `getAsInteger` because it has poor performance.
  268. // Instead, we implement our own.
  269. switch (radix) {
  270. case NumericLiteral::Radix::Binary:
  271. return ParseBinary(digits, is_signed);
  272. case NumericLiteral::Radix::Octal:
  273. return ParseOctalOrHexadecimal<NumericLiteral::Radix::Octal>(digits,
  274. is_signed);
  275. case NumericLiteral::Radix::Decimal:
  276. return ParseDecimal(digits, is_signed);
  277. case NumericLiteral::Radix::Hexadecimal:
  278. return ParseOctalOrHexadecimal<NumericLiteral::Radix::Hexadecimal>(
  279. digits, is_signed);
  280. }
  281. }
  282. auto NumericLiteral::Parser::GetMantissa() -> llvm::APInt {
  283. const char* end = IsInt() ? int_part_.end() : fract_part_.end();
  284. llvm::StringRef digits(int_part_.begin(), end - int_part_.begin());
  285. return ParseInt(digits, radix_, mantissa_needs_cleaning_,
  286. /*is_signed=*/false);
  287. }
  288. auto NumericLiteral::Parser::GetExponent() -> llvm::APInt {
  289. // Compute the effective exponent from the specified exponent, if any,
  290. // and the position of the radix point.
  291. llvm::APInt exponent(IntStore::MinAPWidth, 0);
  292. if (!exponent_part_.empty()) {
  293. exponent = ParseInt(exponent_part_, Radix::Decimal,
  294. exponent_needs_cleaning_, /*is_signed=*/true);
  295. if (exponent_is_negative_) {
  296. exponent.negate();
  297. }
  298. }
  299. // Each character after the decimal point reduces the effective exponent.
  300. int excess_exponent = fract_part_.size();
  301. if (radix_ == Radix::Hexadecimal) {
  302. excess_exponent *= 4;
  303. } else if (radix_ == Radix::Octal) {
  304. excess_exponent *= 3;
  305. }
  306. exponent -= excess_exponent;
  307. CARBON_CHECK(exponent.getBitWidth() >= 64, "overflow requires high width");
  308. if (exponent_is_negative_ && !exponent.isNegative()) {
  309. // We overflowed. Note that we can only overflow by a little, and only
  310. // from negative to positive, because exponent is at least 64 bits wide
  311. // and excess_exponent is bounded above by four times the size of the
  312. // input buffer, which we assume fits into 32 bits.
  313. exponent = exponent.zext(exponent.getBitWidth() + 1);
  314. exponent.setSignBit();
  315. }
  316. return exponent;
  317. }
  318. // Check that a digit sequence is valid: that it contains one or more digits,
  319. // contains only digits in the specified base, and that any digit separators
  320. // are present and correctly positioned.
  321. auto NumericLiteral::Parser::CheckDigitSequence(llvm::StringRef text,
  322. Radix radix,
  323. bool allow_digit_separators)
  324. -> CheckDigitSequenceResult {
  325. std::bitset<256> valid_digits;
  326. static constexpr llvm::StringLiteral Digits = "0123456789ABCDEF";
  327. for (char c : Digits.take_front(static_cast<int>(radix))) {
  328. valid_digits[static_cast<unsigned char>(c)] = true;
  329. }
  330. int num_digit_separators = 0;
  331. for (int i = 0, n = text.size(); i != n; ++i) {
  332. char c = text[i];
  333. if (valid_digits[static_cast<unsigned char>(c)]) {
  334. continue;
  335. }
  336. if (c == '_') {
  337. // A digit separator cannot appear at the start of a digit sequence,
  338. // next to another digit separator, or at the end.
  339. if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
  340. i + 1 == n) {
  341. CARBON_DIAGNOSTIC(InvalidDigitSeparator, Error,
  342. "misplaced digit separator in numeric literal");
  343. emitter_.Emit(text.begin() + i, InvalidDigitSeparator);
  344. }
  345. ++num_digit_separators;
  346. continue;
  347. }
  348. CARBON_DIAGNOSTIC(InvalidDigit, Error,
  349. "invalid digit '{0}' in "
  350. "{1:=2:binary|=8:octal|=10:decimal|=16:hexadecimal} "
  351. "numeric literal",
  352. char, Diagnostics::IntAsSelect);
  353. emitter_.Emit(text.begin() + i, InvalidDigit, c, static_cast<int>(radix));
  354. return {.ok = false};
  355. }
  356. if (num_digit_separators == static_cast<int>(text.size())) {
  357. CARBON_DIAGNOSTIC(EmptyDigitSequence, Error,
  358. "empty digit sequence in numeric literal");
  359. emitter_.Emit(text.begin(), EmptyDigitSequence);
  360. return {.ok = false};
  361. }
  362. if (!CanLexInt(emitter_, text)) {
  363. return {.ok = false};
  364. }
  365. return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
  366. }
  367. // Check that we don't have a '0' prefix on a non-zero decimal integer.
  368. auto NumericLiteral::Parser::CheckLeadingZero() -> bool {
  369. if (radix_ == Radix::Decimal && int_part_.starts_with("0") &&
  370. int_part_ != "0") {
  371. CARBON_DIAGNOSTIC(UnknownBaseSpecifier, Error,
  372. "unknown base specifier in numeric literal");
  373. emitter_.Emit(int_part_.begin(), UnknownBaseSpecifier);
  374. return false;
  375. }
  376. return true;
  377. }
  378. // Check the integer part (before the '.', if any) is valid.
  379. auto NumericLiteral::Parser::CheckIntPart() -> bool {
  380. auto int_result = CheckDigitSequence(int_part_, radix_);
  381. mantissa_needs_cleaning_ |= int_result.has_digit_separators;
  382. return int_result.ok;
  383. }
  384. // Check the fractional part (after the '.' and before the exponent, if any)
  385. // is valid.
  386. auto NumericLiteral::Parser::CheckFractionalPart() -> bool {
  387. if (IsInt()) {
  388. return true;
  389. }
  390. if (radix_ == Radix::Binary || radix_ == Radix::Octal) {
  391. CARBON_DIAGNOSTIC(
  392. InvalidRealLiteralRadix, Error,
  393. "{0:=2:binary|=8:octal} real number literals are not supported",
  394. Diagnostics::IntAsSelect);
  395. emitter_.Emit(literal_.text_.begin() + literal_.radix_point_,
  396. InvalidRealLiteralRadix, static_cast<int>(radix_));
  397. // Carry on and parse the real literal anyway.
  398. }
  399. // We need to remove a '.' from the mantissa.
  400. mantissa_needs_cleaning_ = true;
  401. return CheckDigitSequence(fract_part_, radix_,
  402. /*allow_digit_separators=*/false)
  403. .ok;
  404. }
  405. // Check the exponent part (if any) is valid.
  406. auto NumericLiteral::Parser::CheckExponentPart() -> bool {
  407. if (literal_.exponent_ == static_cast<int>(literal_.text_.size())) {
  408. return true;
  409. }
  410. char expected_exponent_kind = (radix_ == Radix::Decimal ? 'e' : 'p');
  411. if (literal_.text_[literal_.exponent_] != expected_exponent_kind) {
  412. CARBON_DIAGNOSTIC(WrongRealLiteralExponent, Error,
  413. "expected '{0}' to introduce exponent", char);
  414. emitter_.Emit(literal_.text_.begin() + literal_.exponent_,
  415. WrongRealLiteralExponent, expected_exponent_kind);
  416. return false;
  417. }
  418. auto exponent_result = CheckDigitSequence(exponent_part_, Radix::Decimal);
  419. exponent_needs_cleaning_ = exponent_result.has_digit_separators;
  420. return exponent_result.ok;
  421. }
  422. // Parse the token and compute its value.
  423. auto NumericLiteral::ComputeValue(
  424. Diagnostics::Emitter<const char*>& emitter) const -> Value {
  425. Parser parser(emitter, *this);
  426. if (!parser.Check()) {
  427. return UnrecoverableError();
  428. }
  429. if (parser.IsInt()) {
  430. return IntValue{.value = parser.GetMantissa()};
  431. }
  432. return RealValue{
  433. .radix = (parser.GetRadix() == Radix::Decimal ? Radix::Decimal
  434. : Radix::Binary),
  435. .mantissa = parser.GetMantissa(),
  436. .exponent = parser.GetExponent()};
  437. }
  438. } // namespace Carbon::Lex