// numeric_literal.cpp
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/numeric_literal.h"
  5. #include <bitset>
  6. #include "common/check.h"
  7. #include "llvm/ADT/StringExtras.h"
  8. #include "toolchain/lex/character_set.h"
  9. #include "toolchain/lex/helpers.h"
  10. namespace Carbon::Lex {
  11. // Adapts Radix for use with formatv.
  12. // NOTE: clangd may see this as unused, but it will be invoked by diagnostics.
  13. // We don't do anything to disable the warning because clang compile invocations
  14. // should warn if it's actually unused.
  15. static auto operator<<(llvm::raw_ostream& out, NumericLiteral::Radix radix)
  16. -> llvm::raw_ostream& {
  17. switch (radix) {
  18. case NumericLiteral::Radix::Binary:
  19. out << "binary";
  20. break;
  21. case NumericLiteral::Radix::Decimal:
  22. out << "decimal";
  23. break;
  24. case NumericLiteral::Radix::Hexadecimal:
  25. out << "hexadecimal";
  26. break;
  27. }
  28. return out;
  29. }
  30. auto NumericLiteral::Lex(llvm::StringRef source_text)
  31. -> std::optional<NumericLiteral> {
  32. NumericLiteral result;
  33. if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
  34. return std::nullopt;
  35. }
  36. bool seen_plus_minus = false;
  37. bool seen_radix_point = false;
  38. bool seen_potential_exponent = false;
  39. // Greedily consume all following characters that might be part of a numeric
  40. // literal. This allows us to produce better diagnostics on invalid literals.
  41. //
  42. // TODO(zygoloid): Update lexical rules to specify that a numeric literal
  43. // cannot be immediately followed by an alphanumeric character.
  44. int i = 1;
  45. int n = source_text.size();
  46. for (; i != n; ++i) {
  47. char c = source_text[i];
  48. if (IsAlnum(c) || c == '_') {
  49. if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
  50. result.exponent_ = i;
  51. seen_potential_exponent = true;
  52. }
  53. continue;
  54. }
  55. // Exactly one `.` can be part of the literal, but only if it's followed by
  56. // an alphanumeric character.
  57. if (c == '.' && i + 1 != n && IsAlnum(source_text[i + 1]) &&
  58. !seen_radix_point) {
  59. result.radix_point_ = i;
  60. seen_radix_point = true;
  61. continue;
  62. }
  63. // A `+` or `-` continues the literal only if it's preceded by a lowercase
  64. // letter (which will be 'e' or 'p' or part of an invalid literal) and
  65. // followed by an alphanumeric character. This '+' or '-' cannot be an
  66. // operator because a literal cannot end in a lowercase letter.
  67. if ((c == '+' || c == '-') && seen_potential_exponent &&
  68. result.exponent_ == i - 1 && i + 1 != n &&
  69. IsAlnum(source_text[i + 1])) {
  70. // This is not possible because we don't update result.exponent after we
  71. // see a '+' or '-'.
  72. CARBON_CHECK(!seen_plus_minus) << "should only consume one + or -";
  73. seen_plus_minus = true;
  74. continue;
  75. }
  76. break;
  77. }
  78. result.text_ = source_text.substr(0, i);
  79. if (!seen_radix_point) {
  80. result.radix_point_ = i;
  81. }
  82. if (!seen_potential_exponent) {
  83. result.exponent_ = i;
  84. }
  85. return result;
  86. }
  87. // Parser for numeric literal tokens.
  88. //
  89. // Responsible for checking that a numeric literal is valid and meaningful and
  90. // either diagnosing or extracting its meaning.
  91. class NumericLiteral::Parser {
  92. public:
  93. Parser(DiagnosticEmitter<const char*>& emitter, NumericLiteral literal);
  94. auto IsInteger() -> bool {
  95. return literal_.radix_point_ == static_cast<int>(literal_.text_.size());
  96. }
  97. // Check that the numeric literal token is syntactically valid and
  98. // meaningful, and diagnose if not. Returns `true` if the token was
  99. // sufficiently valid that we could determine its meaning. If `false` is
  100. // returned, a diagnostic has already been issued.
  101. auto Check() -> bool;
  102. // Get the radix of this token. One of 2, 10, or 16.
  103. auto GetRadix() -> Radix { return radix_; }
  104. // Get the mantissa of this token's value.
  105. auto GetMantissa() -> llvm::APInt;
  106. // Get the exponent of this token's value. This is always zero for an integer
  107. // literal.
  108. auto GetExponent() -> llvm::APInt;
  109. private:
  110. struct CheckDigitSequenceResult {
  111. bool ok;
  112. bool has_digit_separators = false;
  113. };
  114. auto CheckDigitSequence(llvm::StringRef text, Radix radix,
  115. bool allow_digit_separators = true)
  116. -> CheckDigitSequenceResult;
  117. auto CheckDigitSeparatorPlacement(llvm::StringRef text, Radix radix,
  118. int num_digit_separators) -> void;
  119. auto CheckLeadingZero() -> bool;
  120. auto CheckIntPart() -> bool;
  121. auto CheckFractionalPart() -> bool;
  122. auto CheckExponentPart() -> bool;
  123. DiagnosticEmitter<const char*>& emitter_;
  124. NumericLiteral literal_;
  125. // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
  126. // or '0x', respectively.
  127. Radix radix_ = Radix::Decimal;
  128. // The various components of a numeric literal:
  129. //
  130. // [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
  131. llvm::StringRef int_part_;
  132. llvm::StringRef fract_part_;
  133. llvm::StringRef exponent_part_;
  134. // Do we need to remove any special characters (digit separator or radix
  135. // point) before interpreting the mantissa or exponent as an integer?
  136. bool mantissa_needs_cleaning_ = false;
  137. bool exponent_needs_cleaning_ = false;
  138. // True if we found a `-` before `exponent_part`.
  139. bool exponent_is_negative_ = false;
  140. };
  141. NumericLiteral::Parser::Parser(DiagnosticEmitter<const char*>& emitter,
  142. NumericLiteral literal)
  143. : emitter_(emitter), literal_(literal) {
  144. int_part_ = literal.text_.substr(0, literal.radix_point_);
  145. if (int_part_.consume_front("0x")) {
  146. radix_ = Radix::Hexadecimal;
  147. } else if (int_part_.consume_front("0b")) {
  148. radix_ = Radix::Binary;
  149. }
  150. fract_part_ = literal.text_.substr(
  151. literal.radix_point_ + 1, literal.exponent_ - literal.radix_point_ - 1);
  152. exponent_part_ = literal.text_.substr(literal.exponent_ + 1);
  153. if (!exponent_part_.consume_front("+")) {
  154. exponent_is_negative_ = exponent_part_.consume_front("-");
  155. }
  156. }
  157. // Check that the numeric literal token is syntactically valid and meaningful,
  158. // and diagnose if not.
  159. auto NumericLiteral::Parser::Check() -> bool {
  160. return CheckLeadingZero() && CheckIntPart() && CheckFractionalPart() &&
  161. CheckExponentPart();
  162. }
  163. // Parse a string that is known to be a valid base-radix integer into an
  164. // APInt. If needs_cleaning is true, the string may additionally contain '_'
  165. // and '.' characters that should be ignored.
  166. //
  167. // Ignoring '.' is used when parsing a real literal. For example, when
  168. // parsing 123.456e7, we want to decompose it into an integer mantissa
  169. // (123456) and an exponent (7 - 3 = 4), and this routine is given the
  170. // "123.456" to parse as the mantissa.
  171. static auto ParseInteger(llvm::StringRef digits, NumericLiteral::Radix radix,
  172. bool needs_cleaning) -> llvm::APInt {
  173. llvm::SmallString<32> cleaned;
  174. if (needs_cleaning) {
  175. cleaned.reserve(digits.size());
  176. std::remove_copy_if(digits.begin(), digits.end(),
  177. std::back_inserter(cleaned),
  178. [](char c) { return c == '_' || c == '.'; });
  179. digits = cleaned;
  180. }
  181. llvm::APInt value;
  182. if (digits.getAsInteger(static_cast<int>(radix), value)) {
  183. llvm_unreachable("should never fail");
  184. }
  185. return value;
  186. }
  187. auto NumericLiteral::Parser::GetMantissa() -> llvm::APInt {
  188. const char* end = IsInteger() ? int_part_.end() : fract_part_.end();
  189. llvm::StringRef digits(int_part_.begin(), end - int_part_.begin());
  190. return ParseInteger(digits, radix_, mantissa_needs_cleaning_);
  191. }
  192. auto NumericLiteral::Parser::GetExponent() -> llvm::APInt {
  193. // Compute the effective exponent from the specified exponent, if any,
  194. // and the position of the radix point.
  195. llvm::APInt exponent(64, 0);
  196. if (!exponent_part_.empty()) {
  197. exponent =
  198. ParseInteger(exponent_part_, Radix::Decimal, exponent_needs_cleaning_);
  199. // The exponent is a signed integer, and the number we just parsed is
  200. // non-negative, so ensure we have a wide enough representation to
  201. // include a sign bit. Also make sure the exponent isn't too narrow so
  202. // the calculation below can't lose information through overflow.
  203. if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
  204. exponent = exponent.zext(std::max(64U, exponent.getBitWidth() + 1));
  205. }
  206. if (exponent_is_negative_) {
  207. exponent.negate();
  208. }
  209. }
  210. // Each character after the decimal point reduces the effective exponent.
  211. int excess_exponent = fract_part_.size();
  212. if (radix_ == Radix::Hexadecimal) {
  213. excess_exponent *= 4;
  214. }
  215. exponent -= excess_exponent;
  216. if (exponent_is_negative_ && !exponent.isNegative()) {
  217. // We overflowed. Note that we can only overflow by a little, and only
  218. // from negative to positive, because exponent is at least 64 bits wide
  219. // and excess_exponent is bounded above by four times the size of the
  220. // input buffer, which we assume fits into 32 bits.
  221. exponent = exponent.zext(exponent.getBitWidth() + 1);
  222. exponent.setSignBit();
  223. }
  224. return exponent;
  225. }
  226. // Check that a digit sequence is valid: that it contains one or more digits,
  227. // contains only digits in the specified base, and that any digit separators
  228. // are present and correctly positioned.
  229. auto NumericLiteral::Parser::CheckDigitSequence(llvm::StringRef text,
  230. Radix radix,
  231. bool allow_digit_separators)
  232. -> CheckDigitSequenceResult {
  233. std::bitset<256> valid_digits;
  234. switch (radix) {
  235. case Radix::Binary:
  236. for (char c : "01") {
  237. valid_digits[static_cast<unsigned char>(c)] = true;
  238. }
  239. break;
  240. case Radix::Decimal:
  241. for (char c : "0123456789") {
  242. valid_digits[static_cast<unsigned char>(c)] = true;
  243. }
  244. break;
  245. case Radix::Hexadecimal:
  246. for (char c : "0123456789ABCDEF") {
  247. valid_digits[static_cast<unsigned char>(c)] = true;
  248. }
  249. break;
  250. }
  251. int num_digit_separators = 0;
  252. for (int i = 0, n = text.size(); i != n; ++i) {
  253. char c = text[i];
  254. if (valid_digits[static_cast<unsigned char>(c)]) {
  255. continue;
  256. }
  257. if (c == '_') {
  258. // A digit separator cannot appear at the start of a digit sequence,
  259. // next to another digit separator, or at the end.
  260. if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
  261. i + 1 == n) {
  262. CARBON_DIAGNOSTIC(InvalidDigitSeparator, Error,
  263. "Misplaced digit separator in numeric literal.");
  264. emitter_.Emit(text.begin() + 1, InvalidDigitSeparator);
  265. }
  266. ++num_digit_separators;
  267. continue;
  268. }
  269. CARBON_DIAGNOSTIC(InvalidDigit, Error,
  270. "Invalid digit '{0}' in {1} numeric literal.", char,
  271. NumericLiteral::Radix);
  272. emitter_.Emit(text.begin() + i, InvalidDigit, c, radix);
  273. return {.ok = false};
  274. }
  275. if (num_digit_separators == static_cast<int>(text.size())) {
  276. CARBON_DIAGNOSTIC(EmptyDigitSequence, Error,
  277. "Empty digit sequence in numeric literal.");
  278. emitter_.Emit(text.begin(), EmptyDigitSequence);
  279. return {.ok = false};
  280. }
  281. // Check that digit separators occur in exactly the expected positions.
  282. if (num_digit_separators) {
  283. CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
  284. }
  285. if (!CanLexInteger(emitter_, text)) {
  286. return {.ok = false};
  287. }
  288. return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
  289. }
  290. // Given a number with digit separators, check that the digit separators are
  291. // correctly positioned.
  292. auto NumericLiteral::Parser::CheckDigitSeparatorPlacement(
  293. llvm::StringRef text, Radix radix, int num_digit_separators) -> void {
  294. CARBON_DCHECK(std::count(text.begin(), text.end(), '_') ==
  295. num_digit_separators)
  296. << "given wrong number of digit separators: " << num_digit_separators;
  297. if (radix == Radix::Binary) {
  298. // There are no restrictions on digit separator placement for binary
  299. // literals.
  300. return;
  301. }
  302. auto diagnose_irregular_digit_separators = [&]() {
  303. CARBON_DIAGNOSTIC(
  304. IrregularDigitSeparators, Error,
  305. "Digit separators in {0} number should appear every {1} characters "
  306. "from the right.",
  307. NumericLiteral::Radix, int);
  308. emitter_.Emit(text.begin(), IrregularDigitSeparators, radix,
  309. radix == Radix::Decimal ? 3 : 4);
  310. };
  311. // For decimal and hexadecimal digit sequences, digit separators must form
  312. // groups of 3 or 4 digits (4 or 5 characters), respectively.
  313. int stride = (radix == Radix::Decimal ? 4 : 5);
  314. int remaining_digit_separators = num_digit_separators;
  315. const auto* pos = text.end();
  316. while (pos - text.begin() >= stride) {
  317. pos -= stride;
  318. if (*pos != '_') {
  319. diagnose_irregular_digit_separators();
  320. return;
  321. }
  322. --remaining_digit_separators;
  323. }
  324. // Check there weren't any other digit separators.
  325. if (remaining_digit_separators) {
  326. diagnose_irregular_digit_separators();
  327. }
  328. };
  329. // Check that we don't have a '0' prefix on a non-zero decimal integer.
  330. auto NumericLiteral::Parser::CheckLeadingZero() -> bool {
  331. if (radix_ == Radix::Decimal && int_part_.startswith("0") &&
  332. int_part_ != "0") {
  333. CARBON_DIAGNOSTIC(UnknownBaseSpecifier, Error,
  334. "Unknown base specifier in numeric literal.");
  335. emitter_.Emit(int_part_.begin(), UnknownBaseSpecifier);
  336. return false;
  337. }
  338. return true;
  339. }
  340. // Check the integer part (before the '.', if any) is valid.
  341. auto NumericLiteral::Parser::CheckIntPart() -> bool {
  342. auto int_result = CheckDigitSequence(int_part_, radix_);
  343. mantissa_needs_cleaning_ |= int_result.has_digit_separators;
  344. return int_result.ok;
  345. }
  346. // Check the fractional part (after the '.' and before the exponent, if any)
  347. // is valid.
  348. auto NumericLiteral::Parser::CheckFractionalPart() -> bool {
  349. if (IsInteger()) {
  350. return true;
  351. }
  352. if (radix_ == Radix::Binary) {
  353. CARBON_DIAGNOSTIC(BinaryRealLiteral, Error,
  354. "Binary real number literals are not supported.");
  355. emitter_.Emit(literal_.text_.begin() + literal_.radix_point_,
  356. BinaryRealLiteral);
  357. // Carry on and parse the binary real literal anyway.
  358. }
  359. // We need to remove a '.' from the mantissa.
  360. mantissa_needs_cleaning_ = true;
  361. return CheckDigitSequence(fract_part_, radix_,
  362. /*allow_digit_separators=*/false)
  363. .ok;
  364. }
  365. // Check the exponent part (if any) is valid.
  366. auto NumericLiteral::Parser::CheckExponentPart() -> bool {
  367. if (literal_.exponent_ == static_cast<int>(literal_.text_.size())) {
  368. return true;
  369. }
  370. char expected_exponent_kind = (radix_ == Radix::Decimal ? 'e' : 'p');
  371. if (literal_.text_[literal_.exponent_] != expected_exponent_kind) {
  372. CARBON_DIAGNOSTIC(WrongRealLiteralExponent, Error,
  373. "Expected '{0}' to introduce exponent.", char);
  374. emitter_.Emit(literal_.text_.begin() + literal_.exponent_,
  375. WrongRealLiteralExponent, expected_exponent_kind);
  376. return false;
  377. }
  378. auto exponent_result = CheckDigitSequence(exponent_part_, Radix::Decimal);
  379. exponent_needs_cleaning_ = exponent_result.has_digit_separators;
  380. return exponent_result.ok;
  381. }
  382. // Parse the token and compute its value.
  383. auto NumericLiteral::ComputeValue(DiagnosticEmitter<const char*>& emitter) const
  384. -> Value {
  385. Parser parser(emitter, *this);
  386. if (!parser.Check()) {
  387. return UnrecoverableError();
  388. }
  389. if (parser.IsInteger()) {
  390. return IntegerValue{.value = parser.GetMantissa()};
  391. }
  392. return RealValue{
  393. .radix = (parser.GetRadix() == Radix::Decimal ? Radix::Decimal
  394. : Radix::Binary),
  395. .mantissa = parser.GetMantissa(),
  396. .exponent = parser.GetExponent()};
  397. }
  398. } // namespace Carbon::Lex