numeric_literal.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. // Part of the Carbon Language project, under the Apache License v2.0 with LLVM
  2. // Exceptions. See /LICENSE for license information.
  3. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  4. #include "toolchain/lex/numeric_literal.h"
  5. #include <algorithm>
  6. #include <bitset>
  7. #include <iterator>
  8. #include <optional>
  9. #include "common/check.h"
  10. #include "llvm/ADT/StringExtras.h"
  11. #include "llvm/Support/FormatVariadicDetails.h"
  12. #include "toolchain/diagnostics/format_providers.h"
  13. #include "toolchain/lex/character_set.h"
  14. #include "toolchain/lex/helpers.h"
  15. namespace Carbon::Lex {
  16. auto NumericLiteral::Lex(llvm::StringRef source_text,
  17. bool can_form_real_literal)
  18. -> std::optional<NumericLiteral> {
  19. NumericLiteral result;
  20. if (source_text.empty() || !IsDecimalDigit(source_text.front())) {
  21. return std::nullopt;
  22. }
  23. bool seen_plus_minus = false;
  24. bool seen_radix_point = false;
  25. bool seen_potential_exponent = false;
  26. // Greedily consume all following characters that might be part of a numeric
  27. // literal. This allows us to produce better diagnostics on invalid literals.
  28. //
  29. // TODO(zygoloid): Update lexical rules to specify that a numeric literal
  30. // cannot be immediately followed by an alphanumeric character.
  31. int i = 1;
  32. int n = source_text.size();
  33. for (; i != n; ++i) {
  34. char c = source_text[i];
  35. if (IsAlnum(c) || c == '_') {
  36. if (IsLower(c) && seen_radix_point && !seen_plus_minus) {
  37. result.exponent_ = i;
  38. seen_potential_exponent = true;
  39. }
  40. continue;
  41. }
  42. // Exactly one `.` can be part of the literal, but only if it's followed by
  43. // an alphanumeric character.
  44. if (c == '.' && can_form_real_literal && i + 1 != n &&
  45. IsAlnum(source_text[i + 1]) && !seen_radix_point) {
  46. result.radix_point_ = i;
  47. seen_radix_point = true;
  48. continue;
  49. }
  50. // A `+` or `-` continues the literal only if it's preceded by a lowercase
  51. // letter (which will be 'e' or 'p' or part of an invalid literal) and
  52. // followed by an alphanumeric character. This '+' or '-' cannot be an
  53. // operator because a literal cannot end in a lowercase letter.
  54. if ((c == '+' || c == '-') && seen_potential_exponent &&
  55. result.exponent_ == i - 1 && i + 1 != n &&
  56. IsAlnum(source_text[i + 1])) {
  57. // This is not possible because we don't update result.exponent after we
  58. // see a '+' or '-'.
  59. CARBON_CHECK(!seen_plus_minus, "should only consume one + or -");
  60. seen_plus_minus = true;
  61. continue;
  62. }
  63. break;
  64. }
  65. result.text_ = source_text.substr(0, i);
  66. if (!seen_radix_point) {
  67. result.radix_point_ = i;
  68. }
  69. if (!seen_potential_exponent) {
  70. result.exponent_ = i;
  71. }
  72. return result;
  73. }
  74. // Parser for numeric literal tokens.
  75. //
  76. // Responsible for checking that a numeric literal is valid and meaningful and
  77. // either diagnosing or extracting its meaning.
  78. class NumericLiteral::Parser {
  79. public:
  80. Parser(Diagnostics::Emitter<const char*>& emitter, NumericLiteral literal);
  81. auto IsInt() -> bool {
  82. return literal_.radix_point_ == static_cast<int>(literal_.text_.size());
  83. }
  84. // Check that the numeric literal token is syntactically valid and
  85. // meaningful, and diagnose if not. Returns `true` if the token was
  86. // sufficiently valid that we could determine its meaning. If `false` is
  87. // returned, a diagnostic has already been issued.
  88. auto Check() -> bool;
  89. // Get the radix of this token. One of 2, 10, or 16.
  90. auto GetRadix() -> Radix { return radix_; }
  91. // Get the mantissa of this token's value.
  92. auto GetMantissa() -> llvm::APInt;
  93. // Get the exponent of this token's value. This is always zero for an integer
  94. // literal.
  95. auto GetExponent() -> llvm::APInt;
  96. private:
  97. struct CheckDigitSequenceResult {
  98. bool ok;
  99. bool has_digit_separators = false;
  100. };
  101. auto CheckDigitSequence(llvm::StringRef text, Radix radix,
  102. bool allow_digit_separators = true)
  103. -> CheckDigitSequenceResult;
  104. auto CheckLeadingZero() -> bool;
  105. auto CheckIntPart() -> bool;
  106. auto CheckFractionalPart() -> bool;
  107. auto CheckExponentPart() -> bool;
  108. Diagnostics::Emitter<const char*>& emitter_;
  109. NumericLiteral literal_;
  110. // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
  111. // or '0x', respectively.
  112. Radix radix_ = Radix::Decimal;
  113. // The various components of a numeric literal:
  114. //
  115. // [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
  116. llvm::StringRef int_part_;
  117. llvm::StringRef fract_part_;
  118. llvm::StringRef exponent_part_;
  119. // Do we need to remove any special characters (digit separator or radix
  120. // point) before interpreting the mantissa or exponent as an integer?
  121. bool mantissa_needs_cleaning_ = false;
  122. bool exponent_needs_cleaning_ = false;
  123. // True if we found a `-` before `exponent_part`.
  124. bool exponent_is_negative_ = false;
  125. };
  126. NumericLiteral::Parser::Parser(Diagnostics::Emitter<const char*>& emitter,
  127. NumericLiteral literal)
  128. : emitter_(emitter), literal_(literal) {
  129. int_part_ = literal.text_.substr(0, literal.radix_point_);
  130. if (int_part_.consume_front("0x")) {
  131. radix_ = Radix::Hexadecimal;
  132. } else if (int_part_.consume_front("0b")) {
  133. radix_ = Radix::Binary;
  134. }
  135. fract_part_ = literal.text_.substr(
  136. literal.radix_point_ + 1, literal.exponent_ - literal.radix_point_ - 1);
  137. exponent_part_ = literal.text_.substr(literal.exponent_ + 1);
  138. if (!exponent_part_.consume_front("+")) {
  139. exponent_is_negative_ = exponent_part_.consume_front("-");
  140. }
  141. }
  142. // Check that the numeric literal token is syntactically valid and meaningful,
  143. // and diagnose if not.
  144. auto NumericLiteral::Parser::Check() -> bool {
  145. return CheckLeadingZero() && CheckIntPart() && CheckFractionalPart() &&
  146. CheckExponentPart();
  147. }
  148. // Parse a string that is known to be a valid base-radix integer into an
  149. // APInt. If needs_cleaning is true, the string may additionally contain '_'
  150. // and '.' characters that should be ignored.
  151. //
  152. // Ignoring '.' is used when parsing a real literal. For example, when
  153. // parsing 123.456e7, we want to decompose it into an integer mantissa
  154. // (123456) and an exponent (7 - 3 = 4), and this routine is given the
  155. // "123.456" to parse as the mantissa.
  156. static auto ParseInt(llvm::StringRef digits, NumericLiteral::Radix radix,
  157. bool needs_cleaning) -> llvm::APInt {
  158. llvm::SmallString<32> cleaned;
  159. if (needs_cleaning) {
  160. cleaned.reserve(digits.size());
  161. llvm::copy_if(digits, std::back_inserter(cleaned),
  162. [](char c) { return c != '_' && c != '.'; });
  163. digits = cleaned;
  164. }
  165. llvm::APInt value;
  166. if (digits.getAsInteger(static_cast<int>(radix), value)) {
  167. llvm_unreachable("should never fail");
  168. }
  169. return value;
  170. }
  171. auto NumericLiteral::Parser::GetMantissa() -> llvm::APInt {
  172. const char* end = IsInt() ? int_part_.end() : fract_part_.end();
  173. llvm::StringRef digits(int_part_.begin(), end - int_part_.begin());
  174. return ParseInt(digits, radix_, mantissa_needs_cleaning_);
  175. }
  176. auto NumericLiteral::Parser::GetExponent() -> llvm::APInt {
  177. // Compute the effective exponent from the specified exponent, if any,
  178. // and the position of the radix point.
  179. llvm::APInt exponent(64, 0);
  180. if (!exponent_part_.empty()) {
  181. exponent =
  182. ParseInt(exponent_part_, Radix::Decimal, exponent_needs_cleaning_);
  183. // The exponent is a signed integer, and the number we just parsed is
  184. // non-negative, so ensure we have a wide enough representation to
  185. // include a sign bit. Also make sure the exponent isn't too narrow so
  186. // the calculation below can't lose information through overflow.
  187. if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
  188. exponent = exponent.zext(std::max(64U, exponent.getBitWidth() + 1));
  189. }
  190. if (exponent_is_negative_) {
  191. exponent.negate();
  192. }
  193. }
  194. // Each character after the decimal point reduces the effective exponent.
  195. int excess_exponent = fract_part_.size();
  196. if (radix_ == Radix::Hexadecimal) {
  197. excess_exponent *= 4;
  198. }
  199. exponent -= excess_exponent;
  200. if (exponent_is_negative_ && !exponent.isNegative()) {
  201. // We overflowed. Note that we can only overflow by a little, and only
  202. // from negative to positive, because exponent is at least 64 bits wide
  203. // and excess_exponent is bounded above by four times the size of the
  204. // input buffer, which we assume fits into 32 bits.
  205. exponent = exponent.zext(exponent.getBitWidth() + 1);
  206. exponent.setSignBit();
  207. }
  208. return exponent;
  209. }
  210. // Check that a digit sequence is valid: that it contains one or more digits,
  211. // contains only digits in the specified base, and that any digit separators
  212. // are present and correctly positioned.
  213. auto NumericLiteral::Parser::CheckDigitSequence(llvm::StringRef text,
  214. Radix radix,
  215. bool allow_digit_separators)
  216. -> CheckDigitSequenceResult {
  217. std::bitset<256> valid_digits;
  218. switch (radix) {
  219. case Radix::Binary:
  220. for (char c : "01") {
  221. valid_digits[static_cast<unsigned char>(c)] = true;
  222. }
  223. break;
  224. case Radix::Decimal:
  225. for (char c : "0123456789") {
  226. valid_digits[static_cast<unsigned char>(c)] = true;
  227. }
  228. break;
  229. case Radix::Hexadecimal:
  230. for (char c : "0123456789ABCDEF") {
  231. valid_digits[static_cast<unsigned char>(c)] = true;
  232. }
  233. break;
  234. }
  235. int num_digit_separators = 0;
  236. for (int i = 0, n = text.size(); i != n; ++i) {
  237. char c = text[i];
  238. if (valid_digits[static_cast<unsigned char>(c)]) {
  239. continue;
  240. }
  241. if (c == '_') {
  242. // A digit separator cannot appear at the start of a digit sequence,
  243. // next to another digit separator, or at the end.
  244. if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
  245. i + 1 == n) {
  246. CARBON_DIAGNOSTIC(InvalidDigitSeparator, Error,
  247. "misplaced digit separator in numeric literal");
  248. emitter_.Emit(text.begin() + 1, InvalidDigitSeparator);
  249. }
  250. ++num_digit_separators;
  251. continue;
  252. }
  253. CARBON_DIAGNOSTIC(
  254. InvalidDigit, Error,
  255. "invalid digit '{0}' in {1:=2:binary|=10:decimal|=16:hexadecimal} "
  256. "numeric literal",
  257. char, Diagnostics::IntAsSelect);
  258. emitter_.Emit(text.begin() + i, InvalidDigit, c, static_cast<int>(radix));
  259. return {.ok = false};
  260. }
  261. if (num_digit_separators == static_cast<int>(text.size())) {
  262. CARBON_DIAGNOSTIC(EmptyDigitSequence, Error,
  263. "empty digit sequence in numeric literal");
  264. emitter_.Emit(text.begin(), EmptyDigitSequence);
  265. return {.ok = false};
  266. }
  267. if (!CanLexInt(emitter_, text)) {
  268. return {.ok = false};
  269. }
  270. return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
  271. }
  272. // Check that we don't have a '0' prefix on a non-zero decimal integer.
  273. auto NumericLiteral::Parser::CheckLeadingZero() -> bool {
  274. if (radix_ == Radix::Decimal && int_part_.starts_with("0") &&
  275. int_part_ != "0") {
  276. CARBON_DIAGNOSTIC(UnknownBaseSpecifier, Error,
  277. "unknown base specifier in numeric literal");
  278. emitter_.Emit(int_part_.begin(), UnknownBaseSpecifier);
  279. return false;
  280. }
  281. return true;
  282. }
  283. // Check the integer part (before the '.', if any) is valid.
  284. auto NumericLiteral::Parser::CheckIntPart() -> bool {
  285. auto int_result = CheckDigitSequence(int_part_, radix_);
  286. mantissa_needs_cleaning_ |= int_result.has_digit_separators;
  287. return int_result.ok;
  288. }
  289. // Check the fractional part (after the '.' and before the exponent, if any)
  290. // is valid.
  291. auto NumericLiteral::Parser::CheckFractionalPart() -> bool {
  292. if (IsInt()) {
  293. return true;
  294. }
  295. if (radix_ == Radix::Binary) {
  296. CARBON_DIAGNOSTIC(BinaryRealLiteral, Error,
  297. "binary real number literals are not supported");
  298. emitter_.Emit(literal_.text_.begin() + literal_.radix_point_,
  299. BinaryRealLiteral);
  300. // Carry on and parse the binary real literal anyway.
  301. }
  302. // We need to remove a '.' from the mantissa.
  303. mantissa_needs_cleaning_ = true;
  304. return CheckDigitSequence(fract_part_, radix_,
  305. /*allow_digit_separators=*/false)
  306. .ok;
  307. }
  308. // Check the exponent part (if any) is valid.
  309. auto NumericLiteral::Parser::CheckExponentPart() -> bool {
  310. if (literal_.exponent_ == static_cast<int>(literal_.text_.size())) {
  311. return true;
  312. }
  313. char expected_exponent_kind = (radix_ == Radix::Decimal ? 'e' : 'p');
  314. if (literal_.text_[literal_.exponent_] != expected_exponent_kind) {
  315. CARBON_DIAGNOSTIC(WrongRealLiteralExponent, Error,
  316. "expected '{0}' to introduce exponent", char);
  317. emitter_.Emit(literal_.text_.begin() + literal_.exponent_,
  318. WrongRealLiteralExponent, expected_exponent_kind);
  319. return false;
  320. }
  321. auto exponent_result = CheckDigitSequence(exponent_part_, Radix::Decimal);
  322. exponent_needs_cleaning_ = exponent_result.has_digit_separators;
  323. return exponent_result.ok;
  324. }
  325. // Parse the token and compute its value.
  326. auto NumericLiteral::ComputeValue(
  327. Diagnostics::Emitter<const char*>& emitter) const -> Value {
  328. Parser parser(emitter, *this);
  329. if (!parser.Check()) {
  330. return UnrecoverableError();
  331. }
  332. if (parser.IsInt()) {
  333. return IntValue{.value = parser.GetMantissa()};
  334. }
  335. return RealValue{
  336. .radix = (parser.GetRadix() == Radix::Decimal ? Radix::Decimal
  337. : Radix::Binary),
  338. .mantissa = parser.GetMantissa(),
  339. .exponent = parser.GetExponent()};
  340. }
  341. } // namespace Carbon::Lex