před 5 roky · 9a873b46de
--- a/lexer/BUILD
+++ b/lexer/BUILD
@@ -26,12 +26,36 @@ cc_test(
 
				     ],
			
 
				 )
			
 
				 
			
 
				+cc_library(
			
 
				+    name = "numeric_literal",
			
 
				+    srcs = ["numeric_literal.cpp"],
			
 
				+    hdrs = ["numeric_literal.h"],
			
 
				+    deps = [
			
 
				+        "//diagnostics:diagnostic_emitter",
			
 
				+        "@llvm-project//llvm:Support",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				+cc_test(
			
 
				+    name = "numeric_literal_test",
			
 
				+    srcs = ["numeric_literal_test.cpp"],
			
 
				+    deps = [
			
 
				+        ":numeric_literal",
			
 
				+        "//diagnostics:diagnostic_emitter",
			
 
				+        "@llvm-project//llvm:Support",
			
 
				+        "@llvm-project//llvm:gmock",
			
 
				+        "@llvm-project//llvm:gtest",
			
 
				+        "@llvm-project//llvm:gtest_main",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				 cc_library(
			
 
				     name = "tokenized_buffer",
			
 
				     srcs = ["tokenized_buffer.cpp"],
			
 
				     hdrs = ["tokenized_buffer.h"],
			
 
				     deps = [
			
 
				         ":token_kind",
			
 
				+        ":numeric_literal",
			
 
				         "//diagnostics:diagnostic_emitter",
			
 
				         "//source:source_buffer",
			
 
				         "@llvm-project//llvm:Support",
			
--- a/lexer/numeric_literal.cpp
+++ b/lexer/numeric_literal.cpp
@@ -0,0 +1,390 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "lexer/numeric_literal.h"
			
 
				+
			
 
				+#include <bitset>
			
 
				+
			
 
				+#include "llvm/ADT/StringExtras.h"
			
 
				+#include "llvm/Support/FormatVariadic.h"
			
 
				+
			
 
				+namespace Carbon {
			
 
				+
			
 
				+namespace {
			
 
				+struct EmptyDigitSequence : SimpleDiagnostic<EmptyDigitSequence> {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				+  static constexpr llvm::StringLiteral Message =
			
 
				+      "Empty digit sequence in numeric literal.";
			
 
				+};
			
 
				+
			
 
				+struct InvalidDigit {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				+
			
 
				+  struct Substitutions {
			
 
				+    char digit;
			
 
				+    int radix;
			
 
				+  };
			
 
				+  static auto Format(const Substitutions& subst) -> std::string {
			
 
				+    return llvm::formatv("Invalid digit '{0}' in {1} numeric literal.",
			
 
				+                         subst.digit,
			
 
				+                         (subst.radix == 2    ? "binary"
			
 
				+                          : subst.radix == 16 ? "hexadecimal"
			
 
				+                                              : "decimal"))
			
 
				+        .str();
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+struct InvalidDigitSeparator : SimpleDiagnostic<InvalidDigitSeparator> {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				+  static constexpr llvm::StringLiteral Message =
			
 
				+      "Misplaced digit separator in numeric literal.";
			
 
				+};
			
 
				+
			
 
				+struct IrregularDigitSeparators {
			
 
				+  static constexpr llvm::StringLiteral ShortName =
			
 
				+      "syntax-irregular-digit-separators";
			
 
				+
			
 
				+  struct Substitutions {
			
 
				+    int radix;
			
 
				+  };
			
 
				+  static auto Format(const Substitutions& subst) -> std::string {
			
 
				+    assert((subst.radix == 10 || subst.radix == 16) && "unexpected radix");
			
 
				+    return llvm::formatv(
			
 
				+               "Digit separators in {0} number should appear every {1} "
			
 
				+               "characters from the right.",
			
 
				+               (subst.radix == 10 ? "decimal" : "hexadecimal"),
			
 
				+               (subst.radix == 10 ? "3" : "4"))
			
 
				+        .str();
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+struct UnknownBaseSpecifier : SimpleDiagnostic<UnknownBaseSpecifier> {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				+  static constexpr llvm::StringLiteral Message =
			
 
				+      "Unknown base specifier in numeric literal.";
			
 
				+};
			
 
				+
			
 
				+struct BinaryRealLiteral : SimpleDiagnostic<BinaryRealLiteral> {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				+  static constexpr llvm::StringLiteral Message =
			
 
				+      "Binary real number literals are not supported.";
			
 
				+};
			
 
				+
			
 
				+struct WrongRealLiteralExponent {
			
 
				+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				+
			
 
				+  struct Substitutions {
			
 
				+    char expected;
			
 
				+  };
			
 
				+  static auto Format(const Substitutions& subst) -> std::string {
			
 
				+    return llvm::formatv("Expected '{0}' to introduce exponent.",
			
 
				+                         subst.expected)
			
 
				+        .str();
			
 
				+  }
			
 
				+};
			
 
				+}  // namespace
			
 
				+
			
 
				// Returns true if `c` is an ASCII lowercase letter ('a' through 'z').
				static bool isLower(char c) { return c >= 'a' && c <= 'z'; }
			
 
				+
			
 
				+auto NumericLiteralToken::Lex(llvm::StringRef source_text)
			
 
				+    -> llvm::Optional<NumericLiteralToken> {
			
 
				+  NumericLiteralToken result;
			
 
				+
			
 
				+  if (source_text.empty() || !llvm::isDigit(source_text.front()))
			
 
				+    return llvm::None;
			
 
				+
			
 
				+  bool seen_plus_minus = false;
			
 
				+  bool seen_radix_point = false;
			
 
				+  bool seen_potential_exponent = false;
			
 
				+
			
 
				+  // Greedily consume all following characters that might be part of a numeric
			
 
				+  // literal. This allows us to produce better diagnostics on invalid literals.
			
 
				+  //
			
 
				+  // TODO(zygoloid): Update lexical rules to specify that a numeric literal
			
 
				+  // cannot be immediately followed by an alphanumeric character.
			
 
				+  int i = 1, n = source_text.size();
			
 
				+  for (; i != n; ++i) {
			
 
				+    char c = source_text[i];
			
 
				+    if (llvm::isAlnum(c) || c == '_') {
			
 
				+      if (isLower(c) && seen_radix_point && !seen_plus_minus) {
			
 
				+        result.exponent = i;
			
 
				+        seen_potential_exponent = true;
			
 
				+      }
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    // Exactly one `.` can be part of the literal, but only if it's followed by
			
 
				+    // an alphanumeric character.
			
 
				+    if (c == '.' && i + 1 != n && llvm::isAlnum(source_text[i + 1]) &&
			
 
				+        !seen_radix_point) {
			
 
				+      result.radix_point = i;
			
 
				+      seen_radix_point = true;
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    // A `+` or `-` continues the literal only if it's preceded by a lowercase
			
 
				+    // letter (which will be 'e' or 'p' or part of an invalid literal) and
			
 
				+    // followed by an alphanumeric character. This '+' or '-' cannot be an
			
 
				+    // operator because a literal cannot end in a lowercase letter.
			
 
				+    if ((c == '+' || c == '-') && seen_potential_exponent &&
			
 
				+        result.exponent == i - 1 && i + 1 != n &&
			
 
				+        llvm::isAlnum(source_text[i + 1])) {
			
 
				+      // This is not possible because we don't update result.exponent after we
			
 
				+      // see a '+' or '-'.
			
 
				+      assert(!seen_plus_minus && "should only consume one + or -");
			
 
				+      seen_plus_minus = true;
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    break;
			
 
				+  }
			
 
				+
			
 
				+  result.text = source_text.substr(0, i);
			
 
				+  if (!seen_radix_point)
			
 
				+    result.radix_point = i;
			
 
				+  if (!seen_potential_exponent)
			
 
				+    result.exponent = i;
			
 
				+
			
 
				+  return result;
			
 
				+}
			
 
				+
			
 
				+NumericLiteralToken::Parser::Parser(DiagnosticEmitter& emitter,
			
 
				+                                    NumericLiteralToken literal)
			
 
				+    : emitter(emitter), literal(literal) {
			
 
				+  int_part = literal.text.substr(0, literal.radix_point);
			
 
				+  if (int_part.consume_front("0x")) {
			
 
				+    radix = 16;
			
 
				+  } else if (int_part.consume_front("0b")) {
			
 
				+    radix = 2;
			
 
				+  }
			
 
				+
			
 
				+  fract_part = literal.text.substr(literal.radix_point + 1,
			
 
				+                                   literal.exponent - literal.radix_point - 1);
			
 
				+
			
 
				+  exponent_part = literal.text.substr(literal.exponent + 1);
			
 
				+  if (!exponent_part.consume_front("+")) {
			
 
				+    exponent_is_negative = exponent_part.consume_front("-");
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Check that the numeric literal token is syntactically valid and meaningful,
			
 
				+// and diagnose if not.
			
 
				+auto NumericLiteralToken::Parser::Check() -> CheckResult {
			
 
				+  if (!CheckLeadingZero() || !CheckIntPart() || !CheckFractionalPart() ||
			
 
				+      !CheckExponentPart())
			
 
				+    return UnrecoverableError;
			
 
				+  return recovered_from_error ? RecoverableError : Valid;
			
 
				+}
			
 
				+
			
 
				+// Parse a string that is known to be a valid base-radix integer into an
			
 
				+// APInt.  If needs_cleaning is true, the string may additionally contain '_'
			
 
				+// and '.' characters that should be ignored.
			
 
				+//
			
 
				+// Ignoring '.' is used when parsing a real literal. For example, when
			
 
				+// parsing 123.456e7, we want to decompose it into an integer mantissa
			
 
				+// (123456) and an exponent (7 - 3 = 2), and this routine is given the
			
 
				+// "123.456" to parse as the mantissa.
			
 
				+static auto ParseInteger(llvm::StringRef digits, int radix, bool needs_cleaning)
			
 
				+    -> llvm::APInt {
			
 
				+  llvm::SmallString<32> cleaned;
			
 
				+  if (needs_cleaning) {
			
 
				+    cleaned.reserve(digits.size());
			
 
				+    std::remove_copy_if(digits.begin(), digits.end(),
			
 
				+                        std::back_inserter(cleaned),
			
 
				+                        [](char c) { return c == '_' || c == '.'; });
			
 
				+    digits = cleaned;
			
 
				+  }
			
 
				+
			
 
				+  llvm::APInt value;
			
 
				+  if (digits.getAsInteger(radix, value)) {
			
 
				+    llvm_unreachable("should never fail");
			
 
				+  }
			
 
				+  return value;
			
 
				+}
			
 
				+
			
 
				+auto NumericLiteralToken::Parser::GetMantissa() -> llvm::APInt {
			
 
				+  const char* end = IsInteger() ? int_part.end() : fract_part.end();
			
 
				+  llvm::StringRef digits(int_part.begin(), end - int_part.begin());
			
 
				+  return ParseInteger(digits, radix, mantissa_needs_cleaning);
			
 
				+}
			
 
				+
			
 
				+auto NumericLiteralToken::Parser::GetExponent() -> llvm::APInt {
			
 
				+  // Compute the effective exponent from the specified exponent, if any,
			
 
				+  // and the position of the radix point.
			
 
				+  llvm::APInt exponent(64, 0);
			
 
				+  if (!exponent_part.empty()) {
			
 
				+    exponent = ParseInteger(exponent_part, 10, exponent_needs_cleaning);
			
 
				+
			
 
				+    // The exponent is a signed integer, and the number we just parsed is
			
 
				+    // non-negative, so ensure we have a wide enough representation to
			
 
				+    // include a sign bit. Also make sure the exponent isn't too narrow so
			
 
				+    // the calculation below can't lose information through overflow.
			
 
				+    if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
			
 
				+      exponent = exponent.zext(std::max(64u, exponent.getBitWidth() + 1));
			
 
				+    }
			
 
				+    if (exponent_is_negative) {
			
 
				+      exponent.negate();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Each character after the decimal point reduces the effective exponent.
			
 
				+  int excess_exponent = fract_part.size();
			
 
				+  if (radix == 16) {
			
 
				+    excess_exponent *= 4;
			
 
				+  }
			
 
				+  exponent -= excess_exponent;
			
 
				+  if (exponent_is_negative && !exponent.isNegative()) {
			
 
				+    // We overflowed. Note that we can only overflow by a little, and only
			
 
				+    // from negative to positive, because exponent is at least 64 bits wide
			
 
				+    // and excess_exponent is bounded above by four times the size of the
			
 
				+    // input buffer, which we assume fits into 32 bits.
			
 
				+    exponent = exponent.zext(exponent.getBitWidth() + 1);
			
 
				+    exponent.setSignBit();
			
 
				+  }
			
 
				+  return exponent;
			
 
				+}
			
 
				+
			
 
				+// Check that a digit sequence is valid: that it contains one or more digits,
			
 
				+// contains only digits in the specified base, and that any digit separators
			
 
				+// are present and correctly positioned.
			
 
				+auto NumericLiteralToken::Parser::CheckDigitSequence(
			
 
				+    llvm::StringRef text, int radix, bool allow_digit_separators)
			
 
				+    -> CheckDigitSequenceResult {
			
 
				+  assert((radix == 2 || radix == 10 || radix == 16) && "unknown radix");
			
 
				+
			
 
				+  std::bitset<256> valid_digits;
			
 
				+  if (radix == 2) {
			
 
				+    for (char c : "01")
			
 
				+      valid_digits[static_cast<unsigned char>(c)] = true;
			
 
				+  } else if (radix == 10) {
			
 
				+    for (char c : "0123456789")
			
 
				+      valid_digits[static_cast<unsigned char>(c)] = true;
			
 
				+  } else {
			
 
				+    for (char c : "0123456789ABCDEF")
			
 
				+      valid_digits[static_cast<unsigned char>(c)] = true;
			
 
				+  }
			
 
				+
			
 
				+  int num_digit_separators = 0;
			
 
				+
			
 
				+  for (int i = 0, n = text.size(); i != n; ++i) {
			
 
				+    char c = text[i];
			
 
				+    if (valid_digits[static_cast<unsigned char>(c)]) {
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    if (c == '_') {
			
 
				+      // A digit separator cannot appear at the start of a digit sequence,
			
 
				+      // next to another digit separator, or at the end.
			
 
				+      if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
			
 
				+          i + 1 == n) {
			
 
				+        emitter.EmitError<InvalidDigitSeparator>();
			
 
				+        recovered_from_error = true;
			
 
				+      }
			
 
				+      ++num_digit_separators;
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    emitter.EmitError<InvalidDigit>({.digit = c, .radix = radix});
			
 
				+    return {.ok = false};
			
 
				+  }
			
 
				+
			
 
				+  if (num_digit_separators == static_cast<int>(text.size())) {
			
 
				+    emitter.EmitError<EmptyDigitSequence>();
			
 
				+    return {.ok = false};
			
 
				+  }
			
 
				+
			
 
				+  // Check that digit separators occur in exactly the expected positions.
			
 
				+  if (num_digit_separators && radix != 2)
			
 
				+    CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
			
 
				+
			
 
				+  return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
			
 
				+}
			
 
				+
			
 
				+// Given a number with digit separators, check that the digit separators are
			
 
				+// correctly positioned.
			
 
				+auto NumericLiteralToken::Parser::CheckDigitSeparatorPlacement(
			
 
				+    llvm::StringRef text, int radix, int num_digit_separators) -> void {
			
 
				+  assert((radix == 10 || radix == 16) &&
			
 
				+         "unexpected radix for digit separator checks");
			
 
				+  assert(std::count(text.begin(), text.end(), '_') == num_digit_separators &&
			
 
				+         "given wrong number of digit separators");
			
 
				+
			
 
				+  auto diagnose_irregular_digit_separators = [&] {
			
 
				+    emitter.EmitError<IrregularDigitSeparators>({.radix = radix});
			
 
				+    recovered_from_error = true;
			
 
				+  };
			
 
				+
			
 
				+  // For decimal and hexadecimal digit sequences, digit separators must form
			
 
				+  // groups of 3 or 4 digits (4 or 5 characters), respectively.
			
 
				+  int stride = (radix == 10 ? 4 : 5);
			
 
				+  int remaining_digit_separators = num_digit_separators;
			
 
				+  for (auto pos = text.end(); pos - text.begin() >= stride; /*in loop*/) {
			
 
				+    pos -= stride;
			
 
				+    if (*pos != '_')
			
 
				+      return diagnose_irregular_digit_separators();
			
 
				+
			
 
				+    --remaining_digit_separators;
			
 
				+  }
			
 
				+
			
 
				+  // Check there weren't any other digit separators.
			
 
				+  if (remaining_digit_separators)
			
 
				+    diagnose_irregular_digit_separators();
			
 
				+};
			
 
				+
			
 
				+// Check that we don't have a '0' prefix on a non-zero decimal integer.
			
 
				+auto NumericLiteralToken::Parser::CheckLeadingZero() -> bool {
			
 
				+  if (radix == 10 && int_part.startswith("0") && int_part != "0") {
			
 
				+    emitter.EmitError<UnknownBaseSpecifier>();
			
 
				+    return false;
			
 
				+  }
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+// Check the integer part (before the '.', if any) is valid.
			
 
				+auto NumericLiteralToken::Parser::CheckIntPart() -> bool {
			
 
				+  auto int_result = CheckDigitSequence(int_part, radix);
			
 
				+  mantissa_needs_cleaning |= int_result.has_digit_separators;
			
 
				+  return int_result.ok;
			
 
				+}
			
 
				+
			
 
				+// Check the fractional part (after the '.' and before the exponent, if any)
			
 
				+// is valid.
			
 
				+auto NumericLiteralToken::Parser::CheckFractionalPart() -> bool {
			
 
				+  if (IsInteger()) {
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  if (radix == 2) {
			
 
				+    emitter.EmitError<BinaryRealLiteral>();
			
 
				+    recovered_from_error = true;
			
 
				+    // Carry on and parse the binary real literal anyway.
			
 
				+  }
			
 
				+
			
 
				+  // We need to remove a '.' from the mantissa.
			
 
				+  mantissa_needs_cleaning = true;
			
 
				+
			
 
				+  return CheckDigitSequence(fract_part, radix,
			
 
				+                            /*allow_digit_separators=*/false)
			
 
				+      .ok;
			
 
				+}
			
 
				+
			
 
				+// Check the exponent part (if any) is valid.
			
 
				+auto NumericLiteralToken::Parser::CheckExponentPart() -> bool {
			
 
				+  if (literal.exponent == static_cast<int>(literal.text.size())) {
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  char expected_exponent_kind = (radix == 10 ? 'e' : 'p');
			
 
				+  if (literal.text[literal.exponent] != expected_exponent_kind) {
			
 
				+    emitter.EmitError<WrongRealLiteralExponent>(
			
 
				+        {.expected = expected_exponent_kind});
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  auto exponent_result = CheckDigitSequence(exponent_part, 10);
			
 
				+  exponent_needs_cleaning = exponent_result.has_digit_separators;
			
 
				+  return exponent_result.ok;
			
 
				+}
			
 
				+
			
 
				+}  // namespace Carbon
			
--- a/lexer/numeric_literal.h
+++ b/lexer/numeric_literal.h
@@ -0,0 +1,127 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#ifndef LEXER_NUMERIC_LITERAL_H_
			
 
				+#define LEXER_NUMERIC_LITERAL_H_
			
 
				+
			
 
				+#include <utility>
			
 
				+
			
 
				+#include "diagnostics/diagnostic_emitter.h"
			
 
				+#include "llvm/ADT/APInt.h"
			
 
				+#include "llvm/ADT/Optional.h"
			
 
				+#include "llvm/ADT/StringRef.h"
			
 
				+
			
 
				+namespace Carbon {
			
 
				+
			
 
				+// A numeric literal token that has been extracted from a source buffer.
			
 
				+class NumericLiteralToken {
			
 
				+ public:
			
 
				+  // Get the text corresponding to this literal.
			
 
				+  llvm::StringRef Text() const { return text; }
			
 
				+
			
 
				+  // Extract a numeric literal from the given text, if it has a suitable form.
			
 
				+  static auto Lex(llvm::StringRef source_text)
			
 
				+      -> llvm::Optional<NumericLiteralToken>;
			
 
				+
			
 
				+  class Parser;
			
 
				+
			
 
				+ private:
			
 
				+  NumericLiteralToken() {}
			
 
				+
			
 
				+  // The text of the token.
			
 
				+  llvm::StringRef text;
			
 
				+
			
 
				+  // The offset of the '.'. Set to text.size() if none is present.
			
 
				+  int radix_point;
			
 
				+
			
 
				+  // The offset of the alphabetical character introducing the exponent. In a
			
 
				+  // valid literal, this will be an 'e' or a 'p', and may be followed by a '+'
			
 
				+  // or a '-', but for error recovery, this may simply be the last lowercase
			
 
				+  // letter in the invalid token. Always greater than or equal to radix_point.
			
 
				+  // Set to text.size() if none is present.
			
 
				+  int exponent;
			
 
				+};
			
 
				+
			
 
				+// Parser for numeric literal tokens.
			
 
				+//
			
 
				+// Responsible for checking that a numeric literal is valid and meaningful and
			
 
				+// either diagnosing or extracting its meaning.
			
 
				+class NumericLiteralToken::Parser {
			
 
				+ public:
			
 
				+  Parser(DiagnosticEmitter& emitter, NumericLiteralToken literal);
			
 
				+
			
 
				+  auto IsInteger() -> bool {
			
 
				+    return literal.radix_point == static_cast<int>(literal.text.size());
			
 
				+  }
			
 
				+
			
 
				+  enum CheckResult {
			
 
				+    // The token is valid.
			
 
				+    Valid,
			
 
				+    // The token is invalid, but we've diagnosed and recovered from the error.
			
 
				+    RecoverableError,
			
 
				+    // The token is invalid, and we've diagnosed, but we can't assign meaning
			
 
				+    // to it.
			
 
				+    UnrecoverableError,
			
 
				+  };
			
 
				+
			
 
				+  // Check that the numeric literal token is syntactically valid and
			
 
				+  // meaningful, and diagnose if not.
			
 
				+  auto Check() -> CheckResult;
			
 
				+
			
 
				+  // Get the radix of this token. One of 2, 10, or 16.
			
 
				+  auto GetRadix() -> int { return radix; }
			
 
				+
			
 
				+  // Get the mantissa of this token's value.
			
 
				+  auto GetMantissa() -> llvm::APInt;
			
 
				+
			
 
				+  // Get the exponent of this token's value. This is always zero for an integer
			
 
				+  // literal.
			
 
				+  auto GetExponent() -> llvm::APInt;
			
 
				+
			
 
				+ private:
			
 
				+  struct CheckDigitSequenceResult {
			
 
				+    bool ok;
			
 
				+    bool has_digit_separators = false;
			
 
				+  };
			
 
				+
			
 
				+  auto CheckDigitSequence(llvm::StringRef text, int radix,
			
 
				+                          bool allow_digit_separators = true)
			
 
				+      -> CheckDigitSequenceResult;
			
 
				+  auto CheckDigitSeparatorPlacement(llvm::StringRef text, int radix,
			
 
				+                                    int num_digit_separators) -> void;
			
 
				+  auto CheckLeadingZero() -> bool;
			
 
				+  auto CheckIntPart() -> bool;
			
 
				+  auto CheckFractionalPart() -> bool;
			
 
				+  auto CheckExponentPart() -> bool;
			
 
				+
			
 
				+ private:
			
 
				+  DiagnosticEmitter& emitter;
			
 
				+  NumericLiteralToken literal;
			
 
				+
			
 
				+  // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
			
 
				+  // or '0x', respectively.
			
 
				+  int radix = 10;
			
 
				+
			
 
				+  // The various components of a numeric literal:
			
 
				+  //
			
 
				+  //     [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
			
 
				+  llvm::StringRef int_part;
			
 
				+  llvm::StringRef fract_part;
			
 
				+  llvm::StringRef exponent_part;
			
 
				+
			
 
				+  // Do we need to remove any special characters (digit separator or radix
			
 
				+  // point) before interpreting the mantissa or exponent as an integer?
			
 
				+  bool mantissa_needs_cleaning = false;
			
 
				+  bool exponent_needs_cleaning = false;
			
 
				+
			
 
				+  // True if we found a `-` before `exponent_part`.
			
 
				+  bool exponent_is_negative = false;
			
 
				+
			
 
				+  // True if we produced an error but recovered.
			
 
				+  bool recovered_from_error = false;
			
 
				+};
			
 
				+
			
 
				+}  // namespace Carbon
			
 
				+
			
 
				+#endif  // LEXER_NUMERIC_LITERAL_H_
			
--- a/lexer/numeric_literal_test.cpp
+++ b/lexer/numeric_literal_test.cpp
@@ -0,0 +1,261 @@
 
				+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
			
 
				+// Exceptions. See /LICENSE for license information.
			
 
				+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
			
 
				+
			
 
				+#include "lexer/numeric_literal.h"
			
 
				+
			
 
				+#include <iterator>
			
 
				+
			
 
				+#include "diagnostics/diagnostic_emitter.h"
			
 
				+#include "gmock/gmock.h"
			
 
				+#include "gtest/gtest.h"
			
 
				+
			
 
				+namespace Carbon {
			
 
				+namespace {
			
 
				+
			
 
				+struct NumericLiteralTest : ::testing::Test {
			
 
				+  auto Lex(llvm::StringRef text) -> NumericLiteralToken {
			
 
				+    llvm::Optional<NumericLiteralToken> result = NumericLiteralToken::Lex(text);
			
 
				+    assert(result);
			
 
				+    EXPECT_EQ(result->Text(), text);
			
 
				+    return *result;
			
 
				+  }
			
 
				+
			
 
				+  auto Parse(llvm::StringRef text) -> NumericLiteralToken::Parser {
			
 
				+    return NumericLiteralToken::Parser(ConsoleDiagnosticEmitter(), Lex(text));
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+TEST_F(NumericLiteralTest, HandlesIntegerLiteral) {
			
 
				+  struct Testcase {
			
 
				+    llvm::StringLiteral token;
			
 
				+    uint64_t value;
			
 
				+    int radix;
			
 
				+  };
			
 
				+  Testcase testcases[] = {
			
 
				+      {.token = "12", .value = 12, .radix = 10},
			
 
				+      {.token = "0x12_3ABC", .value = 0x12'3ABC, .radix = 16},
			
 
				+      {.token = "0b10_10_11", .value = 0b10'10'11, .radix = 2},
			
 
				+      {.token = "1_234_567", .value = 1'234'567, .radix = 10},
			
 
				+  };
			
 
				+  for (Testcase testcase : testcases) {
			
 
				+    auto parser = Parse(testcase.token);
			
 
				+    EXPECT_EQ(parser.Check(), parser.Valid) << testcase.token;
			
 
				+    EXPECT_EQ(parser.IsInteger(), true);
			
 
				+    EXPECT_EQ(parser.GetMantissa().getZExtValue(), testcase.value);
			
 
				+    EXPECT_EQ(parser.GetExponent().getSExtValue(), 0);
			
 
				+    EXPECT_EQ(parser.GetRadix(), testcase.radix);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+TEST_F(NumericLiteralTest, ValidatesBaseSpecifier) {
			
 
				+  llvm::StringLiteral valid[] = {
			
 
				+      // Decimal integer literals.
			
 
				+      "0",
			
 
				+      "1",
			
 
				+      "123456789000000000000000000000000000000000000",
			
 
				+
			
 
				+      // Hexadecimal integer literals.
			
 
				+      "0x0123456789ABCDEF",
			
 
				+      "0x0000000000000000000000000000000",
			
 
				+
			
 
				+      // Binary integer literals.
			
 
				+      "0b10110100101001010",
			
 
				+      "0b0000000",
			
 
				+  };
			
 
				+  for (llvm::StringLiteral literal : valid) {
			
 
				+    auto parser = Parse(literal);
			
 
				+    EXPECT_EQ(parser.Check(), parser.Valid) << literal;
			
 
				+  }
			
 
				+
			
 
				+  llvm::StringLiteral invalid[] = {
			
 
				+      "00",  "0X123",    "0o123",          "0B1",
			
 
				+      "007", "123L",     "123456789A",     "0x",
			
 
				+      "0b",  "0x123abc", "0b011101201001", "0b10A",
			
 
				+      "0x_", "0b_",
			
 
				+  };
			
 
				+  for (llvm::StringLiteral literal : invalid) {
			
 
				+    auto parser = Parse(literal);
			
 
				+    EXPECT_EQ(parser.Check(), parser.UnrecoverableError) << literal;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+TEST_F(NumericLiteralTest, ValidatesIntegerDigitSeparators) {
			
 
				+  llvm::StringLiteral valid[] = {
			
 
				+      // Decimal literals optionally have digit separators every 3 places.
			
 
				+      "1_234",
			
 
				+      "123_456",
			
 
				+      "1_234_567",
			
 
				+
			
 
				+      // Hexadecimal literals optionally have digit separators every 4 places.
			
 
				+      "0x1_0000",
			
 
				+      "0x1000_0000",
			
 
				+      "0x1_0000_0000",
			
 
				+
			
 
				+      // Binary integer literals can have digit separators anywhere..
			
 
				+      "0b1_0_1_0_1_0",
			
 
				+      "0b111_0000",
			
 
				+  };
			
 
				+  for (llvm::StringLiteral literal : valid) {
			
 
				+    auto parser = Parse(literal);
			
 
				+    EXPECT_EQ(parser.Check(), parser.Valid) << literal;
			
 
				+  }
			
 
				+
			
 
				+  llvm::StringLiteral invalid[] = {
			
 
				+      // Decimal literals.
			
 
				+      "12_34",
			
 
				+      "123_4_6_789",
			
 
				+      "12_3456_789",
			
 
				+      "12__345",
			
 
				+      "1_",
			
 
				+
			
 
				+      // Hexadecimal literals.
			
 
				+      "0x_1234",
			
 
				+      "0x123_",
			
 
				+      "0x12_3",
			
 
				+      "0x_234_5678",
			
 
				+      "0x1234_567",
			
 
				+
			
 
				+      // Binary literals.
			
 
				+      "0b_10101",
			
 
				+      "0b1__01",
			
 
				+      "0b1011_",
			
 
				+      "0b1_01_01_",
			
 
				+  };
			
 
				+  for (llvm::StringLiteral literal : invalid) {
			
 
				+    auto parser = Parse(literal);
			
 
				+    EXPECT_EQ(parser.Check(), parser.RecoverableError) << literal;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+TEST_F(NumericLiteralTest, HandlesRealLiteral) {
			
 
				+  struct Testcase {
			
 
				+    llvm::StringLiteral token;
			
 
				+    uint64_t mantissa;
			
 
				+    int64_t exponent;
			
 
				+    unsigned radix;
			
 
				+  };
			
 
				+  Testcase testcases[] = {
			
 
				+      // Decimal real literals.
			
 
				+      {.token = "0.0", .mantissa = 0, .exponent = -1, .radix = 10},
			
 
				+      {.token = "12.345", .mantissa = 12345, .exponent = -3, .radix = 10},
			
 
				+      {.token = "12.345e6", .mantissa = 12345, .exponent = 3, .radix = 10},
			
 
				+      {.token = "12.345e+6", .mantissa = 12345, .exponent = 3, .radix = 10},
			
 
				+      {.token = "1_234.5e-2", .mantissa = 12345, .exponent = -3, .radix = 10},
			
 
				+      {.token = "1.0e-2_000_000",
			
 
				+       .mantissa = 10,
			
 
				+       .exponent = -2'000'001,
			
 
				+       .radix = 10},
			
 
				+
			
 
				+      // Hexadecimal real literals.
			
 
				+      {.token = "0x1_2345_6789.CDEF",
			
 
				+       .mantissa = 0x1'2345'6789'CDEF,
			
 
				+       .exponent = -16,
			
 
				+       .radix = 16},
			
 
				+      {.token = "0x0.0001p4", .mantissa = 1, .exponent = -12, .radix = 16},
			
 
				+      {.token = "0x0.0001p+4", .mantissa = 1, .exponent = -12, .radix = 16},
			
 
				+      {.token = "0x0.0001p-4", .mantissa = 1, .exponent = -20, .radix = 16},
			
 
				+      // The exponent here works out as exactly INT64_MIN.
			
 
				+      {.token = "0x1.01p-9223372036854775800",
			
 
				+       .mantissa = 0x101,
			
 
				+       .exponent = -9223372036854775807L - 1L,
			
 
				+       .radix = 16},
			
 
				+      // The exponent here doesn't fit in a signed 64-bit integer until we
			
 
				+      // adjust for the radix point.
			
 
				+      {.token = "0x1.01p9223372036854775809",
			
 
				+       .mantissa = 0x101,
			
 
				+       .exponent = 9223372036854775801L,
			
 
				+       .radix = 16},
			
 
				+
			
 
				+      // Binary real literals. These are invalid, but we accept them for error
			
 
				+      // recovery.
			
 
				+      {.token = "0b10_11_01.01",
			
 
				+       .mantissa = 0b10110101,
			
 
				+       .exponent = -2,
			
 
				+       .radix = 2},
			
 
				+  };
			
 
				+  for (Testcase testcase : testcases) {
			
 
				+    auto parser = Parse(testcase.token);
			
 
				+    EXPECT_EQ(parser.Check(),
			
 
				+              testcase.radix == 2 ? parser.RecoverableError : parser.Valid)
			
 
				+        << testcase.token;
			
 
				+    EXPECT_EQ(parser.IsInteger(), false);
			
 
				+    EXPECT_EQ(parser.GetMantissa().getZExtValue(), testcase.mantissa);
			
 
				+    EXPECT_EQ(parser.GetExponent().getSExtValue(), testcase.exponent);
			
 
				+    EXPECT_EQ(parser.GetRadix(), testcase.radix);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+TEST_F(NumericLiteralTest, HandlesRealLiteralOverflow) {
			
 
				+  llvm::StringLiteral input = "0x1.000001p-9223372036854775800";
			
 
				+  auto parser = Parse(input);
			
 
				+  EXPECT_EQ(parser.Check(), parser.Valid);
			
 
				+  EXPECT_EQ(parser.GetMantissa(), 0x1000001);
			
 
				+  EXPECT_EQ((parser.GetExponent() + 9223372036854775800).getSExtValue(), -24);
			
 
				+  EXPECT_EQ(parser.GetRadix(), 16);
			
 
				+}
			
 
				+
			
 
				+TEST_F(NumericLiteralTest, ValidatesRealLiterals) {
			
 
				+  llvm::StringLiteral invalid_digit_separators[] = {
			
 
				+      // Invalid digit separators.
			
 
				+      "12_34.5",     "123.4_567", "123.456_7", "1_2_3.4",
			
 
				+      "123.4e56_78", "0x12_34.5", "0x12.3_4",  "0x12.34p5_6",
			
 
				+  };
			
 
				+  for (llvm::StringLiteral literal : invalid_digit_separators) {
			
 
				+    auto parser = Parse(literal);
			
 
				+    EXPECT_EQ(parser.Check(), parser.RecoverableError) << literal;
			
 
				+  }
			
 
				+
			
 
				+  llvm::StringLiteral invalid[] = {
			
 
				+      // No digits in integer part.
			
 
				+      "0x.0",
			
 
				+      "0b.0",
			
 
				+      "0x_.0",
			
 
				+      "0b_.0",
			
 
				+
			
 
				+      // No digits in fractional part.
			
 
				+      "0.e",
			
 
				+      "0.e0",
			
 
				+      "0.e+0",
			
 
				+      "0x0.p",
			
 
				+      "0x0.p-0",
			
 
				+
			
 
				+      // Invalid digits in mantissa.
			
 
				+      "123A.4",
			
 
				+      "123.4A",
			
 
				+      "123A.4e0",
			
 
				+      "123.4Ae0",
			
 
				+      "0x123ABCDEFG.0",
			
 
				+      "0x123.ABCDEFG",
			
 
				+      "0x123ABCDEFG.0p0",
			
 
				+      "0x123.ABCDEFGp0",
			
 
				+
			
 
				+      // Invalid exponent letter.
			
 
				+      "0.0f0",
			
 
				+      "0.0p0",
			
 
				+      "0.0z+0",
			
 
				+      "0x0.0e0",
			
 
				+      "0x0.0f0",
			
 
				+      "0x0.0z-0",
			
 
				+
			
 
				+      // No digits in exponent part.
			
 
				+      "0.0e",
			
 
				+      "0x0.0p",
			
 
				+      "0.0e_",
			
 
				+      "0x0.0p_",
			
 
				+
			
 
				+      // Invalid digits in exponent part.
			
 
				+      "0.0eHELLO",
			
 
				+      "0.0eA",
			
 
				+      "0.0e+A",
			
 
				+      "0x0.0pA",
			
 
				+      "0x0.0p-A",
			
 
				+  };
			
 
				+  for (llvm::StringLiteral literal : invalid) {
			
 
				+    auto parser = Parse(literal);
			
 
				+    EXPECT_EQ(parser.Check(), parser.UnrecoverableError) << literal;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+}  // namespace
			
 
				+}  // namespace Carbon
			
--- a/lexer/tokenized_buffer.cpp
+++ b/lexer/tokenized_buffer.cpp
@@ -5,12 +5,11 @@
 
				 #include "lexer/tokenized_buffer.h"
			
 
				 
			
 
				 #include <algorithm>
			
 
				-#include <bitset>
			
 
				 #include <cmath>
			
 
				 #include <iterator>
			
 
				 #include <string>
			
 
				 
			
 
				-#include "llvm/ADT/StringExtras.h"
			
 
				+#include "lexer/numeric_literal.h"
			
 
				 #include "llvm/ADT/StringRef.h"
			
 
				 #include "llvm/ADT/StringSwitch.h"
			
 
				 #include "llvm/ADT/Twine.h"
			
@@ -46,78 +45,6 @@ struct MismatchedClosing : SimpleDiagnostic<MismatchedClosing> {
 
				       "Closing symbol does not match most recent opening symbol.";
			
 
				 };
			
 
				 
			
 
				-struct EmptyDigitSequence : SimpleDiagnostic<EmptyDigitSequence> {
			
 
				-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				-  static constexpr llvm::StringLiteral Message =
			
 
				-      "Empty digit sequence in numeric literal.";
			
 
				-};
			
 
				-
			
 
				-struct InvalidDigit {
			
 
				-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				-
			
 
				-  struct Substitutions {
			
 
				-    char digit;
			
 
				-    int radix;
			
 
				-  };
			
 
				-  static auto Format(const Substitutions& subst) -> std::string {
			
 
				-    return llvm::formatv("Invalid digit '{0}' in {1} numeric literal.",
			
 
				-                         subst.digit,
			
 
				-                         (subst.radix == 2
			
 
				-                              ? "binary"
			
 
				-                              : subst.radix == 16 ? "hexadecimal" : "decimal"))
			
 
				-        .str();
			
 
				-  }
			
 
				-};
			
 
				-
			
 
				-struct InvalidDigitSeparator : SimpleDiagnostic<InvalidDigitSeparator> {
			
 
				-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				-  static constexpr llvm::StringLiteral Message =
			
 
				-      "Misplaced digit separator in numeric literal.";
			
 
				-};
			
 
				-
			
 
				-struct IrregularDigitSeparators {
			
 
				-  static constexpr llvm::StringLiteral ShortName =
			
 
				-      "syntax-irregular-digit-separators";
			
 
				-
			
 
				-  struct Substitutions {
			
 
				-    int radix;
			
 
				-  };
			
 
				-  static auto Format(const Substitutions& subst) -> std::string {
			
 
				-    assert((subst.radix == 10 || subst.radix == 16) && "unexpected radix");
			
 
				-    return llvm::formatv(
			
 
				-               "Digit separators in {0} number should appear every {1} "
			
 
				-               "characters from the right.",
			
 
				-               (subst.radix == 10 ? "decimal" : "hexadecimal"),
			
 
				-               (subst.radix == 10 ? "3" : "4"))
			
 
				-        .str();
			
 
				-  }
			
 
				-};
			
 
				-
			
 
				-struct UnknownBaseSpecifier : SimpleDiagnostic<UnknownBaseSpecifier> {
			
 
				-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				-  static constexpr llvm::StringLiteral Message =
			
 
				-      "Unknown base specifier in numeric literal.";
			
 
				-};
			
 
				-
			
 
				-struct BinaryRealLiteral : SimpleDiagnostic<BinaryRealLiteral> {
			
 
				-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				-  static constexpr llvm::StringLiteral Message =
			
 
				-      "Binary real number literals are not supported.";
			
 
				-};
			
 
				-
			
 
				-struct WrongRealLiteralExponent {
			
 
				-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
			
 
				-
			
 
				-  struct Substitutions {
			
 
				-    char expected;
			
 
				-  };
			
 
				-  static auto Format(const Substitutions& subst) -> std::string {
			
 
				-    return llvm::formatv("Expected '{0}' to introduce exponent.",
			
 
				-                         subst.expected)
			
 
				-        .str();
			
 
				-  }
			
 
				-};
			
 
				-
			
 
				 struct UnrecognizedCharacters : SimpleDiagnostic<UnrecognizedCharacters> {
			
 
				   static constexpr llvm::StringLiteral ShortName =
			
 
				       "syntax-unrecognized-characters";
			
@@ -129,378 +56,6 @@ struct UnrecognizedCharacters : SimpleDiagnostic<UnrecognizedCharacters> {
 
				 // acceptable whitespace.
			
 
				 static bool isSpace(char c) { return c == ' ' || c == '\n' || c == '\t'; }
			
 
				 
			
 
				-static bool isLower(char c) { return 'a' <= c && c <= 'z'; }
			
 
				-
			
 
				-namespace {
			
 
				-struct NumericLiteral {
			
 
				-  llvm::StringRef text;
			
 
				-
			
 
				-  // The offset of the '.'. Set to text.size() if none is present.
			
 
				-  int radix_point;
			
 
				-
			
 
				-  // The offset of the alphabetical character introducing the exponent. In a
			
 
				-  // valid literal, this will be an 'e' or a 'p', and may be followed by a '+'
			
 
				-  // or a '-', but for error recovery, this may simply be the last lowercase
			
 
				-  // letter in the invalid token. Always greater than or equal to radix_point.
			
 
				-  // Set to text.size() if none is present.
			
 
				-  int exponent;
			
 
				-};
			
 
				-}  // namespace
			
 
				-
			
 
				-static auto TakeLeadingNumericLiteral(llvm::StringRef source_text)
			
 
				-    -> NumericLiteral {
			
 
				-  NumericLiteral result;
			
 
				-
			
 
				-  if (source_text.empty() || !llvm::isDigit(source_text.front()))
			
 
				-    return result;
			
 
				-
			
 
				-  bool seen_plus_minus = false;
			
 
				-  bool seen_radix_point = false;
			
 
				-  bool seen_potential_exponent = false;
			
 
				-
			
 
				-  // Greedily consume all following characters that might be part of a numeric
			
 
				-  // literal. This allows us to produce better diagnostics on invalid literals.
			
 
				-  //
			
 
				-  // TODO(zygoloid): Update lexical rules to specify that a numeric literal
			
 
				-  // cannot be immediately followed by an alphanumeric character.
			
 
				-  int i = 1, n = source_text.size();
			
 
				-  for (; i != n; ++i) {
			
 
				-    char c = source_text[i];
			
 
				-    if (llvm::isAlnum(c) || c == '_') {
			
 
				-      if (isLower(c) && seen_radix_point && !seen_plus_minus) {
			
 
				-        result.exponent = i;
			
 
				-        seen_potential_exponent = true;
			
 
				-      }
			
 
				-      continue;
			
 
				-    }
			
 
				-
			
 
				-    // Exactly one `.` can be part of the literal, but only if it's followed by
			
 
				-    // an alphanumeric character.
			
 
				-    if (c == '.' && i + 1 != n && llvm::isAlnum(source_text[i + 1]) &&
			
 
				-        !seen_radix_point) {
			
 
				-      result.radix_point = i;
			
 
				-      seen_radix_point = true;
			
 
				-      continue;
			
 
				-    }
			
 
				-
			
 
				-    // A `+` or `-` continues the literal only if it's preceded by a lowercase
			
 
				-    // letter (which will be 'e' or 'p' or part of an invalid literal) and
			
 
				-    // followed by an alphanumeric character. This '+' or '-' cannot be an
			
 
				-    // operator because a literal cannot end in a lowercase letter.
			
 
				-    if ((c == '+' || c == '-') && seen_potential_exponent &&
			
 
				-        result.exponent == i - 1 && i + 1 != n &&
			
 
				-        llvm::isAlnum(source_text[i + 1])) {
			
 
				-      // This is not possible because we don't update result.exponent after we
			
 
				-      // see a '+' or '-'.
			
 
				-      assert(!seen_plus_minus && "should only consume one + or -");
			
 
				-      seen_plus_minus = true;
			
 
				-      continue;
			
 
				-    }
			
 
				-
			
 
				-    break;
			
 
				-  }
			
 
				-
			
 
				-  result.text = source_text.substr(0, i);
			
 
				-  if (!seen_radix_point)
			
 
				-    result.radix_point = i;
			
 
				-  if (!seen_potential_exponent)
			
 
				-    result.exponent = i;
			
 
				-
			
 
				-  return result;
			
 
				-}
			
 
				-
			
 
				-namespace {
			
 
				-// Parser for numeric literal tokens.
			
 
				-//
			
 
				-// Responsible for checking that a numeric literal is valid and meaningful and
			
 
				-// either diagnosing or extracting its meaning.
			
 
				-class NumericLiteralParser {
			
 
				- public:
			
 
				-  NumericLiteralParser(DiagnosticEmitter& emitter, NumericLiteral literal)
			
 
				-      : emitter(emitter), literal(literal) {
			
 
				-    int_part = literal.text.substr(0, literal.radix_point);
			
 
				-    if (int_part.consume_front("0x")) {
			
 
				-      radix = 16;
			
 
				-    } else if (int_part.consume_front("0b")) {
			
 
				-      radix = 2;
			
 
				-    }
			
 
				-
			
 
				-    fract_part = literal.text.substr(
			
 
				-        literal.radix_point + 1, literal.exponent - literal.radix_point - 1);
			
 
				-
			
 
				-    exponent_part = literal.text.substr(literal.exponent + 1);
			
 
				-    if (!exponent_part.consume_front("+")) {
			
 
				-      exponent_is_negative = exponent_part.consume_front("-");
			
 
				-    }
			
 
				-  }
			
 
				-
			
 
				-  auto IsInteger() -> bool {
			
 
				-    return literal.radix_point == static_cast<int>(literal.text.size());
			
 
				-  }
			
 
				-
			
 
				-  enum CheckResult {
			
 
				-    // The token is valid.
			
 
				-    Valid,
			
 
				-    // The token is invalid, but we've diagnosed and recovered from the error.
			
 
				-    RecoverableError,
			
 
				-    // The token is invalid, and we've diagnosed, but we can't assign meaning
			
 
				-    // to it.
			
 
				-    UnrecoverableError,
			
 
				-  };
			
 
				-
			
 
				-  // Check that the numeric literal token is syntactically valid and
			
 
				-  // meaningful, and diagnose if not.
			
 
				-  auto Check() -> CheckResult {
			
 
				-    if (!CheckLeadingZero() || !CheckIntPart() || !CheckFractionalPart() ||
			
 
				-        !CheckExponentPart())
			
 
				-      return UnrecoverableError;
			
 
				-    return recovered_from_error ? RecoverableError : Valid;
			
 
				-  }
			
 
				-
			
 
				-  auto GetMantissa() -> llvm::APInt {
			
 
				-    const char* end = IsInteger() ? int_part.end() : fract_part.end();
			
 
				-    llvm::StringRef digits(int_part.begin(), end - int_part.begin());
			
 
				-    return ParseInteger(digits, radix, mantissa_needs_cleaning);
			
 
				-  }
			
 
				-
			
 
				-  auto GetExponent() -> llvm::APInt {
			
 
				-    // Compute the effective exponent from the specified exponent, if any,
			
 
				-    // and the position of the radix point.
			
 
				-    llvm::APInt exponent(64, 0);
			
 
				-    if (!exponent_part.empty()) {
			
 
				-      exponent = ParseInteger(exponent_part, 10, exponent_needs_cleaning);
			
 
				-
			
 
				-      // The exponent is a signed integer, and the number we just parsed is
			
 
				-      // non-negative, so ensure we have a wide enough representation to
			
 
				-      // include a sign bit. Also make sure the exponent isn't too narrow so
			
 
				-      // the calculation below can't lose information through overflow.
			
 
				-      if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
			
 
				-        exponent = exponent.zext(std::max(64u, exponent.getBitWidth() + 1));
			
 
				-      }
			
 
				-      if (exponent_is_negative) {
			
 
				-        exponent.negate();
			
 
				-      }
			
 
				-    }
			
 
				-
			
 
				-    // Each character after the decimal point reduces the effective exponent.
			
 
				-    int excess_exponent = fract_part.size();
			
 
				-    if (radix == 16) {
			
 
				-      excess_exponent *= 4;
			
 
				-    }
			
 
				-    exponent -= excess_exponent;
			
 
				-    if (exponent_is_negative && !exponent.isNegative()) {
			
 
				-      // We overflowed. Note that we can only overflow by a little, and only
			
 
				-      // from negative to positive, because exponent is at least 64 bits wide
			
 
				-      // and excess_exponent is bounded above by four times the size of the
			
 
				-      // input buffer, which we assume fits into 32 bits.
			
 
				-      exponent = exponent.zext(exponent.getBitWidth() + 1);
			
 
				-      exponent.setSignBit();
			
 
				-    }
			
 
				-    return exponent;
			
 
				-  }
			
 
				-
			
 
				- private:
			
 
				-  struct CheckDigitSequenceResult {
			
 
				-    bool ok;
			
 
				-    bool has_digit_separators = false;
			
 
				-  };
			
 
				-
			
 
				-  // Check that a digit sequence is valid: that it contains one or more digits,
			
 
				-  // contains only digits in the specified base, and that any digit separators
			
 
				-  // are present and correctly positioned.
			
 
				-  auto CheckDigitSequence(llvm::StringRef text, int radix,
			
 
				-                          bool allow_digit_separators = true)
			
 
				-      -> CheckDigitSequenceResult {
			
 
				-    assert((radix == 2 || radix == 10 || radix == 16) && "unknown radix");
			
 
				-
			
 
				-    std::bitset<256> valid_digits;
			
 
				-    if (radix == 2) {
			
 
				-      for (char c : "01")
			
 
				-        valid_digits[static_cast<unsigned char>(c)] = true;
			
 
				-    } else if (radix == 10) {
			
 
				-      for (char c : "0123456789")
			
 
				-        valid_digits[static_cast<unsigned char>(c)] = true;
			
 
				-    } else {
			
 
				-      for (char c : "0123456789ABCDEF")
			
 
				-        valid_digits[static_cast<unsigned char>(c)] = true;
			
 
				-    }
			
 
				-
			
 
				-    int num_digit_separators = 0;
			
 
				-
			
 
				-    for (int i = 0, n = text.size(); i != n; ++i) {
			
 
				-      char c = text[i];
			
 
				-      if (valid_digits[static_cast<unsigned char>(c)]) {
			
 
				-        continue;
			
 
				-      }
			
 
				-
			
 
				-      if (c == '_') {
			
 
				-        // A digit separator cannot appear at the start of a digit sequence,
			
 
				-        // next to another digit separator, or at the end.
			
 
				-        if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
			
 
				-            i + 1 == n) {
			
 
				-          emitter.EmitError<InvalidDigitSeparator>();
			
 
				-          recovered_from_error = true;
			
 
				-        }
			
 
				-        ++num_digit_separators;
			
 
				-        continue;
			
 
				-      }
			
 
				-
			
 
				-      emitter.EmitError<InvalidDigit>({.digit = c, .radix = radix});
			
 
				-      return {.ok = false};
			
 
				-    }
			
 
				-
			
 
				-    if (num_digit_separators == static_cast<int>(text.size())) {
			
 
				-      emitter.EmitError<EmptyDigitSequence>();
			
 
				-      return {.ok = false};
			
 
				-    }
			
 
				-
			
 
				-    // Check that digit separators occur in exactly the expected positions.
			
 
				-    if (num_digit_separators && radix != 2)
			
 
				-      CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
			
 
				-
			
 
				-    return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
			
 
				-  }
			
 
				-
			
 
				-  // Given a number with digit separators, check that the digit separators are
			
 
				-  // correctly positioned.
			
 
				-  auto CheckDigitSeparatorPlacement(llvm::StringRef text, int radix,
			
 
				-                                    int num_digit_separators) -> void {
			
 
				-    assert((radix == 10 || radix == 16) &&
			
 
				-           "unexpected radix for digit separator checks");
			
 
				-    assert(std::count(text.begin(), text.end(), '_') == num_digit_separators &&
			
 
				-           "given wrong number of digit separators");
			
 
				-
			
 
				-    auto diagnose_irregular_digit_separators = [&] {
			
 
				-      emitter.EmitError<IrregularDigitSeparators>({.radix = radix});
			
 
				-      recovered_from_error = true;
			
 
				-    };
			
 
				-
			
 
				-    // For decimal and hexadecimal digit sequences, digit separators must form
			
 
				-    // groups of 3 or 4 digits (4 or 5 characters), respectively.
			
 
				-    int stride = (radix == 10 ? 4 : 5);
			
 
				-    int remaining_digit_separators = num_digit_separators;
			
 
				-    for (auto pos = text.end(); pos - text.begin() >= stride; /*in loop*/) {
			
 
				-      pos -= stride;
			
 
				-      if (*pos != '_')
			
 
				-        return diagnose_irregular_digit_separators();
			
 
				-
			
 
				-      --remaining_digit_separators;
			
 
				-    }
			
 
				-
			
 
				-    // Check there weren't any other digit separators.
			
 
				-    if (remaining_digit_separators)
			
 
				-      diagnose_irregular_digit_separators();
			
 
				-  };
			
 
				-
			
 
				-  // Check that we don't have a '0' prefix on a non-zero decimal integer.
			
 
				-  auto CheckLeadingZero() -> bool {
			
 
				-    if (radix == 10 && int_part.startswith("0") && int_part != "0") {
			
 
				-      emitter.EmitError<UnknownBaseSpecifier>();
			
 
				-      return false;
			
 
				-    }
			
 
				-    return true;
			
 
				-  }
			
 
				-
			
 
				-  // Check the integer part (before the '.', if any) is valid.
			
 
				-  auto CheckIntPart() -> bool {
			
 
				-    auto int_result = CheckDigitSequence(int_part, radix);
			
 
				-    mantissa_needs_cleaning |= int_result.has_digit_separators;
			
 
				-    return int_result.ok;
			
 
				-  }
			
 
				-
			
 
				-  // Check the fractional part (after the '.' and before the exponent, if any)
			
 
				-  // is valid.
			
 
				-  auto CheckFractionalPart() -> bool {
			
 
				-    if (IsInteger()) {
			
 
				-      return true;
			
 
				-    }
			
 
				-
			
 
				-    if (radix == 2) {
			
 
				-      emitter.EmitError<BinaryRealLiteral>();
			
 
				-      recovered_from_error = true;
			
 
				-      // Carry on and parse the binary real literal anyway.
			
 
				-    }
			
 
				-
			
 
				-    // We need to remove a '.' from the mantissa.
			
 
				-    mantissa_needs_cleaning = true;
			
 
				-
			
 
				-    return CheckDigitSequence(fract_part, radix,
			
 
				-                              /*allow_digit_separators=*/false)
			
 
				-        .ok;
			
 
				-  }
			
 
				-
			
 
				-  // Check the exponent part (if any) is valid.
			
 
				-  auto CheckExponentPart() -> bool {
			
 
				-    if (literal.exponent == static_cast<int>(literal.text.size())) {
			
 
				-      return true;
			
 
				-    }
			
 
				-
			
 
				-    char expected_exponent_kind = (radix == 10 ? 'e' : 'p');
			
 
				-    if (literal.text[literal.exponent] != expected_exponent_kind) {
			
 
				-      emitter.EmitError<WrongRealLiteralExponent>(
			
 
				-          {.expected = expected_exponent_kind});
			
 
				-      return false;
			
 
				-    }
			
 
				-
			
 
				-    auto exponent_result = CheckDigitSequence(exponent_part, 10);
			
 
				-    exponent_needs_cleaning = exponent_result.has_digit_separators;
			
 
				-    return exponent_result.ok;
			
 
				-  }
			
 
				-
			
 
				-  // Parse a string that is known to be a valid base-radix integer into an
			
 
				-  // APInt.  If needs_cleaning is true, the string may additionally contain '_'
			
 
				-  // and '.' characters that should be ignored.
			
 
				-  //
			
 
				-  // Ignoring '.' is used when parsing a real literal. For example, when
			
 
				-  // parsing 123.456e7, we want to decompose it into an integer mantissa
			
 
				-  // (123456) and an exponent (7 - 3 = 2), and this routine is given the
			
 
				-  // "123.456" to parse as the mantissa.
			
 
				-  static auto ParseInteger(llvm::StringRef digits, int radix,
			
 
				-                           bool needs_cleaning) -> llvm::APInt {
			
 
				-    llvm::SmallString<32> cleaned;
			
 
				-    if (needs_cleaning) {
			
 
				-      cleaned.reserve(digits.size());
			
 
				-      std::remove_copy_if(digits.begin(), digits.end(),
			
 
				-                          std::back_inserter(cleaned),
			
 
				-                          [](char c) { return c == '_' || c == '.'; });
			
 
				-      digits = cleaned;
			
 
				-    }
			
 
				-
			
 
				-    llvm::APInt value;
			
 
				-    if (digits.getAsInteger(radix, value)) {
			
 
				-      llvm_unreachable("should never fail");
			
 
				-    }
			
 
				-    return value;
			
 
				-  }
			
 
				-
			
 
				- private:
			
 
				-  DiagnosticEmitter& emitter;
			
 
				-  NumericLiteral literal;
			
 
				-
			
 
				-  // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
			
 
				-  // or '0x', respectively.
			
 
				-  int radix = 10;
			
 
				-
			
 
				-  // The various components of a numeric literal:
			
 
				-  //
			
 
				-  //     [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
			
 
				-  llvm::StringRef int_part;
			
 
				-  llvm::StringRef fract_part;
			
 
				-  llvm::StringRef exponent_part;
			
 
				-
			
 
				-  // Do we need to remove any special characters (digit separator or radix
			
 
				-  // point) before interpreting the mantissa or exponent as an integer?
			
 
				-  bool mantissa_needs_cleaning = false;
			
 
				-  bool exponent_needs_cleaning = false;
			
 
				-
			
 
				-  // True if we found a `-` before `exponent_part`.
			
 
				-  bool exponent_is_negative = false;
			
 
				-
			
 
				-  // True if we produced an error but recovered.
			
 
				-  bool recovered_from_error = false;
			
 
				-};
			
 
				-}  // namespace
			
 
				-
			
 
				 // Implementation of the lexer logic itself.
			
 
				 //
			
 
				 // The design is that lexing can loop over the source buffer, consuming it into
			
@@ -618,39 +173,41 @@ class TokenizedBuffer::Lexer {
 
				   }
			
 
				 
			
 
				   auto LexNumericLiteral(llvm::StringRef& source_text) -> LexResult {
			
 
				-    NumericLiteral literal = TakeLeadingNumericLiteral(source_text);
			
 
				-    if (literal.text.empty()) {
			
 
				+    llvm::Optional<NumericLiteralToken> literal =
			
 
				+        NumericLiteralToken::Lex(source_text);
			
 
				+    if (!literal) {
			
 
				       return LexResult::NoMatch();
			
 
				     }
			
 
				 
			
 
				     int int_column = current_column;
			
 
				-    current_column += literal.text.size();
			
 
				-    source_text = source_text.drop_front(literal.text.size());
			
 
				+    int token_size = literal->Text().size();
			
 
				+    current_column += token_size;
			
 
				+    source_text = source_text.drop_front(token_size);
			
 
				 
			
 
				     if (!set_indent) {
			
 
				       current_line_info->indent = int_column;
			
 
				       set_indent = true;
			
 
				     }
			
 
				 
			
 
				-    NumericLiteralParser literal_parser(emitter, literal);
			
 
				+    NumericLiteralToken::Parser literal_parser(emitter, *literal);
			
 
				 
			
 
				     switch (literal_parser.Check()) {
			
 
				-      case NumericLiteralParser::UnrecoverableError: {
			
 
				+      case NumericLiteralToken::Parser::UnrecoverableError: {
			
 
				         auto token = buffer.AddToken({
			
 
				             .kind = TokenKind::Error(),
			
 
				             .token_line = current_line,
			
 
				             .column = int_column,
			
 
				-            .error_length = static_cast<int32_t>(literal.text.size()),
			
 
				+            .error_length = token_size,
			
 
				         });
			
 
				         buffer.has_errors = true;
			
 
				         return token;
			
 
				       }
			
 
				 
			
 
				-      case NumericLiteralParser::RecoverableError:
			
 
				+      case NumericLiteralToken::Parser::RecoverableError:
			
 
				         buffer.has_errors = true;
			
 
				         break;
			
 
				 
			
 
				-      case NumericLiteralParser::Valid:
			
 
				+      case NumericLiteralToken::Parser::Valid:
			
 
				         break;
			
 
				     }
			
 
				 
			
@@ -908,7 +465,10 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
 
				       token_info.kind == TokenKind::RealLiteral()) {
			
 
				     auto& line_info = GetLineInfo(token_info.token_line);
			
 
				     int64_t token_start = line_info.start + token_info.column;
			
 
				-    return TakeLeadingNumericLiteral(source->Text().substr(token_start)).text;
			
 
				+    llvm::Optional<NumericLiteralToken> relexed_token =
			
 
				+        NumericLiteralToken::Lex(source->Text().substr(token_start));
			
 
				+    assert(relexed_token && "Could not reform numeric literal token.");
			
 
				+    return relexed_token->Text();
			
 
				   }
			
 
				 
			
 
				   assert(token_info.kind == TokenKind::Identifier() &&
			
--- a/lexer/tokenized_buffer_test.cpp
+++ b/lexer/tokenized_buffer_test.cpp
@@ -77,8 +77,8 @@ TEST_F(LexerTest, TracksLinesAndColumns) {
 
				                       }));
			
 
				 }
			
 
				 
			
 
				-TEST_F(LexerTest, HandlesIntegerLiteral) {
			
 
				-  auto buffer = Lex("12-578\n  1  2\n0x12_3ABC\n0b10_10_11\n1_234_567");
			
 
				+TEST_F(LexerTest, HandlesNumericLiteral) {
			
 
				+  auto buffer = Lex("12-578\n  1  2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
			
 
				   EXPECT_FALSE(buffer.HasErrors());
			
 
				   ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				                           {.kind = TokenKind::IntegerLiteral(),
			
@@ -120,6 +120,11 @@ TEST_F(LexerTest, HandlesIntegerLiteral) {
 
				                            .column = 1,
			
 
				                            .indent_column = 1,
			
 
				                            .text = "1_234_567"},
			
 
				+                          {.kind = TokenKind::RealLiteral(),
			
 
				+                           .line = 6,
			
 
				+                           .column = 1,
			
 
				+                           .indent_column = 1,
			
 
				+                           .text = "1.5e9"},
			
 
				                       }));
			
 
				   auto token_12 = buffer.Tokens().begin();
			
 
				   EXPECT_EQ(buffer.GetIntegerLiteral(*token_12), 12);
			
@@ -135,272 +140,43 @@ TEST_F(LexerTest, HandlesIntegerLiteral) {
 
				   EXPECT_EQ(buffer.GetIntegerLiteral(*token_0b10_10_11), 0b10'10'11);
			
 
				   auto token_1_234_567 = buffer.Tokens().begin() + 7;
			
 
				   EXPECT_EQ(buffer.GetIntegerLiteral(*token_1_234_567), 1'234'567);
			
 
				+  auto token_1_5e9 = buffer.Tokens().begin() + 8;
			
 
				+  auto value_1_5e9 = buffer.GetRealLiteral(*token_1_5e9);
			
 
				+  EXPECT_EQ(value_1_5e9.Mantissa().getZExtValue(), 15);
			
 
				+  EXPECT_EQ(value_1_5e9.Exponent().getSExtValue(), 8);
			
 
				+  EXPECT_EQ(value_1_5e9.IsDecimal(), true);
			
 
				 }
			
 
				 
			
 
				-TEST_F(LexerTest, ValidatesBaseSpecifier) {
			
 
				-  llvm::StringLiteral valid[] = {
			
 
				-      // Decimal integer literals.
			
 
				-      "0",
			
 
				-      "1",
			
 
				-      "123456789000000000000000000000000000000000000",
			
 
				-
			
 
				-      // Hexadecimal integer literals.
			
 
				-      "0x0123456789ABCDEF",
			
 
				-      "0x0000000000000000000000000000000",
			
 
				-
			
 
				-      // Binary integer literals.
			
 
				-      "0b10110100101001010",
			
 
				-      "0b0000000",
			
 
				-  };
			
 
				-  for (llvm::StringLiteral literal : valid) {
			
 
				-    auto buffer = Lex(literal);
			
 
				-    EXPECT_FALSE(buffer.HasErrors()) << literal;
			
 
				-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				-                            {.kind = TokenKind::IntegerLiteral(),
			
 
				-                             .line = 1,
			
 
				-                             .column = 1,
			
 
				-                             .indent_column = 1,
			
 
				-                             .text = literal}}));
			
 
				-  }
			
 
				-
			
 
				-  llvm::StringLiteral invalid[] = {
			
 
				-      "00",  "0X123",    "0o123",          "0B1",
			
 
				-      "007", "123L",     "123456789A",     "0x",
			
 
				-      "0b",  "0x123abc", "0b011101201001", "0b10A",
			
 
				-      "0x_", "0b_",
			
 
				-  };
			
 
				-  for (llvm::StringLiteral literal : invalid) {
			
 
				-    auto buffer = Lex(literal);
			
 
				-    EXPECT_TRUE(buffer.HasErrors()) << literal;
			
 
				-    ASSERT_THAT(
			
 
				-        buffer,
			
 
				-        HasTokens(llvm::ArrayRef<ExpectedToken>{{.kind = TokenKind::Error(),
			
 
				-                                                 .line = 1,
			
 
				-                                                 .column = 1,
			
 
				-                                                 .indent_column = 1,
			
 
				-                                                 .text = literal}}));
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-TEST_F(LexerTest, ValidatesIntegerDigitSeparators) {
			
 
				-  llvm::StringLiteral valid[] = {
			
 
				-      // Decimal literals optionally have digit separators every 3 places.
			
 
				-      "1_234",
			
 
				-      "123_456",
			
 
				-      "1_234_567",
			
 
				-
			
 
				-      // Hexadecimal literals optionally have digit separators every 4 places.
			
 
				-      "0x1_0000",
			
 
				-      "0x1000_0000",
			
 
				-      "0x1_0000_0000",
			
 
				-
			
 
				-      // Binary integer literals can have digit separators anywhere..
			
 
				-      "0b1_0_1_0_1_0",
			
 
				-      "0b111_0000",
			
 
				-  };
			
 
				-  for (llvm::StringLiteral literal : valid) {
			
 
				-    auto buffer = Lex(literal);
			
 
				-    EXPECT_FALSE(buffer.HasErrors()) << literal;
			
 
				-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				-                            {.kind = TokenKind::IntegerLiteral(),
			
 
				-                             .line = 1,
			
 
				-                             .column = 1,
			
 
				-                             .indent_column = 1,
			
 
				-                             .text = literal}}));
			
 
				-  }
			
 
				-
			
 
				-  llvm::StringLiteral invalid[] = {
			
 
				-      // Decimal literals.
			
 
				-      "12_34",
			
 
				-      "123_4_6_789",
			
 
				-      "12_3456_789",
			
 
				-      "12__345",
			
 
				-      "1_",
			
 
				-
			
 
				-      // Hexadecimal literals.
			
 
				-      "0x_1234",
			
 
				-      "0x123_",
			
 
				-      "0x12_3",
			
 
				-      "0x_234_5678",
			
 
				-      "0x1234_567",
			
 
				-
			
 
				-      // Binary literals.
			
 
				-      "0b_10101",
			
 
				-      "0b1__01",
			
 
				-      "0b1011_",
			
 
				-      "0b1_01_01_",
			
 
				-  };
			
 
				-  for (llvm::StringLiteral literal : invalid) {
			
 
				-    auto buffer = Lex(literal);
			
 
				-    EXPECT_TRUE(buffer.HasErrors()) << literal;
			
 
				-    // We expect to produce a token even for a literal containing invalid digit
			
 
				-    // separators, for better error recovery.
			
 
				-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				-                            {.kind = TokenKind::IntegerLiteral(),
			
 
				-                             .line = 1,
			
 
				-                             .column = 1,
			
 
				-                             .indent_column = 1,
			
 
				-                             .text = literal}}));
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-TEST_F(LexerTest, HandlesRealLiteral) {
			
 
				-  struct Testcase {
			
 
				-    llvm::StringLiteral token;
			
 
				-    uint64_t mantissa;
			
 
				-    int64_t exponent;
			
 
				-    unsigned radix;
			
 
				-  };
			
 
				-  Testcase testcases[] = {
			
 
				-      // Decimal real literals.
			
 
				-      {.token = "0.0", .mantissa = 0, .exponent = -1, .radix = 10},
			
 
				-      {.token = "12.345", .mantissa = 12345, .exponent = -3, .radix = 10},
			
 
				-      {.token = "12.345e6", .mantissa = 12345, .exponent = 3, .radix = 10},
			
 
				-      {.token = "12.345e+6", .mantissa = 12345, .exponent = 3, .radix = 10},
			
 
				-      {.token = "1_234.5e-2", .mantissa = 12345, .exponent = -3, .radix = 10},
			
 
				-      {.token = "1.0e-2_000_000",
			
 
				-       .mantissa = 10,
			
 
				-       .exponent = -2'000'001,
			
 
				-       .radix = 10},
			
 
				-
			
 
				-      // Hexadecimal real literals.
			
 
				-      {.token = "0x1_2345_6789.CDEF",
			
 
				-       .mantissa = 0x1'2345'6789'CDEF,
			
 
				-       .exponent = -16,
			
 
				-       .radix = 16},
			
 
				-      {.token = "0x0.0001p4", .mantissa = 1, .exponent = -12, .radix = 16},
			
 
				-      {.token = "0x0.0001p+4", .mantissa = 1, .exponent = -12, .radix = 16},
			
 
				-      {.token = "0x0.0001p-4", .mantissa = 1, .exponent = -20, .radix = 16},
			
 
				-      // The exponent here works out as exactly INT64_MIN.
			
 
				-      {.token = "0x1.01p-9223372036854775800",
			
 
				-       .mantissa = 0x101,
			
 
				-       .exponent = -9223372036854775807L - 1L,
			
 
				-       .radix = 16},
			
 
				-      // The exponent here doesn't fit in a signed 64-bit integer until we
			
 
				-      // adjust for the radix point.
			
 
				-      {.token = "0x1.01p9223372036854775809",
			
 
				-       .mantissa = 0x101,
			
 
				-       .exponent = 9223372036854775801L,
			
 
				-       .radix = 16},
			
 
				-
			
 
				-      // Binary real literals. These are invalid, but we accept them for error
			
 
				-      // recovery.
			
 
				-      {.token = "0b10_11_01.01",
			
 
				-       .mantissa = 0b10110101,
			
 
				-       .exponent = -2,
			
 
				-       .radix = 2},
			
 
				-  };
			
 
				-  for (Testcase testcase : testcases) {
			
 
				-    auto buffer = Lex(testcase.token);
			
 
				-    EXPECT_EQ(buffer.HasErrors(), testcase.radix == 2);
			
 
				-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				-                            {.kind = TokenKind::RealLiteral(),
			
 
				-                             .line = 1,
			
 
				-                             .column = 1,
			
 
				-                             .indent_column = 1,
			
 
				-                             .text = testcase.token},
			
 
				-                        }));
			
 
				-    auto token = buffer.Tokens().begin();
			
 
				-    TokenizedBuffer::RealLiteralValue value = buffer.GetRealLiteral(*token);
			
 
				-    EXPECT_EQ(value.Mantissa().getZExtValue(), testcase.mantissa);
			
 
				-    EXPECT_EQ(value.Exponent().getSExtValue(), testcase.exponent);
			
 
				-    EXPECT_EQ(value.IsDecimal(), testcase.radix == 10);
			
 
				-  }
			
 
				-}
			
 
				-
			
 
				-TEST_F(LexerTest, HandlesRealLiteralOverflow) {
			
 
				-  llvm::StringLiteral input = "0x1.000001p-9223372036854775800";
			
 
				-  auto buffer = Lex(input);
			
 
				-  EXPECT_FALSE(buffer.HasErrors());
			
 
				+TEST_F(LexerTest, HandlesInvalidNumericLiterals) {
			
 
				+  auto buffer = Lex("14x 15_49 0x3.5q 0x3_4.5_6 0ops");
			
 
				+  EXPECT_TRUE(buffer.HasErrors());
			
 
				   ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				-                          {.kind = TokenKind::RealLiteral(),
			
 
				+                          {.kind = TokenKind::Error(),
			
 
				                            .line = 1,
			
 
				                            .column = 1,
			
 
				                            .indent_column = 1,
			
 
				-                           .text = input},
			
 
				+                           .text = "14x"},
			
 
				+                          {.kind = TokenKind::IntegerLiteral(),
			
 
				+                           .line = 1,
			
 
				+                           .column = 5,
			
 
				+                           .indent_column = 1,
			
 
				+                           .text = "15_49"},
			
 
				+                          {.kind = TokenKind::Error(),
			
 
				+                           .line = 1,
			
 
				+                           .column = 11,
			
 
				+                           .indent_column = 1,
			
 
				+                           .text = "0x3.5q"},
			
 
				+                          {.kind = TokenKind::RealLiteral(),
			
 
				+                           .line = 1,
			
 
				+                           .column = 18,
			
 
				+                           .indent_column = 1,
			
 
				+                           .text = "0x3_4.5_6"},
			
 
				+                          {.kind = TokenKind::Error(),
			
 
				+                           .line = 1,
			
 
				+                           .column = 28,
			
 
				+                           .indent_column = 1,
			
 
				+                           .text = "0ops"},
			
 
				                       }));
			
 
				-  auto token = buffer.Tokens().begin();
			
 
				-  TokenizedBuffer::RealLiteralValue value = buffer.GetRealLiteral(*token);
			
 
				-  EXPECT_EQ(value.Mantissa(), 0x1000001);
			
 
				-  EXPECT_EQ((value.Exponent() + 9223372036854775800).getSExtValue(), -24);
			
 
				-  EXPECT_EQ(value.IsDecimal(), false);
			
 
				-}
			
 
				-
			
 
				-TEST_F(LexerTest, ValidatesRealLiterals) {
			
 
				-  llvm::StringLiteral invalid_digit_separators[] = {
			
 
				-      // Invalid digit separators.
			
 
				-      "12_34.5",     "123.4_567", "123.456_7", "1_2_3.4",
			
 
				-      "123.4e56_78", "0x12_34.5", "0x12.3_4",  "0x12.34p5_6",
			
 
				-  };
			
 
				-  for (llvm::StringLiteral literal : invalid_digit_separators) {
			
 
				-    auto buffer = Lex(literal);
			
 
				-    EXPECT_TRUE(buffer.HasErrors()) << literal;
			
 
				-    // We expect to produce a token even for a literal containing invalid digit
			
 
				-    // separators, for better error recovery.
			
 
				-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
			
 
				-                            {.kind = TokenKind::RealLiteral(),
			
 
				-                             .line = 1,
			
 
				-                             .column = 1,
			
 
				-                             .indent_column = 1,
			
 
				-                             .text = literal}}));
			
 
				-  }
			
 
				-
			
 
				-  llvm::StringLiteral invalid[] = {
			
 
				-      // No digits in integer part.
			
 
				-      "0x.0",
			
 
				-      "0b.0",
			
 
				-      "0x_.0",
			
 
				-      "0b_.0",
			
 
				-
			
 
				-      // No digits in fractional part.
			
 
				-      "0.e",
			
 
				-      "0.e0",
			
 
				-      "0.e+0",
			
 
				-      "0x0.p",
			
 
				-      "0x0.p-0",
			
 
				-
			
 
				-      // Invalid digits in mantissa.
			
 
				-      "123A.4",
			
 
				-      "123.4A",
			
 
				-      "123A.4e0",
			
 
				-      "123.4Ae0",
			
 
				-      "0x123ABCDEFG.0",
			
 
				-      "0x123.ABCDEFG",
			
 
				-      "0x123ABCDEFG.0p0",
			
 
				-      "0x123.ABCDEFGp0",
			
 
				-
			
 
				-      // Invalid exponent letter.
			
 
				-      "0.0f0",
			
 
				-      "0.0p0",
			
 
				-      "0.0z+0",
			
 
				-      "0x0.0e0",
			
 
				-      "0x0.0f0",
			
 
				-      "0x0.0z-0",
			
 
				-
			
 
				-      // No digits in exponent part.
			
 
				-      "0.0e",
			
 
				-      "0x0.0p",
			
 
				-      "0.0e_",
			
 
				-      "0x0.0p_",
			
 
				-
			
 
				-      // Invalid digits in exponent part.
			
 
				-      "0.0eHELLO",
			
 
				-      "0.0eA",
			
 
				-      "0.0e+A",
			
 
				-      "0x0.0pA",
			
 
				-      "0x0.0p-A",
			
 
				-  };
			
 
				-  for (llvm::StringLiteral literal : invalid) {
			
 
				-    auto buffer = Lex(literal);
			
 
				-    EXPECT_TRUE(buffer.HasErrors()) << literal;
			
 
				-    ASSERT_THAT(
			
 
				-        buffer,
			
 
				-        HasTokens(llvm::ArrayRef<ExpectedToken>{{.kind = TokenKind::Error(),
			
 
				-                                                 .line = 1,
			
 
				-                                                 .column = 1,
			
 
				-                                                 .indent_column = 1,
			
 
				-                                                 .text = literal}}));
			
 
				-  }
			
 
				 }
			
 
				 
			
 
				 TEST_F(LexerTest, SplitsNumericLiteralsProperly) {