Procházet zdrojové kódy

Move numeric literal parsing out into its own files. (#326)

Richard Smith před 5 roky
rodič
revize
9a873b46de

+ 24 - 0
lexer/BUILD

@@ -26,12 +26,36 @@ cc_test(
     ],
 )
 
+# Library for lexing and checking numeric literal tokens.
+cc_library(
+    name = "numeric_literal",
+    srcs = ["numeric_literal.cpp"],
+    hdrs = ["numeric_literal.h"],
+    deps = [
+        "//diagnostics:diagnostic_emitter",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+# Unit tests for the numeric_literal library.
+cc_test(
+    name = "numeric_literal_test",
+    srcs = ["numeric_literal_test.cpp"],
+    deps = [
+        ":numeric_literal",
+        "//diagnostics:diagnostic_emitter",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:gmock",
+        "@llvm-project//llvm:gtest",
+        "@llvm-project//llvm:gtest_main",
+    ],
+)
+
 cc_library(
     name = "tokenized_buffer",
     srcs = ["tokenized_buffer.cpp"],
     hdrs = ["tokenized_buffer.h"],
     deps = [
         ":token_kind",
+        ":numeric_literal",
         "//diagnostics:diagnostic_emitter",
         "//source:source_buffer",
         "@llvm-project//llvm:Support",

+ 390 - 0
lexer/numeric_literal.cpp

@@ -0,0 +1,390 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "lexer/numeric_literal.h"
+
+#include <bitset>
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace Carbon {
+
+namespace {
+// Diagnostic: a digit sequence in the literal contains no digits.
+struct EmptyDigitSequence : SimpleDiagnostic<EmptyDigitSequence> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+  static constexpr llvm::StringLiteral Message =
+      "Empty digit sequence in numeric literal.";
+};
+
+// Diagnostic: a character is not a valid digit for the literal's radix.
+struct InvalidDigit {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+
+  struct Substitutions {
+    char digit;
+    int radix;
+  };
+  static auto Format(const Substitutions& subst) -> std::string {
+    // Any radix other than 2 or 16 is described as decimal.
+    const char* radix_name = "decimal";
+    if (subst.radix == 2) {
+      radix_name = "binary";
+    } else if (subst.radix == 16) {
+      radix_name = "hexadecimal";
+    }
+    return llvm::formatv("Invalid digit '{0}' in {1} numeric literal.",
+                         subst.digit, radix_name)
+        .str();
+  }
+};
+
+// Diagnostic: a '_' appears where a digit separator is not permitted.
+struct InvalidDigitSeparator : SimpleDiagnostic<InvalidDigitSeparator> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+  static constexpr llvm::StringLiteral Message =
+      "Misplaced digit separator in numeric literal.";
+};
+
+// Diagnostic: digit separators do not form the regular grouping expected for
+// the radix. Only decimal and hexadecimal radices are supported.
+struct IrregularDigitSeparators {
+  static constexpr llvm::StringLiteral ShortName =
+      "syntax-irregular-digit-separators";
+
+  struct Substitutions {
+    int radix;
+  };
+  static auto Format(const Substitutions& subst) -> std::string {
+    assert((subst.radix == 10 || subst.radix == 16) && "unexpected radix");
+    const bool is_decimal = subst.radix == 10;
+    const char* radix_name = is_decimal ? "decimal" : "hexadecimal";
+    const char* group_size = is_decimal ? "3" : "4";
+    return llvm::formatv(
+               "Digit separators in {0} number should appear every {1} "
+               "characters from the right.",
+               radix_name, group_size)
+        .str();
+  }
+};
+
+// Diagnostic: the literal starts with something that is not a known base
+// prefix.
+struct UnknownBaseSpecifier : SimpleDiagnostic<UnknownBaseSpecifier> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+  static constexpr llvm::StringLiteral Message =
+      "Unknown base specifier in numeric literal.";
+};
+
+// Diagnostic: a binary literal has a fractional part.
+struct BinaryRealLiteral : SimpleDiagnostic<BinaryRealLiteral> {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+  static constexpr llvm::StringLiteral Message =
+      "Binary real number literals are not supported.";
+};
+
+// Diagnostic: the exponent is introduced by the wrong letter; `expected` is
+// the letter that should have been used.
+struct WrongRealLiteralExponent {
+  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
+
+  struct Substitutions {
+    char expected;
+  };
+  static auto Format(const Substitutions& subst) -> std::string {
+    auto formatted =
+        llvm::formatv("Expected '{0}' to introduce exponent.", subst.expected);
+    return formatted.str();
+  }
+};
+}  // namespace
+
+// Returns true if `c` is an ASCII lowercase letter ('a' through 'z').
+static bool isLower(char c) { return c >= 'a' && c <= 'z'; }
+
+// Attempts to lex a numeric literal from the start of `source_text`.
+//
+// Returns None if the text is empty or does not begin with a decimal digit.
+// Otherwise returns a token whose text is the greedily-consumed prefix, with
+// radix_point and exponent holding the offsets of the '.' and of the candidate
+// exponent letter, or the token length when the respective part is absent.
+auto NumericLiteralToken::Lex(llvm::StringRef source_text)
+    -> llvm::Optional<NumericLiteralToken> {
+  NumericLiteralToken result;
+
+  if (source_text.empty() || !llvm::isDigit(source_text.front()))
+    return llvm::None;
+
+  bool seen_plus_minus = false;
+  bool seen_radix_point = false;
+  bool seen_potential_exponent = false;
+
+  // Greedily consume all following characters that might be part of a numeric
+  // literal. This allows us to produce better diagnostics on invalid literals.
+  //
+  // TODO(zygoloid): Update lexical rules to specify that a numeric literal
+  // cannot be immediately followed by an alphanumeric character.
+  int i = 1, n = source_text.size();
+  for (; i != n; ++i) {
+    char c = source_text[i];
+    if (llvm::isAlnum(c) || c == '_') {
+      // After the radix point, and until a sign has been consumed, every
+      // lowercase letter replaces the candidate exponent position, so the
+      // last such letter wins.
+      if (isLower(c) && seen_radix_point && !seen_plus_minus) {
+        result.exponent = i;
+        seen_potential_exponent = true;
+      }
+      continue;
+    }
+
+    // Exactly one `.` can be part of the literal, but only if it's followed by
+    // an alphanumeric character.
+    if (c == '.' && i + 1 != n && llvm::isAlnum(source_text[i + 1]) &&
+        !seen_radix_point) {
+      result.radix_point = i;
+      seen_radix_point = true;
+      continue;
+    }
+
+    // A `+` or `-` continues the literal only if it's preceded by a lowercase
+    // letter (which will be 'e' or 'p' or part of an invalid literal) and
+    // followed by an alphanumeric character. This '+' or '-' cannot be an
+    // operator because a literal cannot end in a lowercase letter.
+    //
+    // result.exponent is only read once seen_potential_exponent is true, i.e.
+    // after it has been assigned above.
+    if ((c == '+' || c == '-') && seen_potential_exponent &&
+        result.exponent == i - 1 && i + 1 != n &&
+        llvm::isAlnum(source_text[i + 1])) {
+      // This is not possible because we don't update result.exponent after we
+      // see a '+' or '-'.
+      assert(!seen_plus_minus && "should only consume one + or -");
+      seen_plus_minus = true;
+      continue;
+    }
+
+    // Any other character ends the literal.
+    break;
+  }
+
+  // Record the consumed text; absent parts are marked with the token length.
+  result.text = source_text.substr(0, i);
+  if (!seen_radix_point)
+    result.radix_point = i;
+  if (!seen_potential_exponent)
+    result.exponent = i;
+
+  return result;
+}
+
+// Splits the lexed literal into its integer, fractional and exponent parts,
+// detecting a "0x" or "0b" radix prefix and an exponent sign along the way.
+// No validation or diagnosis happens here.
+NumericLiteralToken::Parser::Parser(DiagnosticEmitter& emitter,
+                                    NumericLiteralToken literal)
+    : emitter(emitter), literal(literal) {
+  // Everything before the radix point, with any radix prefix stripped.
+  int_part = literal.text.substr(0, literal.radix_point);
+  if (int_part.consume_front("0x")) {
+    radix = 16;
+  } else if (int_part.consume_front("0b")) {
+    radix = 2;
+  }
+
+  // The text strictly between the radix point and the exponent letter.
+  // NOTE(review): when there is no radix point or exponent these offsets run
+  // past the end of the text; this appears to rely on llvm::StringRef::substr
+  // clamping out-of-range arguments to an empty result -- confirm.
+  fract_part = literal.text.substr(literal.radix_point + 1,
+                                   literal.exponent - literal.radix_point - 1);
+
+  // Everything after the exponent letter, minus one optional leading sign.
+  // A '-' is only looked for when no '+' was consumed.
+  exponent_part = literal.text.substr(literal.exponent + 1);
+  if (!exponent_part.consume_front("+")) {
+    exponent_is_negative = exponent_part.consume_front("-");
+  }
+}
+
+// Runs the validation passes in order (leading zero, integer part, fractional
+// part, exponent part), stopping at the first pass that fails outright.
+// Returns UnrecoverableError if any pass failed, RecoverableError if all
+// passed but a recovered error was recorded, and Valid otherwise.
+auto NumericLiteralToken::Parser::Check() -> CheckResult {
+  const bool all_passed = CheckLeadingZero() && CheckIntPart() &&
+                          CheckFractionalPart() && CheckExponentPart();
+  if (!all_passed) {
+    return UnrecoverableError;
+  }
+  if (recovered_from_error) {
+    return RecoverableError;
+  }
+  return Valid;
+}
+
+// Converts a digit string that is known to be a valid base-`radix` integer
+// into an APInt. When needs_cleaning is true, any '_' and '.' characters in
+// the input are dropped before conversion.
+//
+// Dropping '.' supports real literals: 123.456e7 is decomposed into an integer
+// mantissa (123456) and an effective exponent (7 - 3 = 4), and this routine is
+// handed "123.456" as the mantissa text.
+static auto ParseInteger(llvm::StringRef digits, int radix, bool needs_cleaning)
+    -> llvm::APInt {
+  llvm::SmallString<32> cleaned;
+  if (needs_cleaning) {
+    cleaned.reserve(digits.size());
+    for (char c : digits) {
+      if (c != '_' && c != '.') {
+        cleaned.push_back(c);
+      }
+    }
+    digits = cleaned;
+  }
+
+  llvm::APInt value;
+  // getAsInteger reports failure by returning true; the caller guarantees the
+  // digits were validated, so failure here indicates a bug.
+  const bool failed = digits.getAsInteger(radix, value);
+  if (failed) {
+    llvm_unreachable("should never fail");
+  }
+  return value;
+}
+
+// Returns the mantissa as an integer: the digits of the integer part and, for
+// a real literal, the fractional part as well (the text spanning both,
+// including the '.', is passed to ParseInteger).
+auto NumericLiteralToken::Parser::GetMantissa() -> llvm::APInt {
+  const char* first = int_part.begin();
+  const char* last;
+  if (IsInteger()) {
+    last = int_part.end();
+  } else {
+    last = fract_part.end();
+  }
+  return ParseInteger(llvm::StringRef(first, last - first), radix,
+                      mantissa_needs_cleaning);
+}
+
+// Returns the effective exponent: the written exponent (negated if it was
+// preceded by '-'), reduced by one per fractional digit, or by four per
+// fractional digit for hexadecimal literals. The result is at least 64 bits
+// wide and is widened where needed so the signed value is representable.
+auto NumericLiteralToken::Parser::GetExponent() -> llvm::APInt {
+  // Compute the effective exponent from the specified exponent, if any,
+  // and the position of the radix point.
+  llvm::APInt exponent(64, 0);
+  if (!exponent_part.empty()) {
+    exponent = ParseInteger(exponent_part, 10, exponent_needs_cleaning);
+
+    // The exponent is a signed integer, and the number we just parsed is
+    // non-negative, so ensure we have a wide enough representation to
+    // include a sign bit. Also make sure the exponent isn't too narrow so
+    // the calculation below can't lose information through overflow.
+    if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
+      exponent = exponent.zext(std::max(64u, exponent.getBitWidth() + 1));
+    }
+    if (exponent_is_negative) {
+      exponent.negate();
+    }
+  }
+
+  // Each character after the decimal point reduces the effective exponent.
+  int excess_exponent = fract_part.size();
+  if (radix == 16) {
+    excess_exponent *= 4;
+  }
+  exponent -= excess_exponent;
+  if (exponent_is_negative && !exponent.isNegative()) {
+    // We overflowed. Note that we can only overflow by a little, and only
+    // from negative to positive, because exponent is at least 64 bits wide
+    // and excess_exponent is bounded above by four times the size of the
+    // input buffer, which we assume fits into 32 bits.
+    exponent = exponent.zext(exponent.getBitWidth() + 1);
+    exponent.setSignBit();
+  }
+  return exponent;
+}
+
+// Check that a digit sequence is valid: that it contains one or more digits,
+// contains only digits in the specified base, and that any digit separators
+// are correctly positioned.
+//
+// `text` is the digit sequence, `radix` is 2, 10 or 16, and
+// `allow_digit_separators` says whether '_' is permitted at all.
+//
+// Misplaced or irregular separators are diagnosed and recorded as recovered
+// errors (the result is still ok). An invalid digit or a sequence with no
+// digits is diagnosed and yields ok == false.
+auto NumericLiteralToken::Parser::CheckDigitSequence(
+    llvm::StringRef text, int radix, bool allow_digit_separators)
+    -> CheckDigitSequenceResult {
+  assert((radix == 2 || radix == 10 || radix == 16) && "unknown radix");
+
+  // Build the set of accepted digit characters. Iterate a StringRef rather
+  // than the string literal itself: a range-for over a literal also visits
+  // its terminating NUL, which would mark '\0' as a valid digit.
+  llvm::StringRef digit_chars = radix == 2    ? "01"
+                                : radix == 10 ? "0123456789"
+                                              : "0123456789ABCDEF";
+  std::bitset<256> valid_digits;
+  for (char c : digit_chars)
+    valid_digits[static_cast<unsigned char>(c)] = true;
+
+  int num_digit_separators = 0;
+
+  for (int i = 0, n = text.size(); i != n; ++i) {
+    char c = text[i];
+    if (valid_digits[static_cast<unsigned char>(c)]) {
+      continue;
+    }
+
+    if (c == '_') {
+      // A digit separator cannot appear at the start of a digit sequence,
+      // next to another digit separator, or at the end.
+      if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
+          i + 1 == n) {
+        emitter.EmitError<InvalidDigitSeparator>();
+        recovered_from_error = true;
+      }
+      ++num_digit_separators;
+      continue;
+    }
+
+    // Anything else is not a digit in this radix; give up on the literal.
+    emitter.EmitError<InvalidDigit>({.digit = c, .radix = radix});
+    return {.ok = false};
+  }
+
+  // If every character was a separator (or the text is empty), there are no
+  // digits at all.
+  if (num_digit_separators == static_cast<int>(text.size())) {
+    emitter.EmitError<EmptyDigitSequence>();
+    return {.ok = false};
+  }
+
+  // Check that digit separators occur in exactly the expected positions.
+  // Binary literals are exempt from the regular-grouping check.
+  if (num_digit_separators && radix != 2)
+    CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
+
+  return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
+}
+
+// Given a decimal or hexadecimal digit sequence containing digit separators,
+// check that the separators are regularly positioned, and diagnose (as a
+// recovered error) if they are not.
+//
+// `num_digit_separators` must equal the number of '_' characters in `text`.
+auto NumericLiteralToken::Parser::CheckDigitSeparatorPlacement(
+    llvm::StringRef text, int radix, int num_digit_separators) -> void {
+  assert((radix == 10 || radix == 16) &&
+         "unexpected radix for digit separator checks");
+  assert(std::count(text.begin(), text.end(), '_') == num_digit_separators &&
+         "given wrong number of digit separators");
+
+  auto diagnose_irregular_digit_separators = [&] {
+    emitter.EmitError<IrregularDigitSeparators>({.radix = radix});
+    recovered_from_error = true;
+  };
+
+  // For decimal and hexadecimal digit sequences, digit separators must form
+  // groups of 3 or 4 digits (4 or 5 characters), respectively.
+  int stride = (radix == 10 ? 4 : 5);
+  int remaining_digit_separators = num_digit_separators;
+  // Step backwards from the end one group at a time; each landing position
+  // must hold a separator.
+  for (auto pos = text.end(); pos - text.begin() >= stride; /*in loop*/) {
+    pos -= stride;
+    if (*pos != '_')
+      return diagnose_irregular_digit_separators();
+
+    --remaining_digit_separators;
+  }
+
+  // Check there weren't any other digit separators.
+  if (remaining_digit_separators)
+    diagnose_irregular_digit_separators();
+}
+
+// Rejects a decimal integer part that starts with '0' unless it is exactly
+// "0". Such a literal is diagnosed as having an unknown base specifier.
+auto NumericLiteralToken::Parser::CheckLeadingZero() -> bool {
+  const bool has_zero_prefix = radix == 10 && int_part.startswith("0");
+  if (!has_zero_prefix || int_part == "0") {
+    return true;
+  }
+  emitter.EmitError<UnknownBaseSpecifier>();
+  return false;
+}
+
+// Validates the integer part (the digits before the '.', if any). If it
+// contains digit separators, the mantissa is flagged as needing cleaning.
+auto NumericLiteralToken::Parser::CheckIntPart() -> bool {
+  const CheckDigitSequenceResult result = CheckDigitSequence(int_part, radix);
+  if (result.has_digit_separators) {
+    mantissa_needs_cleaning = true;
+  }
+  return result.ok;
+}
+
+// Validates the fractional part (after the '.' and before the exponent, if
+// any). Integer literals pass trivially. Digit separators are not allowed in
+// the fractional part.
+auto NumericLiteralToken::Parser::CheckFractionalPart() -> bool {
+  if (IsInteger()) {
+    return true;
+  }
+
+  // Binary real literals are diagnosed, but we carry on and parse them
+  // anyway for error recovery.
+  if (radix == 2) {
+    emitter.EmitError<BinaryRealLiteral>();
+    recovered_from_error = true;
+  }
+
+  // The mantissa text spans the '.', which must be removed before parsing.
+  mantissa_needs_cleaning = true;
+
+  const CheckDigitSequenceResult fract_result = CheckDigitSequence(
+      fract_part, radix, /*allow_digit_separators=*/false);
+  return fract_result.ok;
+}
+
+// Validates the exponent part, if any. The exponent must be introduced by 'e'
+// for decimal literals and by 'p' otherwise, and its digits are always
+// checked as decimal.
+auto NumericLiteralToken::Parser::CheckExponentPart() -> bool {
+  // An exponent offset equal to the token length means there is no exponent.
+  const int text_size = static_cast<int>(literal.text.size());
+  if (literal.exponent == text_size) {
+    return true;
+  }
+
+  const char expected_letter = radix == 10 ? 'e' : 'p';
+  if (literal.text[literal.exponent] == expected_letter) {
+    const CheckDigitSequenceResult result =
+        CheckDigitSequence(exponent_part, 10);
+    exponent_needs_cleaning = result.has_digit_separators;
+    return result.ok;
+  }
+
+  emitter.EmitError<WrongRealLiteralExponent>({.expected = expected_letter});
+  return false;
+}
+
+}  // namespace Carbon

+ 127 - 0
lexer/numeric_literal.h

@@ -0,0 +1,127 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef LEXER_NUMERIC_LITERAL_H_
+#define LEXER_NUMERIC_LITERAL_H_
+
+#include <utility>
+
+#include "diagnostics/diagnostic_emitter.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace Carbon {
+
+// A numeric literal token that has been extracted from a source buffer.
+//
+// Instances are only created by Lex (the constructor is private). The token
+// records the text of the literal and the offsets of its radix point and
+// exponent; interpreting and validating it is the job of the nested Parser.
+class NumericLiteralToken {
+ public:
+  // Get the text corresponding to this literal.
+  llvm::StringRef Text() const { return text; }
+
+  // Extract a numeric literal from the given text, if it has a suitable form.
+  // Returns None when the text does not start with a numeric literal.
+  static auto Lex(llvm::StringRef source_text)
+      -> llvm::Optional<NumericLiteralToken>;
+
+  class Parser;
+
+ private:
+  // Leaves radix_point and exponent uninitialized; Lex assigns both before
+  // returning a token.
+  NumericLiteralToken() {}
+
+  // The text of the token.
+  llvm::StringRef text;
+
+  // The offset of the '.'. Set to text.size() if none is present.
+  int radix_point;
+
+  // The offset of the alphabetical character introducing the exponent. In a
+  // valid literal, this will be an 'e' or a 'p', and may be followed by a '+'
+  // or a '-', but for error recovery, this may simply be the last lowercase
+  // letter in the invalid token. Always greater than or equal to radix_point.
+  // Set to text.size() if none is present.
+  int exponent;
+};
+
+// Parser for numeric literal tokens.
+//
+// Responsible for checking that a numeric literal is valid and meaningful and
+// either diagnosing or extracting its meaning.
+class NumericLiteralToken::Parser {
+ public:
+  // Splits `literal` into its components. Diagnostics produced by Check are
+  // reported through `emitter`, which is held by reference.
+  Parser(DiagnosticEmitter& emitter, NumericLiteralToken literal);
+
+  // Returns true if the literal has no radix point.
+  auto IsInteger() const -> bool {
+    return literal.radix_point == static_cast<int>(literal.text.size());
+  }
+
+  enum CheckResult {
+    // The token is valid.
+    Valid,
+    // The token is invalid, but we've diagnosed and recovered from the error.
+    RecoverableError,
+    // The token is invalid, and we've diagnosed, but we can't assign meaning
+    // to it.
+    UnrecoverableError,
+  };
+
+  // Check that the numeric literal token is syntactically valid and
+  // meaningful, and diagnose if not.
+  auto Check() -> CheckResult;
+
+  // Get the radix of this token. One of 2, 10, or 16.
+  auto GetRadix() const -> int { return radix; }
+
+  // Get the mantissa of this token's value.
+  auto GetMantissa() -> llvm::APInt;
+
+  // Get the exponent of this token's value. This is always zero for an integer
+  // literal.
+  auto GetExponent() -> llvm::APInt;
+
+ private:
+  // Outcome of checking one digit sequence.
+  struct CheckDigitSequenceResult {
+    // False if the sequence could not be given a meaning.
+    bool ok;
+    // True if the sequence contained at least one '_'.
+    bool has_digit_separators = false;
+  };
+
+  auto CheckDigitSequence(llvm::StringRef text, int radix,
+                          bool allow_digit_separators = true)
+      -> CheckDigitSequenceResult;
+  auto CheckDigitSeparatorPlacement(llvm::StringRef text, int radix,
+                                    int num_digit_separators) -> void;
+  auto CheckLeadingZero() -> bool;
+  auto CheckIntPart() -> bool;
+  auto CheckFractionalPart() -> bool;
+  auto CheckExponentPart() -> bool;
+
+  DiagnosticEmitter& emitter;
+  NumericLiteralToken literal;
+
+  // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
+  // or '0x', respectively.
+  int radix = 10;
+
+  // The various components of a numeric literal:
+  //
+  //     [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
+  llvm::StringRef int_part;
+  llvm::StringRef fract_part;
+  llvm::StringRef exponent_part;
+
+  // Do we need to remove any special characters (digit separator or radix
+  // point) before interpreting the mantissa or exponent as an integer?
+  bool mantissa_needs_cleaning = false;
+  bool exponent_needs_cleaning = false;
+
+  // True if we found a `-` before `exponent_part`.
+  bool exponent_is_negative = false;
+
+  // True if we produced an error but recovered.
+  bool recovered_from_error = false;
+};
+
+}  // namespace Carbon
+
+#endif  // LEXER_NUMERIC_LITERAL_H_

+ 261 - 0
lexer/numeric_literal_test.cpp

@@ -0,0 +1,261 @@
+// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
+// Exceptions. See /LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "lexer/numeric_literal.h"
+
+#include <iterator>
+
+#include "diagnostics/diagnostic_emitter.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace Carbon {
+namespace {
+
+// Test fixture providing helpers to lex and parse a literal from text.
+struct NumericLiteralTest : ::testing::Test {
+  // Lexes `text` as a numeric literal, asserting that lexing succeeds and
+  // expecting that the token covers the whole of `text`.
+  auto Lex(llvm::StringRef text) -> NumericLiteralToken {
+    llvm::Optional<NumericLiteralToken> result = NumericLiteralToken::Lex(text);
+    assert(result);
+    EXPECT_EQ(result->Text(), text);
+    return *result;
+  }
+
+  // Lexes `text` and wraps the token in a Parser that reports through
+  // ConsoleDiagnosticEmitter().
+  auto Parse(llvm::StringRef text) -> NumericLiteralToken::Parser {
+    return NumericLiteralToken::Parser(ConsoleDiagnosticEmitter(), Lex(text));
+  }
+};
+
+// Valid integer literals in each radix parse to the expected value, with a
+// zero exponent and the expected radix.
+TEST_F(NumericLiteralTest, HandlesIntegerLiteral) {
+  struct Testcase {
+    llvm::StringLiteral token;
+    uint64_t value;
+    int radix;
+  };
+  Testcase testcases[] = {
+      {.token = "12", .value = 12, .radix = 10},
+      {.token = "0x12_3ABC", .value = 0x12'3ABC, .radix = 16},
+      {.token = "0b10_10_11", .value = 0b10'10'11, .radix = 2},
+      {.token = "1_234_567", .value = 1'234'567, .radix = 10},
+  };
+  for (Testcase testcase : testcases) {
+    auto parser = Parse(testcase.token);
+    EXPECT_EQ(parser.Check(), parser.Valid) << testcase.token;
+    EXPECT_EQ(parser.IsInteger(), true);
+    EXPECT_EQ(parser.GetMantissa().getZExtValue(), testcase.value);
+    EXPECT_EQ(parser.GetExponent().getSExtValue(), 0);
+    EXPECT_EQ(parser.GetRadix(), testcase.radix);
+  }
+}
+
+// Literals with a recognized (or no) base prefix and valid digits check as
+// Valid; unknown prefixes, leading zeros, missing digits and out-of-radix
+// digits are unrecoverable errors.
+TEST_F(NumericLiteralTest, ValidatesBaseSpecifier) {
+  llvm::StringLiteral valid[] = {
+      // Decimal integer literals.
+      "0",
+      "1",
+      "123456789000000000000000000000000000000000000",
+
+      // Hexadecimal integer literals.
+      "0x0123456789ABCDEF",
+      "0x0000000000000000000000000000000",
+
+      // Binary integer literals.
+      "0b10110100101001010",
+      "0b0000000",
+  };
+  for (llvm::StringLiteral literal : valid) {
+    auto parser = Parse(literal);
+    EXPECT_EQ(parser.Check(), parser.Valid) << literal;
+  }
+
+  llvm::StringLiteral invalid[] = {
+      "00",  "0X123",    "0o123",          "0B1",
+      "007", "123L",     "123456789A",     "0x",
+      "0b",  "0x123abc", "0b011101201001", "0b10A",
+      "0x_", "0b_",
+  };
+  for (llvm::StringLiteral literal : invalid) {
+    auto parser = Parse(literal);
+    EXPECT_EQ(parser.Check(), parser.UnrecoverableError) << literal;
+  }
+}
+
+// Regularly placed digit separators check as Valid; misplaced or irregular
+// separators are diagnosed but recoverable.
+TEST_F(NumericLiteralTest, ValidatesIntegerDigitSeparators) {
+  llvm::StringLiteral valid[] = {
+      // Decimal literals optionally have digit separators every 3 places.
+      "1_234",
+      "123_456",
+      "1_234_567",
+
+      // Hexadecimal literals optionally have digit separators every 4 places.
+      "0x1_0000",
+      "0x1000_0000",
+      "0x1_0000_0000",
+
+      // Binary integer literals can have digit separators anywhere.
+      "0b1_0_1_0_1_0",
+      "0b111_0000",
+  };
+  for (llvm::StringLiteral literal : valid) {
+    auto parser = Parse(literal);
+    EXPECT_EQ(parser.Check(), parser.Valid) << literal;
+  }
+
+  llvm::StringLiteral invalid[] = {
+      // Decimal literals.
+      "12_34",
+      "123_4_6_789",
+      "12_3456_789",
+      "12__345",
+      "1_",
+
+      // Hexadecimal literals.
+      "0x_1234",
+      "0x123_",
+      "0x12_3",
+      "0x_234_5678",
+      "0x1234_567",
+
+      // Binary literals.
+      "0b_10101",
+      "0b1__01",
+      "0b1011_",
+      "0b1_01_01_",
+  };
+  for (llvm::StringLiteral literal : invalid) {
+    auto parser = Parse(literal);
+    EXPECT_EQ(parser.Check(), parser.RecoverableError) << literal;
+  }
+}
+
+// Real literals decompose into the expected integer mantissa, effective
+// exponent and radix. Binary real literals are diagnosed but still parsed.
+TEST_F(NumericLiteralTest, HandlesRealLiteral) {
+  struct Testcase {
+    llvm::StringLiteral token;
+    uint64_t mantissa;
+    int64_t exponent;
+    // Signed, to match the return type of Parser::GetRadix and avoid a
+    // signed/unsigned comparison in EXPECT_EQ below.
+    int radix;
+  };
+  Testcase testcases[] = {
+      // Decimal real literals.
+      {.token = "0.0", .mantissa = 0, .exponent = -1, .radix = 10},
+      {.token = "12.345", .mantissa = 12345, .exponent = -3, .radix = 10},
+      {.token = "12.345e6", .mantissa = 12345, .exponent = 3, .radix = 10},
+      {.token = "12.345e+6", .mantissa = 12345, .exponent = 3, .radix = 10},
+      {.token = "1_234.5e-2", .mantissa = 12345, .exponent = -3, .radix = 10},
+      {.token = "1.0e-2_000_000",
+       .mantissa = 10,
+       .exponent = -2'000'001,
+       .radix = 10},
+
+      // Hexadecimal real literals.
+      {.token = "0x1_2345_6789.CDEF",
+       .mantissa = 0x1'2345'6789'CDEF,
+       .exponent = -16,
+       .radix = 16},
+      {.token = "0x0.0001p4", .mantissa = 1, .exponent = -12, .radix = 16},
+      {.token = "0x0.0001p+4", .mantissa = 1, .exponent = -12, .radix = 16},
+      {.token = "0x0.0001p-4", .mantissa = 1, .exponent = -20, .radix = 16},
+      // The exponent here works out as exactly INT64_MIN.
+      {.token = "0x1.01p-9223372036854775800",
+       .mantissa = 0x101,
+       .exponent = -9223372036854775807L - 1L,
+       .radix = 16},
+      // The exponent here doesn't fit in a signed 64-bit integer until we
+      // adjust for the radix point.
+      {.token = "0x1.01p9223372036854775809",
+       .mantissa = 0x101,
+       .exponent = 9223372036854775801L,
+       .radix = 16},
+
+      // Binary real literals. These are invalid, but we accept them for error
+      // recovery.
+      {.token = "0b10_11_01.01",
+       .mantissa = 0b10110101,
+       .exponent = -2,
+       .radix = 2},
+  };
+  for (Testcase testcase : testcases) {
+    auto parser = Parse(testcase.token);
+    EXPECT_EQ(parser.Check(),
+              testcase.radix == 2 ? parser.RecoverableError : parser.Valid)
+        << testcase.token;
+    EXPECT_EQ(parser.IsInteger(), false);
+    EXPECT_EQ(parser.GetMantissa().getZExtValue(), testcase.mantissa);
+    EXPECT_EQ(parser.GetExponent().getSExtValue(), testcase.exponent);
+    EXPECT_EQ(parser.GetRadix(), testcase.radix);
+  }
+}
+
+// The six hexadecimal fractional digits lower the written exponent by 24,
+// taking it below INT64_MIN. The test adds the written magnitude back before
+// narrowing, so the remaining -24 can be checked as a 64-bit value.
+TEST_F(NumericLiteralTest, HandlesRealLiteralOverflow) {
+  llvm::StringLiteral input = "0x1.000001p-9223372036854775800";
+  auto parser = Parse(input);
+  EXPECT_EQ(parser.Check(), parser.Valid);
+  EXPECT_EQ(parser.GetMantissa(), 0x1000001);
+  EXPECT_EQ((parser.GetExponent() + 9223372036854775800).getSExtValue(), -24);
+  EXPECT_EQ(parser.GetRadix(), 16);
+}
+
+// Real literals with bad digit separators are recoverable errors; missing
+// digits, invalid digits and wrong exponent letters are unrecoverable.
+TEST_F(NumericLiteralTest, ValidatesRealLiterals) {
+  llvm::StringLiteral invalid_digit_separators[] = {
+      // Invalid digit separators.
+      "12_34.5",     "123.4_567", "123.456_7", "1_2_3.4",
+      "123.4e56_78", "0x12_34.5", "0x12.3_4",  "0x12.34p5_6",
+  };
+  for (llvm::StringLiteral literal : invalid_digit_separators) {
+    auto parser = Parse(literal);
+    EXPECT_EQ(parser.Check(), parser.RecoverableError) << literal;
+  }
+
+  llvm::StringLiteral invalid[] = {
+      // No digits in integer part.
+      "0x.0",
+      "0b.0",
+      "0x_.0",
+      "0b_.0",
+
+      // No digits in fractional part.
+      "0.e",
+      "0.e0",
+      "0.e+0",
+      "0x0.p",
+      "0x0.p-0",
+
+      // Invalid digits in mantissa.
+      "123A.4",
+      "123.4A",
+      "123A.4e0",
+      "123.4Ae0",
+      "0x123ABCDEFG.0",
+      "0x123.ABCDEFG",
+      "0x123ABCDEFG.0p0",
+      "0x123.ABCDEFGp0",
+
+      // Invalid exponent letter.
+      "0.0f0",
+      "0.0p0",
+      "0.0z+0",
+      "0x0.0e0",
+      "0x0.0f0",
+      "0x0.0z-0",
+
+      // No digits in exponent part.
+      "0.0e",
+      "0x0.0p",
+      "0.0e_",
+      "0x0.0p_",
+
+      // Invalid digits in exponent part.
+      "0.0eHELLO",
+      "0.0eA",
+      "0.0e+A",
+      "0x0.0pA",
+      "0x0.0p-A",
+  };
+  for (llvm::StringLiteral literal : invalid) {
+    auto parser = Parse(literal);
+    EXPECT_EQ(parser.Check(), parser.UnrecoverableError) << literal;
+  }
+}
+
+}  // namespace
+}  // namespace Carbon

+ 16 - 456
lexer/tokenized_buffer.cpp

@@ -5,12 +5,11 @@
 #include "lexer/tokenized_buffer.h"
 
 #include <algorithm>
-#include <bitset>
 #include <cmath>
 #include <iterator>
 #include <string>
 
-#include "llvm/ADT/StringExtras.h"
+#include "lexer/numeric_literal.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
@@ -46,78 +45,6 @@ struct MismatchedClosing : SimpleDiagnostic<MismatchedClosing> {
       "Closing symbol does not match most recent opening symbol.";
 };
 
-struct EmptyDigitSequence : SimpleDiagnostic<EmptyDigitSequence> {
-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
-  static constexpr llvm::StringLiteral Message =
-      "Empty digit sequence in numeric literal.";
-};
-
-struct InvalidDigit {
-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
-
-  struct Substitutions {
-    char digit;
-    int radix;
-  };
-  static auto Format(const Substitutions& subst) -> std::string {
-    return llvm::formatv("Invalid digit '{0}' in {1} numeric literal.",
-                         subst.digit,
-                         (subst.radix == 2
-                              ? "binary"
-                              : subst.radix == 16 ? "hexadecimal" : "decimal"))
-        .str();
-  }
-};
-
-struct InvalidDigitSeparator : SimpleDiagnostic<InvalidDigitSeparator> {
-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
-  static constexpr llvm::StringLiteral Message =
-      "Misplaced digit separator in numeric literal.";
-};
-
-struct IrregularDigitSeparators {
-  static constexpr llvm::StringLiteral ShortName =
-      "syntax-irregular-digit-separators";
-
-  struct Substitutions {
-    int radix;
-  };
-  static auto Format(const Substitutions& subst) -> std::string {
-    assert((subst.radix == 10 || subst.radix == 16) && "unexpected radix");
-    return llvm::formatv(
-               "Digit separators in {0} number should appear every {1} "
-               "characters from the right.",
-               (subst.radix == 10 ? "decimal" : "hexadecimal"),
-               (subst.radix == 10 ? "3" : "4"))
-        .str();
-  }
-};
-
-struct UnknownBaseSpecifier : SimpleDiagnostic<UnknownBaseSpecifier> {
-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
-  static constexpr llvm::StringLiteral Message =
-      "Unknown base specifier in numeric literal.";
-};
-
-struct BinaryRealLiteral : SimpleDiagnostic<BinaryRealLiteral> {
-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
-  static constexpr llvm::StringLiteral Message =
-      "Binary real number literals are not supported.";
-};
-
-struct WrongRealLiteralExponent {
-  static constexpr llvm::StringLiteral ShortName = "syntax-invalid-number";
-
-  struct Substitutions {
-    char expected;
-  };
-  static auto Format(const Substitutions& subst) -> std::string {
-    return llvm::formatv("Expected '{0}' to introduce exponent.",
-                         subst.expected)
-        .str();
-  }
-};
-
 struct UnrecognizedCharacters : SimpleDiagnostic<UnrecognizedCharacters> {
   static constexpr llvm::StringLiteral ShortName =
       "syntax-unrecognized-characters";
@@ -129,378 +56,6 @@ struct UnrecognizedCharacters : SimpleDiagnostic<UnrecognizedCharacters> {
 // acceptable whitespace.
 static bool isSpace(char c) { return c == ' ' || c == '\n' || c == '\t'; }
 
-static bool isLower(char c) { return 'a' <= c && c <= 'z'; }
-
-namespace {
-struct NumericLiteral {
-  llvm::StringRef text;
-
-  // The offset of the '.'. Set to text.size() if none is present.
-  int radix_point;
-
-  // The offset of the alphabetical character introducing the exponent. In a
-  // valid literal, this will be an 'e' or a 'p', and may be followed by a '+'
-  // or a '-', but for error recovery, this may simply be the last lowercase
-  // letter in the invalid token. Always greater than or equal to radix_point.
-  // Set to text.size() if none is present.
-  int exponent;
-};
-}  // namespace
-
-static auto TakeLeadingNumericLiteral(llvm::StringRef source_text)
-    -> NumericLiteral {
-  NumericLiteral result;
-
-  if (source_text.empty() || !llvm::isDigit(source_text.front()))
-    return result;
-
-  bool seen_plus_minus = false;
-  bool seen_radix_point = false;
-  bool seen_potential_exponent = false;
-
-  // Greedily consume all following characters that might be part of a numeric
-  // literal. This allows us to produce better diagnostics on invalid literals.
-  //
-  // TODO(zygoloid): Update lexical rules to specify that a numeric literal
-  // cannot be immediately followed by an alphanumeric character.
-  int i = 1, n = source_text.size();
-  for (; i != n; ++i) {
-    char c = source_text[i];
-    if (llvm::isAlnum(c) || c == '_') {
-      if (isLower(c) && seen_radix_point && !seen_plus_minus) {
-        result.exponent = i;
-        seen_potential_exponent = true;
-      }
-      continue;
-    }
-
-    // Exactly one `.` can be part of the literal, but only if it's followed by
-    // an alphanumeric character.
-    if (c == '.' && i + 1 != n && llvm::isAlnum(source_text[i + 1]) &&
-        !seen_radix_point) {
-      result.radix_point = i;
-      seen_radix_point = true;
-      continue;
-    }
-
-    // A `+` or `-` continues the literal only if it's preceded by a lowercase
-    // letter (which will be 'e' or 'p' or part of an invalid literal) and
-    // followed by an alphanumeric character. This '+' or '-' cannot be an
-    // operator because a literal cannot end in a lowercase letter.
-    if ((c == '+' || c == '-') && seen_potential_exponent &&
-        result.exponent == i - 1 && i + 1 != n &&
-        llvm::isAlnum(source_text[i + 1])) {
-      // A second '+' or '-' cannot reach this point because result.exponent is
-      // no longer updated once seen_plus_minus is set.
-      assert(!seen_plus_minus && "should only consume one + or -");
-      seen_plus_minus = true;
-      continue;
-    }
-
-    break;
-  }
-
-  result.text = source_text.substr(0, i);
-  if (!seen_radix_point)
-    result.radix_point = i;
-  if (!seen_potential_exponent)
-    result.exponent = i;
-
-  return result;
-}
-
-namespace {
-// Parser for numeric literal tokens.
-//
-// Responsible for checking that a numeric literal is valid and meaningful and
-// either diagnosing or extracting its meaning.
-class NumericLiteralParser {
- public:
-  NumericLiteralParser(DiagnosticEmitter& emitter, NumericLiteral literal)
-      : emitter(emitter), literal(literal) {
-    int_part = literal.text.substr(0, literal.radix_point);
-    if (int_part.consume_front("0x")) {
-      radix = 16;
-    } else if (int_part.consume_front("0b")) {
-      radix = 2;
-    }
-
-    fract_part = literal.text.substr(
-        literal.radix_point + 1, literal.exponent - literal.radix_point - 1);
-
-    exponent_part = literal.text.substr(literal.exponent + 1);
-    if (!exponent_part.consume_front("+")) {
-      exponent_is_negative = exponent_part.consume_front("-");
-    }
-  }
-
-  auto IsInteger() -> bool {
-    return literal.radix_point == static_cast<int>(literal.text.size());
-  }
-
-  enum CheckResult {
-    // The token is valid.
-    Valid,
-    // The token is invalid, but we've diagnosed and recovered from the error.
-    RecoverableError,
-    // The token is invalid, and we've diagnosed, but we can't assign meaning
-    // to it.
-    UnrecoverableError,
-  };
-
-  // Check that the numeric literal token is syntactically valid and
-  // meaningful, and diagnose if not.
-  auto Check() -> CheckResult {
-    if (!CheckLeadingZero() || !CheckIntPart() || !CheckFractionalPart() ||
-        !CheckExponentPart())
-      return UnrecoverableError;
-    return recovered_from_error ? RecoverableError : Valid;
-  }
-
-  auto GetMantissa() -> llvm::APInt {
-    const char* end = IsInteger() ? int_part.end() : fract_part.end();
-    llvm::StringRef digits(int_part.begin(), end - int_part.begin());
-    return ParseInteger(digits, radix, mantissa_needs_cleaning);
-  }
-
-  auto GetExponent() -> llvm::APInt {
-    // Compute the effective exponent from the specified exponent, if any,
-    // and the position of the radix point.
-    llvm::APInt exponent(64, 0);
-    if (!exponent_part.empty()) {
-      exponent = ParseInteger(exponent_part, 10, exponent_needs_cleaning);
-
-      // The exponent is a signed integer, and the number we just parsed is
-      // non-negative, so ensure we have a wide enough representation to
-      // include a sign bit. Also make sure the exponent isn't too narrow so
-      // the calculation below can't lose information through overflow.
-      if (exponent.isSignBitSet() || exponent.getBitWidth() < 64) {
-        exponent = exponent.zext(std::max(64u, exponent.getBitWidth() + 1));
-      }
-      if (exponent_is_negative) {
-        exponent.negate();
-      }
-    }
-
-    // Each character after the decimal point reduces the effective exponent.
-    int excess_exponent = fract_part.size();
-    if (radix == 16) {
-      excess_exponent *= 4;
-    }
-    exponent -= excess_exponent;
-    if (exponent_is_negative && !exponent.isNegative()) {
-      // We overflowed. Note that we can only overflow by a little, and only
-      // from negative to positive, because exponent is at least 64 bits wide
-      // and excess_exponent is bounded above by four times the size of the
-      // input buffer, which we assume fits into 32 bits.
-      exponent = exponent.zext(exponent.getBitWidth() + 1);
-      exponent.setSignBit();
-    }
-    return exponent;
-  }
-
- private:
-  struct CheckDigitSequenceResult {
-    bool ok;
-    bool has_digit_separators = false;
-  };
-
-  // Check that a digit sequence is valid: that it contains one or more digits,
-  // contains only digits in the specified base, and that any digit separators
-  // that are present are correctly positioned.
-  auto CheckDigitSequence(llvm::StringRef text, int radix,
-                          bool allow_digit_separators = true)
-      -> CheckDigitSequenceResult {
-    assert((radix == 2 || radix == 10 || radix == 16) && "unknown radix");
-
-    std::bitset<256> valid_digits;
-    if (radix == 2) {
-      for (char c : "01")
-        valid_digits[static_cast<unsigned char>(c)] = true;
-    } else if (radix == 10) {
-      for (char c : "0123456789")
-        valid_digits[static_cast<unsigned char>(c)] = true;
-    } else {
-      for (char c : "0123456789ABCDEF")
-        valid_digits[static_cast<unsigned char>(c)] = true;
-    }
-
-    int num_digit_separators = 0;
-
-    for (int i = 0, n = text.size(); i != n; ++i) {
-      char c = text[i];
-      if (valid_digits[static_cast<unsigned char>(c)]) {
-        continue;
-      }
-
-      if (c == '_') {
-        // A digit separator cannot appear at the start of a digit sequence,
-        // next to another digit separator, or at the end.
-        if (!allow_digit_separators || i == 0 || text[i - 1] == '_' ||
-            i + 1 == n) {
-          emitter.EmitError<InvalidDigitSeparator>();
-          recovered_from_error = true;
-        }
-        ++num_digit_separators;
-        continue;
-      }
-
-      emitter.EmitError<InvalidDigit>({.digit = c, .radix = radix});
-      return {.ok = false};
-    }
-
-    if (num_digit_separators == static_cast<int>(text.size())) {
-      emitter.EmitError<EmptyDigitSequence>();
-      return {.ok = false};
-    }
-
-    // Check that digit separators occur in exactly the expected positions.
-    if (num_digit_separators && radix != 2)
-      CheckDigitSeparatorPlacement(text, radix, num_digit_separators);
-
-    return {.ok = true, .has_digit_separators = (num_digit_separators != 0)};
-  }
-
-  // Given a number with digit separators, check that the digit separators are
-  // correctly positioned.
-  auto CheckDigitSeparatorPlacement(llvm::StringRef text, int radix,
-                                    int num_digit_separators) -> void {
-    assert((radix == 10 || radix == 16) &&
-           "unexpected radix for digit separator checks");
-    assert(std::count(text.begin(), text.end(), '_') == num_digit_separators &&
-           "given wrong number of digit separators");
-
-    auto diagnose_irregular_digit_separators = [&] {
-      emitter.EmitError<IrregularDigitSeparators>({.radix = radix});
-      recovered_from_error = true;
-    };
-
-    // For decimal and hexadecimal digit sequences, digit separators must form
-    // groups of 3 or 4 digits (4 or 5 characters), respectively.
-    int stride = (radix == 10 ? 4 : 5);
-    int remaining_digit_separators = num_digit_separators;
-    for (auto pos = text.end(); pos - text.begin() >= stride; /*in loop*/) {
-      pos -= stride;
-      if (*pos != '_')
-        return diagnose_irregular_digit_separators();
-
-      --remaining_digit_separators;
-    }
-
-    // Check there weren't any other digit separators.
-    if (remaining_digit_separators)
-      diagnose_irregular_digit_separators();
-  };
-
-  // Check that we don't have a '0' prefix on a non-zero decimal integer.
-  auto CheckLeadingZero() -> bool {
-    if (radix == 10 && int_part.startswith("0") && int_part != "0") {
-      emitter.EmitError<UnknownBaseSpecifier>();
-      return false;
-    }
-    return true;
-  }
-
-  // Check the integer part (before the '.', if any) is valid.
-  auto CheckIntPart() -> bool {
-    auto int_result = CheckDigitSequence(int_part, radix);
-    mantissa_needs_cleaning |= int_result.has_digit_separators;
-    return int_result.ok;
-  }
-
-  // Check the fractional part (after the '.' and before the exponent, if any)
-  // is valid.
-  auto CheckFractionalPart() -> bool {
-    if (IsInteger()) {
-      return true;
-    }
-
-    if (radix == 2) {
-      emitter.EmitError<BinaryRealLiteral>();
-      recovered_from_error = true;
-      // Carry on and parse the binary real literal anyway.
-    }
-
-    // We need to remove a '.' from the mantissa.
-    mantissa_needs_cleaning = true;
-
-    return CheckDigitSequence(fract_part, radix,
-                              /*allow_digit_separators=*/false)
-        .ok;
-  }
-
-  // Check the exponent part (if any) is valid.
-  auto CheckExponentPart() -> bool {
-    if (literal.exponent == static_cast<int>(literal.text.size())) {
-      return true;
-    }
-
-    char expected_exponent_kind = (radix == 10 ? 'e' : 'p');
-    if (literal.text[literal.exponent] != expected_exponent_kind) {
-      emitter.EmitError<WrongRealLiteralExponent>(
-          {.expected = expected_exponent_kind});
-      return false;
-    }
-
-    auto exponent_result = CheckDigitSequence(exponent_part, 10);
-    exponent_needs_cleaning = exponent_result.has_digit_separators;
-    return exponent_result.ok;
-  }
-
-  // Parse a string that is known to be a valid base-radix integer into an
-  // APInt.  If needs_cleaning is true, the string may additionally contain '_'
-  // and '.' characters that should be ignored.
-  //
-  // Ignoring '.' is used when parsing a real literal. For example, when
-  // parsing 123.456e7, we want to decompose it into an integer mantissa
-  // (123456) and an exponent (7 - 3 = 4), and this routine is given the
-  // "123.456" to parse as the mantissa.
-  static auto ParseInteger(llvm::StringRef digits, int radix,
-                           bool needs_cleaning) -> llvm::APInt {
-    llvm::SmallString<32> cleaned;
-    if (needs_cleaning) {
-      cleaned.reserve(digits.size());
-      std::remove_copy_if(digits.begin(), digits.end(),
-                          std::back_inserter(cleaned),
-                          [](char c) { return c == '_' || c == '.'; });
-      digits = cleaned;
-    }
-
-    llvm::APInt value;
-    if (digits.getAsInteger(radix, value)) {
-      llvm_unreachable("should never fail");
-    }
-    return value;
-  }
-
- private:
-  DiagnosticEmitter& emitter;
-  NumericLiteral literal;
-
-  // The radix of the literal: 2, 10, or 16, for a prefix of '0b', no prefix,
-  // or '0x', respectively.
-  int radix = 10;
-
-  // The various components of a numeric literal:
-  //
-  //     [radix] int_part [. fract_part [[ep] [+-] exponent_part]]
-  llvm::StringRef int_part;
-  llvm::StringRef fract_part;
-  llvm::StringRef exponent_part;
-
-  // Do we need to remove any special characters (digit separator or radix
-  // point) before interpreting the mantissa or exponent as an integer?
-  bool mantissa_needs_cleaning = false;
-  bool exponent_needs_cleaning = false;
-
-  // True if we found a `-` before `exponent_part`.
-  bool exponent_is_negative = false;
-
-  // True if we produced an error but recovered.
-  bool recovered_from_error = false;
-};
-}  // namespace
-
 // Implementation of the lexer logic itself.
 //
 // The design is that lexing can loop over the source buffer, consuming it into
@@ -618,39 +173,41 @@ class TokenizedBuffer::Lexer {
   }
 
   auto LexNumericLiteral(llvm::StringRef& source_text) -> LexResult {
-    NumericLiteral literal = TakeLeadingNumericLiteral(source_text);
-    if (literal.text.empty()) {
+    llvm::Optional<NumericLiteralToken> literal =
+        NumericLiteralToken::Lex(source_text);
+    if (!literal) {
       return LexResult::NoMatch();
     }
 
     int int_column = current_column;
-    current_column += literal.text.size();
-    source_text = source_text.drop_front(literal.text.size());
+    int token_size = literal->Text().size();
+    current_column += token_size;
+    source_text = source_text.drop_front(token_size);
 
     if (!set_indent) {
       current_line_info->indent = int_column;
       set_indent = true;
     }
 
-    NumericLiteralParser literal_parser(emitter, literal);
+    NumericLiteralToken::Parser literal_parser(emitter, *literal);
 
     switch (literal_parser.Check()) {
-      case NumericLiteralParser::UnrecoverableError: {
+      case NumericLiteralToken::Parser::UnrecoverableError: {
         auto token = buffer.AddToken({
             .kind = TokenKind::Error(),
             .token_line = current_line,
             .column = int_column,
-            .error_length = static_cast<int32_t>(literal.text.size()),
+            .error_length = token_size,
         });
         buffer.has_errors = true;
         return token;
       }
 
-      case NumericLiteralParser::RecoverableError:
+      case NumericLiteralToken::Parser::RecoverableError:
         buffer.has_errors = true;
         break;
 
-      case NumericLiteralParser::Valid:
+      case NumericLiteralToken::Parser::Valid:
         break;
     }
 
@@ -908,7 +465,10 @@ auto TokenizedBuffer::GetTokenText(Token token) const -> llvm::StringRef {
       token_info.kind == TokenKind::RealLiteral()) {
     auto& line_info = GetLineInfo(token_info.token_line);
     int64_t token_start = line_info.start + token_info.column;
-    return TakeLeadingNumericLiteral(source->Text().substr(token_start)).text;
+    llvm::Optional<NumericLiteralToken> relexed_token =
+        NumericLiteralToken::Lex(source->Text().substr(token_start));
+    assert(relexed_token && "Could not reform numeric literal token.");
+    return relexed_token->Text();
   }
 
   assert(token_info.kind == TokenKind::Identifier() &&

+ 37 - 261
lexer/tokenized_buffer_test.cpp

@@ -77,8 +77,8 @@ TEST_F(LexerTest, TracksLinesAndColumns) {
                       }));
 }
 
-TEST_F(LexerTest, HandlesIntegerLiteral) {
-  auto buffer = Lex("12-578\n  1  2\n0x12_3ABC\n0b10_10_11\n1_234_567");
+TEST_F(LexerTest, HandlesNumericLiteral) {
+  auto buffer = Lex("12-578\n  1  2\n0x12_3ABC\n0b10_10_11\n1_234_567\n1.5e9");
   EXPECT_FALSE(buffer.HasErrors());
   ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
                           {.kind = TokenKind::IntegerLiteral(),
@@ -120,6 +120,11 @@ TEST_F(LexerTest, HandlesIntegerLiteral) {
                            .column = 1,
                            .indent_column = 1,
                            .text = "1_234_567"},
+                          {.kind = TokenKind::RealLiteral(),
+                           .line = 6,
+                           .column = 1,
+                           .indent_column = 1,
+                           .text = "1.5e9"},
                       }));
   auto token_12 = buffer.Tokens().begin();
   EXPECT_EQ(buffer.GetIntegerLiteral(*token_12), 12);
@@ -135,272 +140,43 @@ TEST_F(LexerTest, HandlesIntegerLiteral) {
   EXPECT_EQ(buffer.GetIntegerLiteral(*token_0b10_10_11), 0b10'10'11);
   auto token_1_234_567 = buffer.Tokens().begin() + 7;
   EXPECT_EQ(buffer.GetIntegerLiteral(*token_1_234_567), 1'234'567);
+  auto token_1_5e9 = buffer.Tokens().begin() + 8;
+  auto value_1_5e9 = buffer.GetRealLiteral(*token_1_5e9);
+  EXPECT_EQ(value_1_5e9.Mantissa().getZExtValue(), 15);
+  EXPECT_EQ(value_1_5e9.Exponent().getSExtValue(), 8);
+  EXPECT_EQ(value_1_5e9.IsDecimal(), true);
 }
 
-TEST_F(LexerTest, ValidatesBaseSpecifier) {
-  llvm::StringLiteral valid[] = {
-      // Decimal integer literals.
-      "0",
-      "1",
-      "123456789000000000000000000000000000000000000",
-
-      // Hexadecimal integer literals.
-      "0x0123456789ABCDEF",
-      "0x0000000000000000000000000000000",
-
-      // Binary integer literals.
-      "0b10110100101001010",
-      "0b0000000",
-  };
-  for (llvm::StringLiteral literal : valid) {
-    auto buffer = Lex(literal);
-    EXPECT_FALSE(buffer.HasErrors()) << literal;
-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                            {.kind = TokenKind::IntegerLiteral(),
-                             .line = 1,
-                             .column = 1,
-                             .indent_column = 1,
-                             .text = literal}}));
-  }
-
-  llvm::StringLiteral invalid[] = {
-      "00",  "0X123",    "0o123",          "0B1",
-      "007", "123L",     "123456789A",     "0x",
-      "0b",  "0x123abc", "0b011101201001", "0b10A",
-      "0x_", "0b_",
-  };
-  for (llvm::StringLiteral literal : invalid) {
-    auto buffer = Lex(literal);
-    EXPECT_TRUE(buffer.HasErrors()) << literal;
-    ASSERT_THAT(
-        buffer,
-        HasTokens(llvm::ArrayRef<ExpectedToken>{{.kind = TokenKind::Error(),
-                                                 .line = 1,
-                                                 .column = 1,
-                                                 .indent_column = 1,
-                                                 .text = literal}}));
-  }
-}
-
-TEST_F(LexerTest, ValidatesIntegerDigitSeparators) {
-  llvm::StringLiteral valid[] = {
-      // Decimal literals optionally have digit separators every 3 places.
-      "1_234",
-      "123_456",
-      "1_234_567",
-
-      // Hexadecimal literals optionally have digit separators every 4 places.
-      "0x1_0000",
-      "0x1000_0000",
-      "0x1_0000_0000",
-
-      // Binary integer literals can have digit separators anywhere.
-      "0b1_0_1_0_1_0",
-      "0b111_0000",
-  };
-  for (llvm::StringLiteral literal : valid) {
-    auto buffer = Lex(literal);
-    EXPECT_FALSE(buffer.HasErrors()) << literal;
-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                            {.kind = TokenKind::IntegerLiteral(),
-                             .line = 1,
-                             .column = 1,
-                             .indent_column = 1,
-                             .text = literal}}));
-  }
-
-  llvm::StringLiteral invalid[] = {
-      // Decimal literals.
-      "12_34",
-      "123_4_6_789",
-      "12_3456_789",
-      "12__345",
-      "1_",
-
-      // Hexadecimal literals.
-      "0x_1234",
-      "0x123_",
-      "0x12_3",
-      "0x_234_5678",
-      "0x1234_567",
-
-      // Binary literals.
-      "0b_10101",
-      "0b1__01",
-      "0b1011_",
-      "0b1_01_01_",
-  };
-  for (llvm::StringLiteral literal : invalid) {
-    auto buffer = Lex(literal);
-    EXPECT_TRUE(buffer.HasErrors()) << literal;
-    // We expect to produce a token even for a literal containing invalid digit
-    // separators, for better error recovery.
-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                            {.kind = TokenKind::IntegerLiteral(),
-                             .line = 1,
-                             .column = 1,
-                             .indent_column = 1,
-                             .text = literal}}));
-  }
-}
-
-TEST_F(LexerTest, HandlesRealLiteral) {
-  struct Testcase {
-    llvm::StringLiteral token;
-    uint64_t mantissa;
-    int64_t exponent;
-    unsigned radix;
-  };
-  Testcase testcases[] = {
-      // Decimal real literals.
-      {.token = "0.0", .mantissa = 0, .exponent = -1, .radix = 10},
-      {.token = "12.345", .mantissa = 12345, .exponent = -3, .radix = 10},
-      {.token = "12.345e6", .mantissa = 12345, .exponent = 3, .radix = 10},
-      {.token = "12.345e+6", .mantissa = 12345, .exponent = 3, .radix = 10},
-      {.token = "1_234.5e-2", .mantissa = 12345, .exponent = -3, .radix = 10},
-      {.token = "1.0e-2_000_000",
-       .mantissa = 10,
-       .exponent = -2'000'001,
-       .radix = 10},
-
-      // Hexadecimal real literals.
-      {.token = "0x1_2345_6789.CDEF",
-       .mantissa = 0x1'2345'6789'CDEF,
-       .exponent = -16,
-       .radix = 16},
-      {.token = "0x0.0001p4", .mantissa = 1, .exponent = -12, .radix = 16},
-      {.token = "0x0.0001p+4", .mantissa = 1, .exponent = -12, .radix = 16},
-      {.token = "0x0.0001p-4", .mantissa = 1, .exponent = -20, .radix = 16},
-      // The exponent here works out as exactly INT64_MIN.
-      {.token = "0x1.01p-9223372036854775800",
-       .mantissa = 0x101,
-       .exponent = -9223372036854775807L - 1L,
-       .radix = 16},
-      // The exponent here doesn't fit in a signed 64-bit integer until we
-      // adjust for the radix point.
-      {.token = "0x1.01p9223372036854775809",
-       .mantissa = 0x101,
-       .exponent = 9223372036854775801L,
-       .radix = 16},
-
-      // Binary real literals. These are invalid, but we accept them for error
-      // recovery.
-      {.token = "0b10_11_01.01",
-       .mantissa = 0b10110101,
-       .exponent = -2,
-       .radix = 2},
-  };
-  for (Testcase testcase : testcases) {
-    auto buffer = Lex(testcase.token);
-    EXPECT_EQ(buffer.HasErrors(), testcase.radix == 2);
-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                            {.kind = TokenKind::RealLiteral(),
-                             .line = 1,
-                             .column = 1,
-                             .indent_column = 1,
-                             .text = testcase.token},
-                        }));
-    auto token = buffer.Tokens().begin();
-    TokenizedBuffer::RealLiteralValue value = buffer.GetRealLiteral(*token);
-    EXPECT_EQ(value.Mantissa().getZExtValue(), testcase.mantissa);
-    EXPECT_EQ(value.Exponent().getSExtValue(), testcase.exponent);
-    EXPECT_EQ(value.IsDecimal(), testcase.radix == 10);
-  }
-}
-
-TEST_F(LexerTest, HandlesRealLiteralOverflow) {
-  llvm::StringLiteral input = "0x1.000001p-9223372036854775800";
-  auto buffer = Lex(input);
-  EXPECT_FALSE(buffer.HasErrors());
+TEST_F(LexerTest, HandlesInvalidNumericLiterals) {
+  auto buffer = Lex("14x 15_49 0x3.5q 0x3_4.5_6 0ops");
+  EXPECT_TRUE(buffer.HasErrors());
   ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                          {.kind = TokenKind::RealLiteral(),
+                          {.kind = TokenKind::Error(),
                            .line = 1,
                            .column = 1,
                            .indent_column = 1,
-                           .text = input},
+                           .text = "14x"},
+                          {.kind = TokenKind::IntegerLiteral(),
+                           .line = 1,
+                           .column = 5,
+                           .indent_column = 1,
+                           .text = "15_49"},
+                          {.kind = TokenKind::Error(),
+                           .line = 1,
+                           .column = 11,
+                           .indent_column = 1,
+                           .text = "0x3.5q"},
+                          {.kind = TokenKind::RealLiteral(),
+                           .line = 1,
+                           .column = 18,
+                           .indent_column = 1,
+                           .text = "0x3_4.5_6"},
+                          {.kind = TokenKind::Error(),
+                           .line = 1,
+                           .column = 28,
+                           .indent_column = 1,
+                           .text = "0ops"},
                       }));
-  auto token = buffer.Tokens().begin();
-  TokenizedBuffer::RealLiteralValue value = buffer.GetRealLiteral(*token);
-  EXPECT_EQ(value.Mantissa(), 0x1000001);
-  EXPECT_EQ((value.Exponent() + 9223372036854775800).getSExtValue(), -24);
-  EXPECT_EQ(value.IsDecimal(), false);
-}
-
-TEST_F(LexerTest, ValidatesRealLiterals) {
-  llvm::StringLiteral invalid_digit_separators[] = {
-      // Invalid digit separators.
-      "12_34.5",     "123.4_567", "123.456_7", "1_2_3.4",
-      "123.4e56_78", "0x12_34.5", "0x12.3_4",  "0x12.34p5_6",
-  };
-  for (llvm::StringLiteral literal : invalid_digit_separators) {
-    auto buffer = Lex(literal);
-    EXPECT_TRUE(buffer.HasErrors()) << literal;
-    // We expect to produce a token even for a literal containing invalid digit
-    // separators, for better error recovery.
-    ASSERT_THAT(buffer, HasTokens(llvm::ArrayRef<ExpectedToken>{
-                            {.kind = TokenKind::RealLiteral(),
-                             .line = 1,
-                             .column = 1,
-                             .indent_column = 1,
-                             .text = literal}}));
-  }
-
-  llvm::StringLiteral invalid[] = {
-      // No digits in integer part.
-      "0x.0",
-      "0b.0",
-      "0x_.0",
-      "0b_.0",
-
-      // No digits in fractional part.
-      "0.e",
-      "0.e0",
-      "0.e+0",
-      "0x0.p",
-      "0x0.p-0",
-
-      // Invalid digits in mantissa.
-      "123A.4",
-      "123.4A",
-      "123A.4e0",
-      "123.4Ae0",
-      "0x123ABCDEFG.0",
-      "0x123.ABCDEFG",
-      "0x123ABCDEFG.0p0",
-      "0x123.ABCDEFGp0",
-
-      // Invalid exponent letter.
-      "0.0f0",
-      "0.0p0",
-      "0.0z+0",
-      "0x0.0e0",
-      "0x0.0f0",
-      "0x0.0z-0",
-
-      // No digits in exponent part.
-      "0.0e",
-      "0x0.0p",
-      "0.0e_",
-      "0x0.0p_",
-
-      // Invalid digits in exponent part.
-      "0.0eHELLO",
-      "0.0eA",
-      "0.0e+A",
-      "0x0.0pA",
-      "0x0.0p-A",
-  };
-  for (llvm::StringLiteral literal : invalid) {
-    auto buffer = Lex(literal);
-    EXPECT_TRUE(buffer.HasErrors()) << literal;
-    ASSERT_THAT(
-        buffer,
-        HasTokens(llvm::ArrayRef<ExpectedToken>{{.kind = TokenKind::Error(),
-                                                 .line = 1,
-                                                 .column = 1,
-                                                 .indent_column = 1,
-                                                 .text = literal}}));
-  }
 }
 
 TEST_F(LexerTest, SplitsNumericLiteralsProperly) {