1 год назад · e68e54dae4
--- a/toolchain/diagnostics/diagnostic_kind.def
+++ b/toolchain/diagnostics/diagnostic_kind.def
@@ -41,6 +41,7 @@ CARBON_DIAGNOSTIC_KIND(MismatchedIndentInString)
 
				 CARBON_DIAGNOSTIC_KIND(MultiLineStringWithDoubleQuotes)
			
 
				 CARBON_DIAGNOSTIC_KIND(NoWhitespaceAfterCommentIntroducer)
			
 
				 CARBON_DIAGNOSTIC_KIND(TooManyDigits)
			
 
				+CARBON_DIAGNOSTIC_KIND(TooManyTokens)
			
 
				 CARBON_DIAGNOSTIC_KIND(TrailingComment)
			
 
				 CARBON_DIAGNOSTIC_KIND(UnicodeEscapeMissingBracedDigits)
			
 
				 CARBON_DIAGNOSTIC_KIND(UnicodeEscapeSurrogate)
			
--- a/toolchain/diagnostics/emitted_diagnostics_test.cpp
+++ b/toolchain/diagnostics/emitted_diagnostics_test.cpp
@@ -60,6 +60,10 @@ static auto IsUntestedDiagnostic(DiagnosticKind diagnostic_kind) -> bool {
 
				       // loss in merge conflicts due to the amount of tests being changed right
			
 
				       // now.
			
 
				       return true;
			
 
				+    case DiagnosticKind::TooManyTokens:
			
 
				+      // This isn't feasible to test with a normal testcase, but is tested in
			
 
				+      // lex/tokenized_buffer_test.cpp.
			
 
				+      return true;
			
 
				     default:
			
 
				       return false;
			
 
				   }
			
--- a/toolchain/lex/lex.cpp
+++ b/toolchain/lex/lex.cpp
@@ -191,6 +191,11 @@ class [[clang::internal_linkage]] Lexer {
 
				 
			
 
				   auto LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void;
			
 
				 
			
 
				+  // Perform final checking and cleanup that should be done once we have
			
 
				+  // finished lexing the whole file, and before we consider the tokenized buffer
			
 
				+  // to be complete.
			
 
				+  auto Finalize() -> void;
			
 
				+
			
 
				   auto DiagnoseAndFixMismatchedBrackets() -> void;
			
 
				 
			
 
				   // The main entry point for dispatching through the lexer's table. This method
			
@@ -729,6 +734,8 @@ auto Lexer::Lex() && -> TokenizedBuffer {
 
				   // dispatch table until everything from source_text is consumed.
			
 
				   DispatchNext(*this, source_text, position);
			
 
				 
			
 
				+  Finalize();
			
 
				+
			
 
				   if (consumer_.seen_error()) {
			
 
				     buffer_.has_errors_ = true;
			
 
				   }
			
@@ -1342,11 +1349,31 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
 
				   NoteWhitespace();
			
 
				 
			
 
				   LexToken(TokenKind::FileEnd, position);
			
 
				+}
			
 
				 
			
 
				+auto Lexer::Finalize() -> void {
			
 
				   // If we had any mismatched brackets, issue diagnostics and fix them.
			
 
				   if (has_mismatched_brackets_ || !open_groups_.empty()) {
			
 
				     DiagnoseAndFixMismatchedBrackets();
			
 
				   }
			
 
				+
			
 
				+  // Reject source files with so many tokens that a token index may no longer
			
 
				+  // fit in the bits of `token_payload_`.
			
 
				+  //
			
 
				+  // Note that we rely on this check also catching the case where there are too
			
 
				+  // many identifiers to fit an `IdentifierId` into a `token_payload_`, and
			
 
				+  // likewise for `IntId` and so on. If we start adding any of those IDs prior
			
 
				+  // to lexing, we may need to also limit the number of those IDs here.
			
 
				+  if (buffer_.token_infos_.size() > TokenizedBuffer::MaxTokens) {
			
 
				+    CARBON_DIAGNOSTIC(TooManyTokens, Error,
			
 
				+                      "too many tokens in source file; try splitting into "
			
 
				+                      "multiple source files");
			
 
				+    // Diagnose at the last in-limit index, which `FileEnd` would have needed.
			
 
				+    token_emitter_.Emit(TokenIndex(TokenizedBuffer::MaxTokens - 1),
			
 
				+                        TooManyTokens);
			
 
				+    // TODO: Convert tokens after the token limit to error tokens to avoid
			
 
				+    // misinterpretation by consumers of the tokenized buffer.
			
 
				+  }
			
 
				 }
			
 
				 
			
 
				 // A list of pending insertions to make into a tokenized buffer for error
			
--- a/toolchain/lex/tokenized_buffer.h
+++ b/toolchain/lex/tokenized_buffer.h
@@ -83,6 +83,10 @@ class TokenDiagnosticConverter : public DiagnosticConverter<TokenIndex> {
 
				 // `HasError` returning true.
			
 
				 class TokenizedBuffer : public Printable<TokenizedBuffer> {
			
 
				  public:
			
 
				+  // The maximum number of tokens that can be stored in the buffer, including
			
 
				+  // the FileStart and FileEnd tokens.
			
 
				+  static constexpr int MaxTokens = 1 << 23;
			
 
				+
			
 
				   // A comment, which can be a block of lines.
			
 
				   //
			
 
				   // This is the API version of `CommentData`.
			
@@ -306,7 +310,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
				     }
			
 
				     auto set_ident_id(IdentifierId ident_id) -> void {
			
 
				       CARBON_DCHECK(kind() == TokenKind::Identifier);
			
 
				-      CARBON_DCHECK(ident_id.index < (2 << PayloadBits));
			
 
				       token_payload_ = ident_id.index;
			
 
				     }
			
 
				 
			
@@ -334,7 +337,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
				     }
			
 
				     auto set_closing_token_index(TokenIndex closing_index) -> void {
			
 
				       CARBON_DCHECK(kind().is_opening_symbol());
			
 
				-      CARBON_DCHECK(closing_index.index < (2 << PayloadBits));
			
 
				       token_payload_ = closing_index.index;
			
 
				     }
			
 
				 
			
@@ -344,7 +346,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
				     }
			
 
				     auto set_opening_token_index(TokenIndex opening_index) -> void {
			
 
				       CARBON_DCHECK(kind().is_closing_symbol());
			
 
				-      CARBON_DCHECK(opening_index.index < (2 << PayloadBits));
			
 
				       token_payload_ = opening_index.index;
			
 
				     }
			
 
				 
			
@@ -395,18 +396,23 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
				         : kind_(kind),
			
 
				           has_leading_space_(has_leading_space),
			
 
				           token_payload_(payload),
			
 
				-          byte_offset_(byte_offset) {
			
 
				-      CARBON_DCHECK(payload >= 0 && payload < (2 << PayloadBits),
			
 
				-                    "Payload won't fit into unsigned bit pack: {0}", payload);
			
 
				-    }
			
 
				+          byte_offset_(byte_offset) {}
			
 
				 
			
 
				     // A bitfield that encodes the token's kind, the leading space flag, and the
			
 
				     // remaining bits in a payload. These are encoded together as a bitfield for
			
 
				     // density and because these are the hottest fields of tokens for consumers
			
 
				     // after lexing.
			
 
				+    //
			
 
				+    // Payload values are typically ID types for which we create at most one per
			
 
				+    // token, so we ensure that `token_payload_` is large enough to fit any
			
 
				+    // token index. Stores to this field may overflow, but we produce an error
			
 
				+    // in `Lexer::Finalize` if the file has more than `MaxTokens` tokens, so
			
 
				+    // this value never overflows if lexing succeeds.
			
 
				     TokenKind::RawEnumType kind_ : sizeof(TokenKind) * 8;
			
 
				     bool has_leading_space_ : 1;
			
 
				     unsigned token_payload_ : PayloadBits;
			
 
				+    static_assert(MaxTokens <= 1 << PayloadBits,
			
 
				+                  "Not enough payload bits to store a token index");
			
 
				 
			
 
				     // Separate storage for the byte offset, this is hot while lexing but then
			
 
				     // generally cold.
			
--- a/toolchain/lex/tokenized_buffer_test.cpp
+++ b/toolchain/lex/tokenized_buffer_test.cpp
@@ -1107,6 +1107,24 @@ TEST_F(LexerTest, DiagnosticUnrecognizedChar) {
 
				   compile_helper_.GetTokenizedBuffer("\b", &consumer);
			
 
				 }
			
 
				 
			
 
				+TEST_F(LexerTest, DiagnosticFileTooLarge) {
			
 
				+  Testing::MockDiagnosticConsumer consumer;
			
 
				+  // Use one `{}` line (two tokens) per allowed token so the input stays well
			
 
				+  // past the limit however `MaxTokens` is tuned. Counting `FileStart` as token
			
 
				+  // 0, token `MaxTokens - 1` is the `{` at the start of line `MaxTokens / 2`.
			
 
				+  static constexpr size_t NumLines = TokenizedBuffer::MaxTokens;
			
 
				+  std::string input;
			
 
				+  input.reserve(NumLines * 3);
			
 
				+  for ([[maybe_unused]] size_t line : llvm::seq(NumLines)) {
			
 
				+    input += "{}\n";
			
 
				+  }
			
 
				+  EXPECT_CALL(consumer,
			
 
				+              HandleDiagnostic(IsSingleDiagnostic(
			
 
				+                  DiagnosticKind::TooManyTokens, DiagnosticLevel::Error,
			
 
				+                  TokenizedBuffer::MaxTokens / 2, 1, _)));
			
 
				+  compile_helper_.GetTokenizedBuffer(input, &consumer);
			
 
				+}
			
 
				+
			
 
				 // Appends comment lines to the string, to create a comment block.
			
 
				 static auto AppendCommentLines(std::string& str, int count, llvm::StringRef tag)
			
 
				     -> void {