
Shrink the lexer's token location and line data structures. (#4269)

First, this replaces the separate line index and column index in the
token information with a single 32-bit byte offset of the token. This is
then used to compute line and column numbers with a binary search of the
line structure and then using that to compute the column within the
line. In practice, this is _much_ more efficient:

- Smaller token data structure. This will hopefully combine with a
subsequent optimization PR that shrinks the token data structure still
further.
- Fewer stores to form each token's information in the tight hot loop of
the lexer.
- Less state to maintain while lexing, fewer computations while lexing.

We only have to search to build the line and column information off the
hot lexing path, and so this ends up being a significant win and shrinks
some of the more significant data structures.
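The off-hot-path lookup described above can be sketched roughly as follows. This is a minimal illustration, not the PR's actual API: `ComputeLineAndColumn` and the bare vector of line starts are hypothetical names standing in for `TokenizedBuffer::FindLineIndex` and the `line_infos_` structure in the real change.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Line starts are sorted by byte offset, so a binary search finds the line
// containing a token's byte offset, and the column falls out as the distance
// from that line's start. (Illustrative sketch; names are hypothetical.)
auto ComputeLineAndColumn(const std::vector<int32_t>& line_starts,
                          int32_t byte_offset) -> std::pair<int, int> {
  // Find the first line that starts *after* the offset...
  auto it = std::partition_point(
      line_starts.begin(), line_starts.end(),
      [byte_offset](int32_t start) { return start <= byte_offset; });
  // ...then step back one to get the line containing the offset.
  --it;
  int line = static_cast<int>(it - line_starts.begin());
  int column = byte_offset - *it;
  return {line, column};
}
```

Because this search only runs when line/column information is actually requested (diagnostics, printing), its `O(log n)` cost never touches the lexer's hot loop, while each token shrinks by a full field.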

Second, this shrinks the line start to a 32-bit integer and removes the
line length. Our source buffer already ensures we only have 2 GiB of
source with a nice diagnostic. I've just added a check to help document
this in the lexer. The line length can be avoided in all of the cases it
was being used, largely by looking at the next line's start and working
from there. This also precipitated cleaning up some code that dated from
when lines were only built during lexing rather than being pre-built,
which resulted in nice simplifications.
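The "look at the next line's start" technique can be sketched like this. Again a hedged illustration with hypothetical names (`GetLineText`, a plain vector of starts), not the committed code, which works on `llvm::StringRef` and `LineInfo`:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// With only line-start offsets stored, a line's text is the slice from its
// start to the next line's start, minus the trailing newline. The last line
// simply runs to the end of the buffer. (Illustrative sketch.)
auto GetLineText(const std::string& source,
                 const std::vector<int32_t>& line_starts, int line)
    -> std::string {
  int32_t begin = line_starts[line];
  std::string text =
      line + 1 < static_cast<int>(line_starts.size())
          ? source.substr(begin, line_starts[line + 1] - begin)
          : source.substr(begin);
  // Drop the newline, if present, so the result is just the line's content.
  if (!text.empty() && text.back() == '\n') {
    text.pop_back();
  }
  return text;
}
```

This is why the explicit `length` field can go away: every place that consumed it can be rephrased in terms of the following line's start.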

With this PR, I think it makes sense to rename a bunch of methods on
`TokenizedBuffer`, but to an extent that was already needed, as these
methods somewhat predate the more pervasive style conventions. I avoided
that here to keep this PR focused on the implementation change; I'll
create a subsequent PR to update the API with both better nomenclature
and fewer deviations from our conventions.

There may also be a way to de-duplicate the binary search in the
diagnostic location conversion and the main line accessor binary search,
but it wasn't obvious to me that it would be a net savings, so I left it
alone for now.

The performance impact of this varies quite a bit...

The lexer's benchmark improves pretty consistently across the board on
both x86 and Arm. For x86, where I have nice comparison tools, it
appears 3% to 20% faster depending on the specific pattern. For Arm
server CPUs at least, the gain seems much smaller, but it is still an
improvement.

The overall compilation benchmarks, however, don't improve much with
these changes alone on x86. There is a significant reduction in the
instruction count required for lexing, but overall performance seems to
be bottlenecked elsewhere in the compilation. On Arm, despite the more
modest gains in special cases of lexing, this shows fairly consistent
1-2% improvements in overall lexing performance on our compilation
benchmark. And the expectation is that these improvements will compound
with subsequent work to further compact our representation.

---------

Co-authored-by: Jon Ross-Perkins <jperkins@google.com>
Chandler Carruth, 1 year ago
parent
commit
97e98bcc5a
3 changed files with 120 additions and 124 deletions
  1. toolchain/lex/lex.cpp (+57, -60)
  2. toolchain/lex/tokenized_buffer.cpp (+58, -45)
  3. toolchain/lex/tokenized_buffer.h (+5, -19)

+ 57 - 60
toolchain/lex/lex.cpp

@@ -5,6 +5,7 @@
 #include "toolchain/lex/lex.h"
 
 #include <array>
+#include <limits>
 
 #include "common/check.h"
 #include "common/variant_helpers.h"
@@ -97,9 +98,12 @@ class [[clang::internal_linkage]] Lexer {
     return &buffer_.line_infos_[line_index_];
   }
 
-  auto ComputeColumn(ssize_t position) -> int {
-    CARBON_DCHECK(position >= current_line_info()->start);
-    return position - current_line_info()->start;
+  auto next_line() -> LineIndex { return LineIndex(line_index_ + 1); }
+
+  auto next_line_info() -> TokenizedBuffer::LineInfo* {
+    CARBON_CHECK(line_index_ + 1 <
+                 static_cast<ssize_t>(buffer_.line_infos_.size()));
+    return &buffer_.line_infos_[line_index_ + 1];
   }
 
   auto NoteWhitespace() -> void {
@@ -142,7 +146,8 @@ class [[clang::internal_linkage]] Lexer {
 
   // Given a word that has already been lexed, determine whether it is a type
   // literal and if so form the corresponding token.
-  auto LexWordAsTypeLiteralToken(llvm::StringRef word, int column) -> LexResult;
+  auto LexWordAsTypeLiteralToken(llvm::StringRef word, int32_t byte_offset)
+      -> LexResult;
 
   auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
       -> LexResult;
@@ -661,6 +666,10 @@ static auto EstimateUpperBoundOnNumIdentifiers(int line_count) -> int {
 auto Lexer::Lex() && -> TokenizedBuffer {
   llvm::StringRef source_text = buffer_.source_->text();
 
+  // Enforced by the source buffer, but something we heavily rely on throughout
+  // the lexer.
+  CARBON_CHECK(source_text.size() < std::numeric_limits<int32_t>::max());
+
   // First build up our line data structures.
   MakeLines(source_text);
 
@@ -717,18 +726,18 @@ auto Lexer::MakeLines(llvm::StringRef source_text) -> void {
   while (const char* nl = reinterpret_cast<const char*>(
              memchr(&text[start], '\n', size - start))) {
     ssize_t nl_index = nl - text;
-    buffer_.AddLine(TokenizedBuffer::LineInfo(start, nl_index - start));
+    buffer_.AddLine(TokenizedBuffer::LineInfo(start));
     start = nl_index + 1;
   }
   // The last line ends at the end of the file.
-  buffer_.AddLine(TokenizedBuffer::LineInfo(start, size - start));
+  buffer_.AddLine(TokenizedBuffer::LineInfo(start));
 
   // If the last line wasn't empty, the file ends with an unterminated line.
   // Add an extra blank line so that we never need to handle the special case
   // of being on the last line inside the lexer and needing to not increment
   // to the next line.
   if (start != size) {
-    buffer_.AddLine(TokenizedBuffer::LineInfo(size, 0));
+    buffer_.AddLine(TokenizedBuffer::LineInfo(size));
   }
 
   // Now that all the infos are allocated, get a fresh pointer to the first
@@ -826,10 +835,10 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     emitter_.Emit(source_text.begin() + position, TrailingComment);
 
     // Note that we cannot fall-through here as the logic below doesn't handle
-    // trailing comments. For simplicity, we just consume the trailing comment
-    // itself and let the normal lexer handle the newline as if there weren't
-    // a comment at all.
-    position = line_info->start + line_info->length;
+    // trailing comments. Instead, we treat trailing comments as vertical
+    // whitespace, which already is designed to skip over any erroneous text at
+    // the end of the line.
+    LexVerticalWhitespace(source_text, position);
     return;
   }
 
@@ -950,24 +959,23 @@ auto Lexer::LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
     return LexError(source_text, position);
   }
 
-  int int_column = ComputeColumn(position);
+  // Capture the position before we step past the token.
+  int32_t byte_offset = position;
   int token_size = literal->text().size();
   position += token_size;
 
   return VariantMatch(
       literal->ComputeValue(emitter_),
       [&](NumericLiteral::IntValue&& value) {
-        auto token = buffer_.AddToken({.kind = TokenKind::IntLiteral,
-                                       .token_line = current_line(),
-                                       .column = int_column});
+        auto token = buffer_.AddToken(
+            {.kind = TokenKind::IntLiteral, .byte_offset = byte_offset});
         buffer_.GetTokenInfo(token).int_id =
             buffer_.value_stores_->ints().Add(std::move(value.value));
         return token;
       },
       [&](NumericLiteral::RealValue&& value) {
-        auto token = buffer_.AddToken({.kind = TokenKind::RealLiteral,
-                                       .token_line = current_line(),
-                                       .column = int_column});
+        auto token = buffer_.AddToken(
+            {.kind = TokenKind::RealLiteral, .byte_offset = byte_offset});
         buffer_.GetTokenInfo(token).real_id =
             buffer_.value_stores_->reals().Add(Real{
                 .mantissa = value.mantissa,
@@ -978,8 +986,7 @@ auto Lexer::LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
       [&](NumericLiteral::UnrecoverableError) {
         auto token = buffer_.AddToken({
             .kind = TokenKind::Error,
-            .token_line = current_line(),
-            .column = int_column,
+            .byte_offset = byte_offset,
             .error_length = token_size,
         });
         return token;
@@ -994,15 +1001,15 @@ auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
     return LexError(source_text, position);
   }
 
-  LineIndex string_line = current_line();
-  int string_column = ComputeColumn(position);
+  // Capture the position before we step past the token.
+  int32_t byte_offset = position;
+  int string_column = byte_offset - current_line_info()->start;
   ssize_t literal_size = literal->text().size();
   position += literal_size;
 
   // Update line and column information.
   if (literal->is_multi_line()) {
-    while (current_line_info()->start + current_line_info()->length <
-           position) {
+    while (next_line_info()->start < position) {
       ++line_index_;
       current_line_info()->indent = string_column;
     }
@@ -1015,8 +1022,7 @@ auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
     auto string_id = buffer_.value_stores_->string_literal_values().Add(
         literal->ComputeValue(buffer_.allocator_, emitter_));
     auto token = buffer_.AddToken({.kind = TokenKind::StringLiteral,
-                                   .token_line = string_line,
-                                   .column = string_column,
+                                   .byte_offset = byte_offset,
                                    .string_literal_id = string_id});
     return token;
   } else {
@@ -1025,8 +1031,7 @@ auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
     emitter_.Emit(literal->text().begin(), UnterminatedString);
     return buffer_.AddToken(
         {.kind = TokenKind::Error,
-         .token_line = string_line,
-         .column = string_column,
+         .byte_offset = byte_offset,
          .error_length = static_cast<int32_t>(literal_size)});
   }
 }
@@ -1041,9 +1046,8 @@ auto Lexer::LexOneCharSymbolToken(llvm::StringRef source_text, TokenKind kind,
       << "' instead of the spelling '" << kind.fixed_spelling()
       << "' of the incoming token kind '" << kind << "'";
 
-  TokenIndex token = buffer_.AddToken({.kind = kind,
-                                       .token_line = current_line(),
-                                       .column = ComputeColumn(position)});
+  TokenIndex token = buffer_.AddToken(
+      {.kind = kind, .byte_offset = static_cast<int32_t>(position)});
   ++position;
   return token;
 }
@@ -1096,14 +1100,13 @@ auto Lexer::LexSymbolToken(llvm::StringRef source_text, ssize_t& position)
     return LexError(source_text, position);
   }
 
-  TokenIndex token = buffer_.AddToken({.kind = kind,
-                                       .token_line = current_line(),
-                                       .column = ComputeColumn(position)});
+  TokenIndex token = buffer_.AddToken(
+      {.kind = kind, .byte_offset = static_cast<int32_t>(position)});
   position += kind.fixed_spelling().size();
   return token;
 }
 
-auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
+auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int32_t byte_offset)
     -> LexResult {
   if (word.size() < 2) {
     // Too short to form one of these tokens.
@@ -1133,8 +1136,7 @@ auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
   if (!CanLexInt(emitter_, suffix)) {
     return buffer_.AddToken(
         {.kind = TokenKind::Error,
-         .token_line = current_line(),
-         .column = column,
+         .byte_offset = byte_offset,
          .error_length = static_cast<int32_t>(word.size())});
   }
   llvm::APInt suffix_value;
@@ -1142,8 +1144,7 @@ auto Lexer::LexWordAsTypeLiteralToken(llvm::StringRef word, int column)
     return LexResult::NoMatch();
   }
 
-  auto token = buffer_.AddToken(
-      {.kind = *kind, .token_line = current_line(), .column = column});
+  auto token = buffer_.AddToken({.kind = *kind, .byte_offset = byte_offset});
   buffer_.GetTokenInfo(token).int_id =
       buffer_.value_stores_->ints().Add(std::move(suffix_value));
   return token;
@@ -1158,7 +1159,8 @@ auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text,
   CARBON_CHECK(
       IsIdStartByteTable[static_cast<unsigned char>(source_text[position])]);
 
-  int column = ComputeColumn(position);
+  // Capture the position before we step past the token.
+  int32_t byte_offset = position;
 
   // Take the valid characters off the front of the source buffer.
   llvm::StringRef identifier_text =
@@ -1167,7 +1169,8 @@ auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text,
   position += identifier_text.size();
 
   // Check if the text is a type literal, and if so form such a literal.
-  if (LexResult result = LexWordAsTypeLiteralToken(identifier_text, column)) {
+  if (LexResult result =
+          LexWordAsTypeLiteralToken(identifier_text, byte_offset)) {
     return result;
   }
 
@@ -1177,15 +1180,13 @@ auto Lexer::LexKeywordOrIdentifier(llvm::StringRef source_text,
 #include "toolchain/lex/token_kind.def"
                        .Default(TokenKind::Error);
   if (kind != TokenKind::Error) {
-    return buffer_.AddToken(
-        {.kind = kind, .token_line = current_line(), .column = column});
+    return buffer_.AddToken({.kind = kind, .byte_offset = byte_offset});
   }
 
   // Otherwise we have a generic identifier.
   return buffer_.AddToken(
       {.kind = TokenKind::Identifier,
-       .token_line = current_line(),
-       .column = column,
+       .byte_offset = byte_offset,
        .ident_id = buffer_.value_stores_->identifiers().Add(identifier_text)});
 }
 
@@ -1206,8 +1207,7 @@ auto Lexer::LexHash(llvm::StringRef source_text, ssize_t& position)
       position + 1 == static_cast<ssize_t>(source_text.size()) ||
       !IsIdStartByteTable[static_cast<unsigned char>(
           source_text[position + 1])] ||
-      prev_token_info.token_line != current_line() ||
-      prev_token_info.column != ComputeColumn(position) - 1) {
+      prev_token_info.byte_offset != static_cast<int32_t>(position) - 1) {
     [[clang::musttail]] return LexStringLiteral(source_text, position);
   }
   CARBON_DCHECK(buffer_.value_stores_->identifiers().Get(
@@ -1255,8 +1255,7 @@ auto Lexer::LexError(llvm::StringRef source_text, ssize_t& position)
 
   auto token = buffer_.AddToken(
       {.kind = TokenKind::Error,
-       .token_line = current_line(),
-       .column = ComputeColumn(position),
+       .byte_offset = static_cast<int32_t>(position),
        .error_length = static_cast<int32_t>(error_text.size())});
   CARBON_DIAGNOSTIC(UnrecognizedCharacters, Error,
                     "Encountered unrecognized characters while parsing.");
@@ -1268,13 +1267,14 @@ auto Lexer::LexError(llvm::StringRef source_text, ssize_t& position)
 
 auto Lexer::LexFileStart(llvm::StringRef source_text, ssize_t& position)
     -> void {
+  CARBON_CHECK(position == 0);
+
   // Before lexing any source text, add the start-of-file token so that code
   // can assume a non-empty token buffer for the rest of lexing. Note that the
   // start-of-file always has trailing space because it *is* whitespace.
   buffer_.AddToken({.kind = TokenKind::FileStart,
                     .has_trailing_space = true,
-                    .token_line = current_line(),
-                    .column = 0});
+                    .byte_offset = 0});
 
   // Also skip any horizontal whitespace and record the indentation of the
   // first line.
@@ -1295,17 +1295,13 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
   if (position == current_line_info()->start && line_index_ != 0) {
     --line_index_;
     --position;
-  } else {
-    // Update the line length as this is also the end of a line.
-    current_line_info()->length = ComputeColumn(position);
   }
 
   // The end-of-file token is always considered to be whitespace.
   NoteWhitespace();
 
   buffer_.AddToken({.kind = TokenKind::FileEnd,
-                    .token_line = current_line(),
-                    .column = ComputeColumn(position)});
+                    .byte_offset = static_cast<int32_t>(position)});
 
   // If we had any mismatched brackets, issue diagnostics and fix them.
   if (has_mismatched_brackets_ || !open_groups_.empty()) {
@@ -1334,16 +1330,17 @@ class Lexer::ErrorRecoveryBuffer {
         << "Insertions performed out of order.";
 
     // Find the end of the token before the target token, and add the new token
-    // there. Note that new_token_column is a 1-based column number.
+    // there.
     TokenIndex insert_after(insert_before.index - 1);
-    auto [new_token_line, new_token_column] = buffer_.GetEndLoc(insert_after);
+    const auto& prev_info = buffer_.GetTokenInfo(insert_after);
+    int32_t byte_offset =
+        prev_info.byte_offset + buffer_.GetTokenText(insert_after).size();
     new_tokens_.push_back(
         {insert_before,
          {.kind = kind,
           .has_trailing_space = buffer_.HasTrailingWhitespace(insert_after),
           .is_recovery = true,
-          .token_line = new_token_line,
-          .column = new_token_column - 1}});
+          .byte_offset = byte_offset}});
   }
 
   // Replace the given token with an error token. We do this immediately,

+ 58 - 45
toolchain/lex/tokenized_buffer.cpp

@@ -4,6 +4,7 @@
 
 #include "toolchain/lex/tokenized_buffer.h"
 
+#include <algorithm>
 #include <cmath>
 
 #include "common/check.h"
@@ -24,7 +25,7 @@ auto TokenizedBuffer::GetKind(TokenIndex token) const -> TokenKind {
 }
 
 auto TokenizedBuffer::GetLine(TokenIndex token) const -> LineIndex {
-  return GetTokenInfo(token).token_line;
+  return FindLineIndex(GetTokenInfo(token).byte_offset);
 }
 
 auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
@@ -32,7 +33,9 @@ auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
 }
 
 auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
-  return GetTokenInfo(token).column + 1;
+  const auto& token_info = GetTokenInfo(token);
+  const auto& line_info = GetLineInfo(FindLineIndex(token_info.byte_offset));
+  return token_info.byte_offset - line_info.start + 1;
 }
 
 auto TokenizedBuffer::GetEndLoc(TokenIndex token) const
@@ -62,19 +65,16 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
   }
 
   if (token_info.kind == TokenKind::Error) {
-    const auto& line_info = GetLineInfo(token_info.token_line);
-    int64_t token_start = line_info.start + token_info.column;
-    return source_->text().substr(token_start, token_info.error_length);
+    return source_->text().substr(token_info.byte_offset,
+                                  token_info.error_length);
   }
 
   // Refer back to the source text to preserve oddities like radix or digit
   // separators the author included.
   if (token_info.kind == TokenKind::IntLiteral ||
       token_info.kind == TokenKind::RealLiteral) {
-    const auto& line_info = GetLineInfo(token_info.token_line);
-    int64_t token_start = line_info.start + token_info.column;
     std::optional<NumericLiteral> relexed_token =
-        NumericLiteral::Lex(source_->text().substr(token_start));
+        NumericLiteral::Lex(source_->text().substr(token_info.byte_offset));
     CARBON_CHECK(relexed_token) << "Could not reform numeric literal token.";
     return relexed_token->text();
   }
@@ -82,10 +82,8 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
   // Refer back to the source text to find the original spelling, including
   // escape sequences etc.
   if (token_info.kind == TokenKind::StringLiteral) {
-    const auto& line_info = GetLineInfo(token_info.token_line);
-    int64_t token_start = line_info.start + token_info.column;
     std::optional<StringLiteral> relexed_token =
-        StringLiteral::Lex(source_->text().substr(token_start));
+        StringLiteral::Lex(source_->text().substr(token_info.byte_offset));
     CARBON_CHECK(relexed_token) << "Could not reform string literal token.";
     return relexed_token->text();
   }
@@ -93,10 +91,9 @@ auto TokenizedBuffer::GetTokenText(TokenIndex token) const -> llvm::StringRef {
   // Refer back to the source text to avoid needing to reconstruct the
   // spelling from the size.
   if (token_info.kind.is_sized_type_literal()) {
-    const auto& line_info = GetLineInfo(token_info.token_line);
-    int64_t token_start = line_info.start + token_info.column;
-    llvm::StringRef suffix =
-        source_->text().substr(token_start + 1).take_while(IsDecimalDigit);
+    llvm::StringRef suffix = source_->text()
+                                 .substr(token_info.byte_offset + 1)
+                                 .take_while(IsDecimalDigit);
     return llvm::StringRef(suffix.data() - 1, suffix.size() + 1);
   }
 
@@ -254,6 +251,7 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
   widths.Widen(GetTokenPrintWidths(token));
   int token_index = token.index;
   const auto& token_info = GetTokenInfo(token);
+  LineIndex line_index = FindLineIndex(token_info.byte_offset);
   llvm::StringRef token_text = GetTokenText(token);
 
   // Output the main chunk using one format string. We have to do the
@@ -265,10 +263,9 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
       llvm::format_decimal(token_index, widths.index),
       llvm::right_justify(llvm::formatv("'{0}'", token_info.kind.name()).str(),
                           widths.kind + 2),
-      llvm::format_decimal(GetLineNumber(token_info.token_line), widths.line),
+      llvm::format_decimal(GetLineNumber(GetLine(token)), widths.line),
       llvm::format_decimal(GetColumnNumber(token), widths.column),
-      llvm::format_decimal(GetIndentColumnNumber(token_info.token_line),
-                           widths.indent),
+      llvm::format_decimal(GetIndentColumnNumber(line_index), widths.indent),
       token_text);
 
   switch (token_info.kind) {
@@ -313,6 +310,31 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
   output_stream << " },";
 }
 
+// Find the line index corresponding to a specific byte offset within the source
+// text for this tokenized buffer.
+//
+// This takes advantage of the lines being sorted by their starting byte offsets
+// to do a binary search for the line that contains the provided offset.
+auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
+  CARBON_DCHECK(!line_infos_.empty());
+  const auto* line_it =
+      std::partition_point(line_infos_.begin(), line_infos_.end(),
+                           [byte_offset](LineInfo line_info) {
+                             return line_info.start <= byte_offset;
+                           });
+  --line_it;
+
+  // If this isn't the first line but it starts past the end of the source, then
+  // this is a synthetic line added for simplicity of lexing. Step back one
+  // further to find the last non-synthetic line.
+  if (line_it != line_infos_.begin() &&
+      line_it->start == static_cast<int32_t>(source_->text().size())) {
+    --line_it;
+  }
+  CARBON_DCHECK(line_it->start <= byte_offset);
+  return LineIndex(line_it - line_infos_.begin());
+}
+
 auto TokenizedBuffer::GetLineInfo(LineIndex line) -> LineInfo& {
   return line_infos_[line.index];
 }
@@ -355,40 +377,32 @@ auto TokenizedBuffer::SourceBufferDiagnosticConverter::ConvertLoc(
     const char* loc, ContextFnT /*context_fn*/) const -> DiagnosticLoc {
   CARBON_CHECK(StringRefContainsPointer(buffer_->source_->text(), loc))
       << "location not within buffer";
-  int64_t offset = loc - buffer_->source_->text().begin();
+  int32_t offset = loc - buffer_->source_->text().begin();
 
-  // Find the first line starting after the given location. Note that we can't
-  // inspect `line.length` here because it is not necessarily correct for the
-  // final line during lexing (but will be correct later for the parse tree).
-  const auto* line_it = std::partition_point(
+  // Find the first line starting after the given location.
+  const auto* next_line_it = std::partition_point(
       buffer_->line_infos_.begin(), buffer_->line_infos_.end(),
       [offset](const LineInfo& line) { return line.start <= offset; });
 
   // Step back one line to find the line containing the given position.
-  CARBON_CHECK(line_it != buffer_->line_infos_.begin())
+  CARBON_CHECK(next_line_it != buffer_->line_infos_.begin())
       << "location precedes the start of the first line";
-  --line_it;
+  const auto* line_it = std::prev(next_line_it);
   int line_number = line_it - buffer_->line_infos_.begin();
   int column_number = offset - line_it->start;
 
-  // Start by grabbing the line from the buffer. If the line isn't fully lexed,
-  // the length will be npos and the line will be grabbed from the known start
-  // to the end of the buffer; we'll then adjust the length.
-  llvm::StringRef line =
-      buffer_->source_->text().substr(line_it->start, line_it->length);
-  if (line_it->length == static_cast<int32_t>(llvm::StringRef::npos)) {
-    CARBON_CHECK(line.take_front(column_number).count('\n') == 0)
-        << "Currently we assume no unlexed newlines prior to the error column, "
-           "but there was one when erroring at "
-        << buffer_->source_->filename() << ":" << line_number << ":"
-        << column_number;
-    // Look for the next newline since we don't know the length. We can start at
-    // the column because prior newlines will have been lexed.
-    auto end_newline_pos = line.find('\n', column_number);
-    if (end_newline_pos != llvm::StringRef::npos) {
-      line = line.take_front(end_newline_pos);
-    }
-  }
+  // Grab the line from the buffer by slicing from this line to the next
+  // minus the newline. When on the last line, instead use the start to the end
+  // of the buffer.
+  llvm::StringRef text = buffer_->source_->text();
+  llvm::StringRef line = next_line_it != buffer_->line_infos_.end()
+                             ? text.slice(line_it->start, next_line_it->start)
+                             : text.substr(line_it->start);
+
+  // Remove a newline at the end of the line if present.
+  // TODO: This should expand to remove all vertical whitespace bytes at the
+  // tail of the line such as CR+LF, etc.
+  line.consume_back("\n");
 
   return {.filename = buffer_->source_->filename(),
           .line = line,
@@ -401,9 +415,8 @@ auto TokenDiagnosticConverter::ConvertLoc(TokenIndex token,
     -> DiagnosticLoc {
   // Map the token location into a position within the source buffer.
   const auto& token_info = buffer_->GetTokenInfo(token);
-  const auto& line_info = buffer_->GetLineInfo(token_info.token_line);
   const char* token_start =
-      buffer_->source_->text().begin() + line_info.start + token_info.column;
+      buffer_->source_->text().begin() + token_info.byte_offset;
 
   // Find the corresponding file location.
   // TODO: Should we somehow indicate in the diagnostic location if this token

+ 5 - 19
toolchain/lex/tokenized_buffer.h

@@ -269,11 +269,8 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
     // Whether the token was injected artificially during error recovery.
     bool is_recovery = false;
 
-    // LineIndex on which the TokenIndex starts.
-    LineIndex token_line;
-
-    // Zero-based byte offset of the token within its line.
-    int32_t column;
+    // Zero-based byte offset of the token within the file.
+    int32_t byte_offset;
 
     // We may have up to 32 bits of payload, based on the kind of token.
     union {
@@ -292,23 +289,11 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   };
 
   struct LineInfo {
-    // The length will always be assigned later. Indent may be assigned if
-    // non-zero.
-    explicit LineInfo(int64_t start)
-        : start(start),
-          length(static_cast<int32_t>(llvm::StringRef::npos)),
-          indent(0) {}
-
-    explicit LineInfo(int64_t start, int32_t length)
-        : start(start), length(length), indent(0) {}
+    explicit LineInfo(int32_t start) : start(start), indent(0) {}
 
     // Zero-based byte offset of the start of the line within the source buffer
     // provided.
-    int64_t start;
-
-    // The byte length of the line. Does not include the newline character (or a
-    // nul-terminator or EOF).
-    int32_t length;
+    int32_t start;
 
     // The byte offset from the start of the line of the first non-whitespace
     // character.
@@ -322,6 +307,7 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
                            SourceBuffer& source)
       : value_stores_(&value_stores), source_(&source) {}
 
+  auto FindLineIndex(int32_t byte_offset) const -> LineIndex;
   auto GetLineInfo(LineIndex line) -> LineInfo&;
   auto GetLineInfo(LineIndex line) const -> const LineInfo&;
   auto AddLine(LineInfo info) -> LineIndex;