Browse Source

Change comments and lines to ValueStores (#5621)

Moves LineInfo and CommentData out so that they can easily be set as
`ValueType` on the Index types. I've also been thinking about letting
`ValueStore` take `ValueType` as a parameter instead of requiring it to
be inferred from the index type, but for these it feels more consistent
with the rest of the toolchain to rely on the inference.

I'm not doing the same with `TokenInfo`, because the recovery token
splicing makes it more difficult to use `ValueStore` there.
Jon Ross-Perkins 11 months ago
parent
commit
766ef7077d
3 changed files with 95 additions and 123 deletions
  1. 40 44
      toolchain/lex/lex.cpp
  2. 23 42
      toolchain/lex/tokenized_buffer.cpp
  3. 32 37
      toolchain/lex/tokenized_buffer.h

+ 40 - 44
toolchain/lex/lex.cpp

@@ -54,7 +54,6 @@ namespace Carbon::Lex {
 class [[clang::internal_linkage]] Lexer {
  public:
   using TokenInfo = TokenizedBuffer::TokenInfo;
-  using LineInfo = TokenizedBuffer::LineInfo;
 
   // Symbolic result of a lexing action. This indicates whether we successfully
   // lexed a token, or whether other lexing actions should be attempted.
@@ -96,18 +95,16 @@ class [[clang::internal_linkage]] Lexer {
   // But because it can, the compiler will flatten this otherwise.
   [[gnu::noinline]] auto MakeLines(llvm::StringRef source_text) -> void;
 
-  auto current_line() -> LineIndex { return LineIndex(line_index_); }
+  auto current_line() -> LineIndex { return line_index_; }
 
-  auto current_line_info() -> LineInfo* {
-    return &buffer_.line_infos_[line_index_];
+  auto current_line_info() -> LineInfo& {
+    return buffer_.line_infos_.Get(line_index_);
   }
 
-  auto next_line() -> LineIndex { return LineIndex(line_index_ + 1); }
+  auto next_line() -> LineIndex { return LineIndex(line_index_.index + 1); }
 
-  auto next_line_info() -> LineInfo* {
-    CARBON_CHECK(line_index_ + 1 <
-                 static_cast<ssize_t>(buffer_.line_infos_.size()));
-    return &buffer_.line_infos_[line_index_ + 1];
+  auto next_line_info() -> LineInfo& {
+    return buffer_.line_infos_.Get(next_line());
   }
 
   // Note when the lexer has encountered whitespace, and the next lexed token
@@ -147,7 +144,7 @@ class [[clang::internal_linkage]] Lexer {
 
   // Starts a new line, skipping whitespace and setting the indent.
   auto AdvanceToLine(llvm::StringRef source_text, ssize_t& position,
-                     ssize_t to_line_index) -> void;
+                     LineIndex to_line_index) -> void;
 
   auto LexHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
       -> void;
@@ -231,7 +228,7 @@ class [[clang::internal_linkage]] Lexer {
 
   TokenizedBuffer buffer_;
 
-  ssize_t line_index_;
+  LineIndex line_index_ = LineIndex::None;
 
   // Tracks whether the lexer has encountered whitespace that will be leading
   // whitespace for the next lexed token. Reset after each token lexed.
@@ -781,8 +778,8 @@ auto Lexer::Lex() && -> TokenizedBuffer {
 auto Lexer::MakeLines(llvm::StringRef source_text) -> void {
   if (source_text.empty()) {
     // Construct a single line for empty input.
-    buffer_.AddLine(TokenizedBuffer::LineInfo(0));
-    line_index_ = 0;
+    buffer_.line_infos_.Add(LineInfo(0));
+    line_index_ = LineIndex(0);
     return;
   }
 
@@ -810,23 +807,23 @@ auto Lexer::MakeLines(llvm::StringRef source_text) -> void {
   while (const char* nl = reinterpret_cast<const char*>(
              memchr(&text[start], '\n', size - start))) {
     ssize_t nl_index = nl - text;
-    buffer_.AddLine(TokenizedBuffer::LineInfo(start));
+    buffer_.line_infos_.Add(LineInfo(start));
     start = nl_index + 1;
   }
   // The last line ends at the end of the file.
-  buffer_.AddLine(TokenizedBuffer::LineInfo(start));
+  buffer_.line_infos_.Add(LineInfo(start));
 
   // If the last line wasn't empty, the file ends with an unterminated line.
   // Add an extra blank line so that we never need to handle the special case
   // of being on the last line inside the lexer and needing to not increment
   // to the next line.
   if (start != size) {
-    buffer_.AddLine(TokenizedBuffer::LineInfo(size));
+    buffer_.line_infos_.Add(LineInfo(size));
   }
 
   // Now that all the infos are allocated, get a fresh pointer to the first
   // info for use while lexing.
-  line_index_ = 0;
+  line_index_ = LineIndex(0);
 }
 
 auto Lexer::SkipHorizontalWhitespace(llvm::StringRef source_text,
@@ -842,14 +839,14 @@ auto Lexer::SkipHorizontalWhitespace(llvm::StringRef source_text,
 }
 
 auto Lexer::AdvanceToLine(llvm::StringRef source_text, ssize_t& position,
-                          ssize_t to_line_index) -> void {
+                          LineIndex to_line_index) -> void {
   CARBON_DCHECK(to_line_index >= line_index_);
   line_index_ = to_line_index;
-  auto* line_info = current_line_info();
-  ssize_t line_start = line_info->start;
+  auto& line_info = current_line_info();
+  ssize_t line_start = line_info.start;
   position = line_start;
   SkipHorizontalWhitespace(source_text, position);
-  line_info->indent = position - line_start;
+  line_info.indent = position - line_start;
 }
 
 auto Lexer::LexHorizontalWhitespace(llvm::StringRef source_text,
@@ -863,7 +860,7 @@ auto Lexer::LexHorizontalWhitespace(llvm::StringRef source_text,
 auto Lexer::LexVerticalWhitespace(llvm::StringRef source_text,
                                   ssize_t& position) -> void {
   NoteWhitespace();
-  AdvanceToLine(source_text, position, line_index_ + 1);
+  AdvanceToLine(source_text, position, next_line());
 }
 
 auto Lexer::LexCR(llvm::StringRef source_text, ssize_t& position) -> void {
@@ -958,8 +955,8 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
   int32_t comment_start = position;
 
   // Any comment must be the only non-whitespace on the line.
-  const auto* line_info = current_line_info();
-  if (LLVM_UNLIKELY(position != line_info->start + line_info->indent)) {
+  const auto line_info = current_line_info();
+  if (LLVM_UNLIKELY(position != line_info.start + line_info.indent)) {
     CARBON_DIAGNOSTIC(TrailingComment, Error,
                       "trailing comments are not permitted");
 
@@ -970,7 +967,7 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     // whitespace, which already is designed to skip over any erroneous text at
     // the end of the line.
     LexVerticalWhitespace(source_text, position);
-    buffer_.AddComment(line_info->indent, comment_start, position);
+    buffer_.AddComment(line_info.indent, comment_start, position);
     return;
   }
 
@@ -981,12 +978,12 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
     llvm::StringRef comment_text = source_text.substr(position);
     if (comment_text.starts_with("//@dump-sem-ir-begin\n")) {
       BeginDumpSemIRRange(comment_text.begin());
-      AdvanceToLine(source_text, position, line_index_ + 1);
+      AdvanceToLine(source_text, position, next_line());
       return;
     }
     if (comment_text.starts_with("//@dump-sem-ir-end\n")) {
       EndDumpSemIRRange(comment_text.begin());
-      AdvanceToLine(source_text, position, line_index_ + 1);
+      AdvanceToLine(source_text, position, next_line());
       return;
     }
 
@@ -999,9 +996,8 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
   }
 
   // Skip over this line.
-  ssize_t line_index = line_index_;
-  ++line_index;
-  position = buffer_.line_infos_[line_index].start;
+  LineIndex line_index = next_line();
+  position = buffer_.line_infos_.Get(line_index).start;
 
   // A very common pattern is a long block of comment lines all with the same
   // indent and comment start. We skip these comment blocks in bulk both for
@@ -1014,16 +1010,16 @@ auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
   //
   // TODO: We should extend this to 32-byte SIMD on platforms with support.
   constexpr int MaxIndent = 13;
-  const int indent = line_info->indent;
-  const ssize_t first_line_start = line_info->start;
+  const int indent = line_info.indent;
+  const ssize_t first_line_start = line_info.start;
   ssize_t prefix_size = indent + (is_valid_after_slashes ? 3 : 2);
   auto skip_to_next_line = [this, indent, &line_index, &position] {
     // We're guaranteed to have a line here even on a comment on the last line
     // as we ensure there is an empty line structure at the end of every file.
-    ++line_index;
-    auto* next_line_info = &buffer_.line_infos_[line_index];
-    next_line_info->indent = indent;
-    position = next_line_info->start;
+    ++line_index.index;
+    auto& next_line_info = buffer_.line_infos_.Get(line_index);
+    next_line_info.indent = indent;
+    position = next_line_info.start;
   };
   if (CARBON_USE_SIMD &&
       position + 16 < static_cast<ssize_t>(source_text.size()) &&
@@ -1142,15 +1138,15 @@ auto Lexer::LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
 
   // Capture the position before we step past the token.
   int32_t byte_offset = position;
-  int string_column = byte_offset - current_line_info()->start;
+  int string_column = byte_offset - current_line_info().start;
   ssize_t literal_size = literal->text().size();
   position += literal_size;
 
   // Update line and column information.
   if (literal->is_multi_line()) {
-    while (next_line_info()->start < position) {
-      ++line_index_;
-      current_line_info()->indent = string_column;
+    while (next_line_info().start < position) {
+      ++line_index_.index;
+      current_line_info().indent = string_column;
     }
     // Note that we've updated the current line at this point, but
     // `set_indent_` is already true from above. That remains correct as the
@@ -1458,8 +1454,8 @@ auto Lexer::LexFileStart(llvm::StringRef source_text, ssize_t& position)
 
   // Also skip any horizontal whitespace and record the indentation of the
   // first line.
-  CARBON_CHECK(current_line_info()->start == 0);
-  AdvanceToLine(source_text, position, /*to_line_index=*/0);
+  CARBON_CHECK(current_line_info().start == 0);
+  AdvanceToLine(source_text, position, /*to_line_index=*/LineIndex(0));
 }
 
 auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
@@ -1470,8 +1466,8 @@ auto Lexer::LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void {
   // as separators in case of a missing newline on the last line. We do this
   // here instead of detecting this when we see the newline to avoid more
   // conditions along that fast path.
-  if (position == current_line_info()->start && line_index_ != 0) {
-    --line_index_;
+  if (position == current_line_info().start && line_index_.index != 0) {
+    --line_index_.index;
     --position;
   }
 

+ 23 - 42
toolchain/lex/tokenized_buffer.cpp

@@ -33,7 +33,8 @@ auto TokenizedBuffer::GetLineNumber(TokenIndex token) const -> int {
 
 auto TokenizedBuffer::GetColumnNumber(TokenIndex token) const -> int {
   const auto& token_info = GetTokenInfo(token);
-  const auto& line_info = GetLineInfo(FindLineIndex(token_info.byte_offset()));
+  const auto& line_info =
+      line_infos_.Get(FindLineIndex(token_info.byte_offset()));
   return token_info.byte_offset() - line_info.start + 1;
 }
 
@@ -166,19 +167,8 @@ auto TokenizedBuffer::IsRecoveryToken(TokenIndex token) const -> bool {
   return recovery_tokens_[token.index];
 }
 
-auto TokenizedBuffer::GetNextLine(LineIndex line) const -> LineIndex {
-  LineIndex next(line.index + 1);
-  CARBON_DCHECK(static_cast<size_t>(next.index) < line_infos_.size());
-  return next;
-}
-
-auto TokenizedBuffer::GetPrevLine(LineIndex line) const -> LineIndex {
-  CARBON_CHECK(line.index > 0);
-  return LineIndex(line.index - 1);
-}
-
 auto TokenizedBuffer::GetIndentColumnNumber(LineIndex line) const -> int {
-  return GetLineInfo(line).indent + 1;
+  return line_infos_.Get(line).indent + 1;
 }
 
 auto TokenizedBuffer::PrintWidths::Widen(const PrintWidths& widths) -> void {
@@ -325,9 +315,11 @@ auto TokenizedBuffer::PrintToken(llvm::raw_ostream& output_stream,
 // This takes advantage of the lines being sorted by their starting byte offsets
 // to do a binary search for the line that contains the provided offset.
 auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
-  CARBON_DCHECK(!line_infos_.empty());
-  const auto* line_it =
-      llvm::partition_point(line_infos_, [byte_offset](LineInfo line_info) {
+  CARBON_DCHECK(line_infos_.size() > 0);
+
+  auto line_range = line_infos_.values();
+  auto line_it =
+      llvm::partition_point(line_range, [byte_offset](LineInfo line_info) {
         return line_info.start <= byte_offset;
       });
   --line_it;
@@ -335,49 +327,36 @@ auto TokenizedBuffer::FindLineIndex(int32_t byte_offset) const -> LineIndex {
   // If this isn't the first line but it starts past the end of the source, then
   // this is a synthetic line added for simplicity of lexing. Step back one
   // further to find the last non-synthetic line.
-  if (line_it != line_infos_.begin() &&
+  if (line_it != line_range.begin() &&
       line_it->start == static_cast<int32_t>(source_->text().size())) {
     --line_it;
   }
   CARBON_DCHECK(line_it->start <= byte_offset);
-  return LineIndex(line_it - line_infos_.begin());
-}
-
-auto TokenizedBuffer::GetLineInfo(LineIndex line) -> LineInfo& {
-  return line_infos_[line.index];
-}
-
-auto TokenizedBuffer::GetLineInfo(LineIndex line) const -> const LineInfo& {
-  return line_infos_[line.index];
-}
-
-auto TokenizedBuffer::AddLine(LineInfo info) -> LineIndex {
-  line_infos_.push_back(info);
-  return LineIndex(static_cast<int>(line_infos_.size()) - 1);
+  return LineIndex(line_it - line_range.begin());
 }
 
 auto TokenizedBuffer::IsAfterComment(TokenIndex token,
                                      CommentIndex comment_index) const -> bool {
-  const auto& comment_data = comments_[comment_index.index];
+  const auto& comment_data = comments_.Get(comment_index);
   return GetTokenInfo(token).byte_offset() > comment_data.start;
 }
 
 auto TokenizedBuffer::GetCommentText(CommentIndex comment_index) const
     -> llvm::StringRef {
-  const auto& comment_data = comments_[comment_index.index];
+  const auto& comment_data = comments_.Get(comment_index);
   return source_->text().substr(comment_data.start, comment_data.length);
 }
 
 auto TokenizedBuffer::AddComment(int32_t indent, int32_t start, int32_t end)
     -> void {
-  if (!comments_.empty()) {
-    auto& comment = comments_.back();
+  if (comments_.size() > 0) {
+    auto& comment = comments_.Get(CommentIndex(comments_.size() - 1));
     if (comment.start + comment.length + indent == start) {
       comment.length = end - comment.start;
       return;
     }
   }
-  comments_.push_back({.start = start, .length = end - start});
+  comments_.Add({.start = start, .length = end - start});
 }
 
 auto TokenizedBuffer::CollectMemUsage(MemUsage& mem_usage,
@@ -394,23 +373,25 @@ auto TokenizedBuffer::SourcePointerToDiagnosticLoc(const char* loc) const
                "location not within buffer");
   int32_t offset = loc - source_->text().begin();
 
+  auto line_range = line_infos_.values();
+
   // Find the first line starting after the given location.
-  const auto* next_line_it = llvm::partition_point(
-      line_infos_,
+  const auto next_line_it = llvm::partition_point(
+      line_range,
       [offset](const LineInfo& line) { return line.start <= offset; });
 
   // Step back one line to find the line containing the given position.
-  CARBON_CHECK(next_line_it != line_infos_.begin(),
+  CARBON_CHECK(next_line_it != line_range.begin(),
                "location precedes the start of the first line");
-  const auto* line_it = std::prev(next_line_it);
-  int line_number = line_it - line_infos_.begin();
+  const auto line_it = std::prev(next_line_it);
+  int line_number = line_it - line_range.begin();
   int column_number = offset - line_it->start;
 
   // Grab the line from the buffer by slicing from this line to the next
   // minus the newline. When on the last line, instead use the start to the end
   // of the buffer.
   llvm::StringRef text = source_->text();
-  llvm::StringRef line = next_line_it != line_infos_.end()
+  llvm::StringRef line = next_line_it != line_range.end()
                              ? text.slice(line_it->start, next_line_it->start)
                              : text.substr(line_it->start);
 

+ 32 - 37
toolchain/lex/tokenized_buffer.h

@@ -26,6 +26,18 @@ namespace Carbon::Lex {
 
 class TokenizedBuffer;
 
+struct LineInfo {
+  explicit LineInfo(int32_t start) : start(start), indent(0) {}
+
+  // Zero-based byte offset of the start of the line within the source buffer
+  // provided.
+  int32_t start;
+
+  // The byte offset from the start of the line of the first non-whitespace
+  // character.
+  int32_t indent;
+};
+
 // A lightweight handle to a lexed line in a `TokenizedBuffer`.
 //
 // `LineIndex` objects are designed to be passed by value, not reference or
@@ -38,6 +50,8 @@ class TokenizedBuffer;
 //
 // All other APIs to query a `LineIndex` are on the `TokenizedBuffer`.
 struct LineIndex : public IndexBase<LineIndex> {
+  using ValueType = LineInfo;
+
   static constexpr llvm::StringLiteral Label = "line";
   static const LineIndex None;
   using IndexBase::IndexBase;
@@ -45,8 +59,24 @@ struct LineIndex : public IndexBase<LineIndex> {
 
 constexpr LineIndex LineIndex::None(NoneIndex);
 
+// A comment, which can be a block of lines. These are tracked separately from
+// tokens because they don't affect parse; if they were part of tokens, we'd
+// need more general special-casing within token logic.
+//
+// Note that `CommentInfo` is used for an API to expose the comment.
+struct CommentData {
+  // Zero-based byte offset of the start of the comment within the source
+  // buffer provided.
+  int32_t start;
+
+  // The comment's length.
+  int32_t length;
+};
+
 // Indices for comments within the buffer.
 struct CommentIndex : public IndexBase<CommentIndex> {
+  using ValueType = CommentData;
+
   static constexpr llvm::StringLiteral Label = "comment";
   static const CommentIndex None;
   using IndexBase::IndexBase;
@@ -146,12 +176,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   // Returns the 1-based indentation column number.
   auto GetIndentColumnNumber(LineIndex line) const -> int;
 
-  // Returns the next line handle.
-  auto GetNextLine(LineIndex line) const -> LineIndex;
-
-  // Returns the previous line handle.
-  auto GetPrevLine(LineIndex line) const -> LineIndex;
-
   auto GetByteOffset(TokenIndex token) const -> int32_t {
     return GetTokenInfo(token).byte_offset();
   }
@@ -436,32 +460,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
   static_assert(sizeof(TokenInfo) == 8,
                 "Expected `TokenInfo` to pack to an 8-byte structure.");
 
-  // A comment, which can be a block of lines. These are tracked separately from
-  // tokens because they don't affect parse; if they were part of tokens, we'd
-  // need more general special-casing within token logic.
-  //
-  // Note that `CommentInfo` is used for an API to expose the comment.
-  struct CommentData {
-    // Zero-based byte offset of the start of the comment within the source
-    // buffer provided.
-    int32_t start;
-
-    // The comment's length.
-    int32_t length;
-  };
-
-  struct LineInfo {
-    explicit LineInfo(int32_t start) : start(start), indent(0) {}
-
-    // Zero-based byte offset of the start of the line within the source buffer
-    // provided.
-    int32_t start;
-
-    // The byte offset from the start of the line of the first non-whitespace
-    // character.
-    int32_t indent;
-  };
-
   // The constructor is merely responsible for trivial initialization of
   // members. A working object of this type is built with `Lex::Lex` so that its
   // return can indicate if an error was encountered while lexing.
@@ -471,9 +469,6 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
       : value_stores_(&value_stores), source_(&source) {}
 
   auto FindLineIndex(int32_t byte_offset) const -> LineIndex;
-  auto GetLineInfo(LineIndex line) -> LineInfo&;
-  auto GetLineInfo(LineIndex line) const -> const LineInfo&;
-  auto AddLine(LineInfo info) -> LineIndex;
   auto GetTokenInfo(TokenIndex token) -> TokenInfo&;
   auto GetTokenInfo(TokenIndex token) const -> const TokenInfo&;
   auto AddToken(TokenInfo info) -> TokenIndex;
@@ -493,10 +488,10 @@ class TokenizedBuffer : public Printable<TokenizedBuffer> {
 
   llvm::SmallVector<TokenInfo> token_infos_;
 
-  llvm::SmallVector<LineInfo> line_infos_;
+  ValueStore<LineIndex> line_infos_;
 
   // Comments in the file.
-  llvm::SmallVector<CommentData> comments_;
+  ValueStore<CommentIndex> comments_;
 
   // A range of tokens marked by `//@dump-semir-[begin|end]`.
   //