소스 검색

Validate source text size and fix empty buffer bugs. (#1113)

There's currently a bug with empty files, in that it initializes SourceBuffer with an invalid StringRef that results in a crash. That got me looking at the std::optional TODO, but the issue is that there are really three states:

- Buffered
- mmapped (not buffered)
- Moved out of (no longer initialized)

Technically an optional could work if we initialize the buffer on move out, indicating the mmap is gone. But the mode setup felt better to me.

And then this also adds the size check. Which is really how I started looking at this.
Jon Meow 4 년 전
부모
커밋
8a2ef22c2a

+ 1 - 1
toolchain/lexer/tokenized_buffer_fuzzer.cpp

@@ -28,7 +28,7 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
   auto source = SourceBuffer::CreateFromText(
       llvm::StringRef(reinterpret_cast<const char*>(data), size));
 
-  auto buffer = TokenizedBuffer::Lex(source, NullDiagnosticConsumer());
+  auto buffer = TokenizedBuffer::Lex(*source, NullDiagnosticConsumer());
   if (buffer.HasErrors()) {
     return 0;
   }

+ 2 - 1
toolchain/lexer/tokenized_buffer_test.cpp

@@ -32,7 +32,8 @@ using ::testing::StrEq;
 class LexerTest : public ::testing::Test {
  protected:
   auto GetSourceBuffer(llvm::Twine text) -> SourceBuffer& {
-    source_storage.push_back(SourceBuffer::CreateFromText(text.str()));
+    source_storage.push_back(
+        std::move(*SourceBuffer::CreateFromText(text.str())));
     return source_storage.back();
   }
 

+ 1 - 1
toolchain/parser/parse_tree_fuzzer.cpp

@@ -28,7 +28,7 @@ extern "C" int LLVMFuzzerTestOneInput(const unsigned char* data,
       llvm::StringRef(reinterpret_cast<const char*>(data), size));
 
   // Lex the input.
-  auto tokens = TokenizedBuffer::Lex(source, NullDiagnosticConsumer());
+  auto tokens = TokenizedBuffer::Lex(*source, NullDiagnosticConsumer());
   if (tokens.HasErrors()) {
     return 0;
   }

+ 2 - 1
toolchain/parser/parse_tree_test.cpp

@@ -32,7 +32,8 @@ using ::testing::StrEq;
 class ParseTreeTest : public ::testing::Test {
  protected:
   auto GetSourceBuffer(llvm::Twine t) -> SourceBuffer& {
-    source_storage.push_front(SourceBuffer::CreateFromText(t.str()));
+    source_storage.push_front(
+        std::move(*SourceBuffer::CreateFromText(t.str())));
     return source_storage.front();
   }
 

+ 1 - 1
toolchain/semantics/semantics_test.cpp

@@ -20,7 +20,7 @@ namespace {
 class ParseTreeTest : public ::testing::Test {
  protected:
   auto Analyze(llvm::Twine t) -> Semantics {
-    source_buffer.emplace(SourceBuffer::CreateFromText(t.str()));
+    source_buffer.emplace(std::move(*SourceBuffer::CreateFromText(t.str())));
     tokenized_buffer = TokenizedBuffer::Lex(*source_buffer, consumer);
     parse_tree = ParseTree::Parse(*tokenized_buffer, consumer);
     return Semantics::Analyze(*parse_tree, consumer);

+ 54 - 16
toolchain/source/source_buffer.cpp

@@ -11,16 +11,34 @@
 
 #include <cerrno>
 #include <cstdint>
+#include <limits>
+#include <optional>
 #include <system_error>
+#include <variant>
 
 #include "common/check.h"
 #include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/Error.h"
 
 namespace Carbon {
 
+// Verifies that the content size is within limits.
+static auto CheckContentSize(int64_t size) -> llvm::Error {
+  if (size < std::numeric_limits<int32_t>::max()) {
+    return llvm::Error::success();
+  }
+  return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                 "Input too large!");
+}
+
 auto SourceBuffer::CreateFromText(llvm::Twine text, llvm::StringRef filename)
-    -> SourceBuffer {
-  return SourceBuffer(filename, text.str());
+    -> llvm::Expected<SourceBuffer> {
+  std::string buffer = text.str();
+  auto size_check = CheckContentSize(buffer.size());
+  if (size_check) {
+    return size_check;
+  }
+  return SourceBuffer(filename.str(), std::move(buffer));
 }
 
 static auto ErrnoToError(int errno_value) -> llvm::Error {
@@ -30,10 +48,11 @@ static auto ErrnoToError(int errno_value) -> llvm::Error {
 
 auto SourceBuffer::CreateFromFile(llvm::StringRef filename)
     -> llvm::Expected<SourceBuffer> {
-  SourceBuffer buffer(filename);
+  // Add storage to ensure there's a nul-terminator for open().
+  std::string filename_str = filename.str();
 
   errno = 0;
-  int file_descriptor = open(buffer.filename_.c_str(), O_RDONLY);
+  int file_descriptor = open(filename_str.c_str(), O_RDONLY);
   if (file_descriptor == -1) {
     return ErrnoToError(errno);
   }
@@ -50,8 +69,12 @@ auto SourceBuffer::CreateFromFile(llvm::StringRef filename)
 
   int64_t size = stat_buffer.st_size;
   if (size == 0) {
-    // Nothing to do for an empty file.
-    return {std::move(buffer)};
+    // Rather than opening an empty file, create an empty buffer.
+    return SourceBuffer(std::move(filename_str), std::string());
+  }
+  auto size_check = CheckContentSize(size);
+  if (size_check) {
+    return size_check;
   }
 
   errno = 0;
@@ -78,24 +101,39 @@ auto SourceBuffer::CreateFromFile(llvm::StringRef filename)
     return ErrnoToError(errno);
   }
 
-  buffer.text_ = llvm::StringRef(static_cast<const char*>(mapped_text), size);
-  CHECK(!buffer.text_.empty())
+  return SourceBuffer(
+      std::move(filename_str),
+      llvm::StringRef(static_cast<const char*>(mapped_text), size));
+}
+
+SourceBuffer::SourceBuffer(SourceBuffer&& arg) noexcept
+    // Sets Uninitialized to ensure the input doesn't release mmapped data.
+    : content_mode_(
+          std::exchange(arg.content_mode_, ContentMode::Uninitialized)),
+      filename_(std::move(arg.filename_)),
+      text_storage_(std::move(arg.text_storage_)),
+      text_(content_mode_ == ContentMode::Owned ? text_storage_ : arg.text_) {}
+
+SourceBuffer::SourceBuffer(std::string filename, std::string text)
+    : content_mode_(ContentMode::Owned),
+      filename_(std::move(filename)),
+      text_storage_(std::move(text)),
+      text_(text_storage_) {}
+
+SourceBuffer::SourceBuffer(std::string filename, llvm::StringRef text)
+    : content_mode_(ContentMode::MMapped),
+      filename_(std::move(filename)),
+      text_(text) {
+  CHECK(!text.empty())
       << "Must not have an empty text when we have mapped data from a file!";
-  return {std::move(buffer)};
 }
 
 SourceBuffer::~SourceBuffer() {
-  if (is_string_rep_) {
-    string_storage_.~decltype(string_storage_)();
-    return;
-  }
-
-  if (!text_.empty()) {
+  if (content_mode_ == ContentMode::MMapped) {
     errno = 0;
     int result =
         munmap(const_cast<void*>(static_cast<const void*>(text_.data())),
                text_.size());
-    (void)result;
     CHECK(result != -1) << "Unmapping text failed!";
   }
 }

+ 14 - 37
toolchain/source/source_buffer.h

@@ -35,34 +35,18 @@ class SourceBuffer {
  public:
   static auto CreateFromText(llvm::Twine text,
                              llvm::StringRef filename = "/text")
-      -> SourceBuffer;
+      -> llvm::Expected<SourceBuffer>;
   static auto CreateFromFile(llvm::StringRef filename)
       -> llvm::Expected<SourceBuffer>;
 
   // Use one of the factory functions above to create a source buffer.
   SourceBuffer() = delete;
 
-  // Cannot copy as there may be non-trivial owned file data, see the class
+  // Cannot copy as there may be non-trivial owned file data; see the class
   // comment for details.
   SourceBuffer(const SourceBuffer& arg) = delete;
 
-  SourceBuffer(SourceBuffer&& arg) noexcept
-      : filename_(std::move(arg.filename_)),
-        text_(arg.text_),
-        is_string_rep_(arg.is_string_rep_) {
-    // The easy case in when we don't need to transfer an allocated string
-    // representation.
-    if (!arg.is_string_rep_) {
-      // Take ownership of a non-string representation by clearing its text.
-      arg.text_ = llvm::StringRef();
-      return;
-    }
-
-    // If the argument is using a string rep we need to move that storage over
-    // and recreate our text `StringRef` to point at our storage.
-    new (&string_storage_) std::string(std::move(arg.string_storage_));
-    text_ = string_storage_;
-  }
+  SourceBuffer(SourceBuffer&& arg) noexcept;
 
   ~SourceBuffer();
 
@@ -71,28 +55,21 @@ class SourceBuffer {
   [[nodiscard]] auto Text() const -> llvm::StringRef { return text_; }
 
  private:
-  SourceBuffer(llvm::StringRef fake_filename, std::string buffer_text)
-      : filename_(fake_filename.str()),
-        is_string_rep_(true),
-        string_storage_(std::move(buffer_text)) {
-    text_ = string_storage_;
-  }
+  enum class ContentMode {
+    Uninitialized,
+    MMapped,
+    Owned,
+  };
 
-  explicit SourceBuffer(llvm::StringRef filename)
-      : filename_(filename.str()), text_(), is_string_rep_(false) {}
+  // Constructor for mmapped content.
+  SourceBuffer(std::string filename, llvm::StringRef text);
+  // Constructor for owned content.
+  SourceBuffer(std::string filename, std::string text);
 
+  ContentMode content_mode_;
   std::string filename_;
-
+  std::string text_storage_;
   llvm::StringRef text_;
-
-  bool is_string_rep_;
-
-  // We use a transparent union to avoid constructing the storage.
-  // FIXME: We should replace this and the boolean with an optional which would
-  // be much simpler.
-  union {
-    std::string string_storage_;
-  };
 };
 
 }  // namespace Carbon

+ 21 - 8
toolchain/source/source_buffer_test.cpp

@@ -15,17 +15,17 @@ namespace Carbon::Testing {
 namespace {
 
 TEST(SourceBufferTest, StringRep) {
-  SourceBuffer buffer =
-      SourceBuffer::CreateFromText(llvm::Twine("Hello") + " World");
-
-  EXPECT_EQ("/text", buffer.Filename());
-  EXPECT_EQ("Hello World", buffer.Text());
+  auto buffer = SourceBuffer::CreateFromText(llvm::Twine("Hello") + " World");
+  EXPECT_EQ("/text", buffer->Filename());
+  EXPECT_EQ("Hello World", buffer->Text());
+}
 
+TEST(SourceBufferText, StringRepWithFilename) {
   // Give a custom filename.
-  auto buffer2 =
+  auto buffer =
       SourceBuffer::CreateFromText("Hello World Again!", "/custom/text");
-  EXPECT_EQ("/custom/text", buffer2.Filename());
-  EXPECT_EQ("Hello World Again!", buffer2.Text());
+  EXPECT_EQ("/custom/text", buffer->Filename());
+  EXPECT_EQ("Hello World Again!", buffer->Text());
 }
 
 auto CreateTestFile(llvm::StringRef text) -> std::string {
@@ -58,5 +58,18 @@ TEST(SourceBufferTest, FileRep) {
   EXPECT_EQ("Hello World", buffer.Text());
 }
 
+TEST(SourceBufferTest, FileRepEmpty) {
+  auto test_file_path = CreateTestFile("");
+
+  auto expected_buffer = SourceBuffer::CreateFromFile(test_file_path);
+  ASSERT_TRUE(static_cast<bool>(expected_buffer))
+      << "Error message: " << toString(expected_buffer.takeError());
+
+  SourceBuffer& buffer = *expected_buffer;
+
+  EXPECT_EQ(test_file_path, buffer.Filename());
+  EXPECT_EQ("", buffer.Text());
+}
+
 }  // namespace
 }  // namespace Carbon::Testing