sammccall updated this revision to Diff 167888.
sammccall added a comment.

latest snapshot


Repository:
  rC Clang

https://reviews.llvm.org/D52549

Files:
  include/clang/Basic/VirtualFileSystem.h
  lib/Basic/AvoidStatsVFS.cpp
  lib/Basic/CMakeLists.txt
  lib/Basic/VirtualFileSystem.cpp
  unittests/Basic/VirtualFileSystemTest.cpp

Index: unittests/Basic/VirtualFileSystemTest.cpp
===================================================================
--- unittests/Basic/VirtualFileSystemTest.cpp
+++ unittests/Basic/VirtualFileSystemTest.cpp
@@ -1616,3 +1616,139 @@
 
   EXPECT_EQ(3, NumDiagnostics);
 }
+
+class AvoidStatsVFSTest : public ::testing::Test {
+ protected:
+  struct Counts {
+    unsigned Status = 0;
+    unsigned OpenFile = 0;
+    unsigned ListDir = 0;
+  };
+  Counts Outer, Inner;
+  IntrusiveRefCntPtr<vfs::InMemoryFileSystem> MemFS;
+
+  IntrusiveRefCntPtr<vfs::FileSystem> reset() {
+    Outer = {};
+    Inner = {};
+    MemFS = new vfs::InMemoryFileSystem();
+    return new CountingFS(
+        Outer, vfs::avoidStats(new CountingFS(Inner, MemFS)).release());
+  }
+
+  void touch(const Twine &Path) {
+    MemFS->addFile(Path, 0, MemoryBuffer::getMemBuffer(""));
+  }
+
+ private:
+   class CountingFS : public vfs::ProxyFileSystem {
+     Counts &C;
+
+   public:
+     CountingFS(Counts &C, IntrusiveRefCntPtr<vfs::FileSystem> Base)
+         : ProxyFileSystem(Base), C(C) {}
+
+     llvm::ErrorOr<std::unique_ptr<vfs::File>>
+     openFileForRead(const Twine &Path) override {
+       ++C.OpenFile;
+       return ProxyFileSystem::openFileForRead(Path);
+     }
+
+     vfs::directory_iterator dir_begin(const Twine &Dir,
+                                       std::error_code &EC) override {
+       ++C.ListDir;
+       return ProxyFileSystem::dir_begin(Dir, EC);
+     }
+
+     llvm::ErrorOr<vfs::Status> status(const Twine &Path) override {
+       ++C.Status;
+       return ProxyFileSystem::status(Path);
+     }
+   };
+};
+
+TEST_F(AvoidStatsVFSTest, MissingFileNoStat) {
+  auto FS = reset();
+  touch("/foo/a");
+  touch("/foo/c");
+  touch("/foo/f");
+
+  EXPECT_FALSE(FS->status("/foo/a").getError()); // Just stat, /foo wanted once.
+  EXPECT_TRUE(FS->status("/foo/b").getError()); // List /foo, b known missing
+  EXPECT_FALSE(FS->status("/foo/c").getError()); // c exists, must stat
+  EXPECT_TRUE(FS->status("/foo/d").getError()); // d known missing
+  EXPECT_TRUE(FS->openFileForRead("/foo/e").getError()); // e known missing
+  EXPECT_FALSE(FS->openFileForRead("/foo/f").getError()); // f exists, must open
+
+  EXPECT_EQ(Outer.Status, 4u);
+  EXPECT_EQ(Outer.OpenFile, 2u);
+  EXPECT_EQ(Outer.ListDir, 0u);
+  EXPECT_EQ(Inner.Status, 2u);
+  EXPECT_EQ(Inner.OpenFile, 1u);
+  EXPECT_EQ(Inner.ListDir, 1u);
+}
+
+TEST_F(AvoidStatsVFSTest, ParentDirsMissing) {
+  auto FS = reset();
+  touch("/a/b/z");
+
+  EXPECT_TRUE(FS->status("/a/b/1").getError()); // Just stat.
+  EXPECT_TRUE(FS->status("/a/b/2").getError()); // List /a/b
+  EXPECT_TRUE(FS->status("/a/b/3").getError()); // Known missing
+  EXPECT_TRUE(FS->status("/a/b/c/d").getError()); // Known missing
+  EXPECT_TRUE(FS->status("/a/b/z/d").getError()); // Parent is a file
+  EXPECT_EQ(Outer.Status, 5u);
+  EXPECT_EQ(Outer.ListDir, 0u);
+  EXPECT_EQ(Inner.Status, 1u);
+  EXPECT_EQ(Inner.ListDir, 1u);
+}
+
+TEST_F(AvoidStatsVFSTest, HugeDir) {
+  auto FS = reset();
+  for (int I = 0; I < 10000; ++I)
+    touch("/big/" + Twine(I));
+
+  EXPECT_FALSE(FS->status("/big/1").getError());    // Just stat.
+  EXPECT_FALSE(FS->status("/big/9998").getError()); // Try to list, fail, stat.
+  EXPECT_FALSE(FS->status("/big/9999").getError()); // stat, don't list
+  EXPECT_TRUE(FS->status("/big/10001").getError());
+  EXPECT_TRUE(FS->status("/big/x/10001").getError());
+  EXPECT_TRUE(FS->status("/big/1/10001").getError()); // missing parent in cache
+
+  EXPECT_EQ(Outer.Status, 6u);
+  EXPECT_EQ(Outer.ListDir, 0u);
+  EXPECT_EQ(Inner.Status, 5u);
+  EXPECT_EQ(Inner.ListDir, 1u);
+}
+
+TEST_F(AvoidStatsVFSTest, Duplicate) {
+  // Stat + stat on a missing file is one operation.
+  auto FS = reset();
+  EXPECT_TRUE(FS->status("/x").getError()); // Just stat.
+  EXPECT_TRUE(FS->status("/x").getError()); // No need to stat or list again.
+  EXPECT_EQ(Inner.Status, 1u);
+  EXPECT_EQ(Inner.ListDir, 0u);
+
+  // Same with open + open.
+  FS = reset();
+  EXPECT_TRUE(FS->openFileForRead("/x").getError());
+  EXPECT_TRUE(FS->openFileForRead("/x").getError());
+  EXPECT_EQ(Inner.OpenFile, 1u);
+  EXPECT_EQ(Inner.ListDir, 0u);
+
+  // And stat + open.
+  FS = reset();
+  EXPECT_TRUE(FS->status("/x").getError());
+  EXPECT_TRUE(FS->openFileForRead("/x").getError());
+  EXPECT_EQ(Inner.Status, 1u);
+  EXPECT_EQ(Inner.OpenFile, 0u);
+  EXPECT_EQ(Inner.ListDir, 0u);
+
+  // But open + stat is two: as far as open() knows, it might be a dir.
+  // We list the directory (second attempt) and see that it's not.
+  FS = reset();
+  EXPECT_TRUE(FS->openFileForRead("/x").getError());
+  EXPECT_TRUE(FS->status("/x").getError());
+  EXPECT_EQ(Inner.OpenFile, 1u);
+  EXPECT_EQ(Inner.Status, 0u);
+  EXPECT_EQ(Inner.ListDir, 1u);
+}
Index: lib/Basic/VirtualFileSystem.cpp
===================================================================
--- lib/Basic/VirtualFileSystem.cpp
+++ lib/Basic/VirtualFileSystem.cpp
@@ -303,11 +303,6 @@
   return llvm::sys::fs::real_path(Path, Output);
 }
 
-IntrusiveRefCntPtr<FileSystem> vfs::getRealFileSystem() {
-  static IntrusiveRefCntPtr<FileSystem> FS = new RealFileSystem();
-  return FS;
-}
-
 namespace {
 
 class RealFSDirIter : public clang::vfs::detail::DirIterImpl {
@@ -2141,3 +2136,9 @@
 
   return *this;
 }
+
+IntrusiveRefCntPtr<FileSystem> vfs::getRealFileSystem() {
+  static IntrusiveRefCntPtr<FileSystem> FS(
+      avoidStats(new RealFileSystem()).release());
+  return FS;
+}
Index: lib/Basic/CMakeLists.txt
===================================================================
--- lib/Basic/CMakeLists.txt
+++ lib/Basic/CMakeLists.txt
@@ -46,6 +46,7 @@
 
 add_clang_library(clangBasic
   Attributes.cpp
+  AvoidStatsVFS.cpp
   Builtins.cpp
   CharInfo.cpp
   Cuda.cpp
Index: lib/Basic/AvoidStatsVFS.cpp
===================================================================
--- /dev/null
+++ lib/Basic/AvoidStatsVFS.cpp
@@ -0,0 +1,368 @@
+//===- AvoidStatsVFS.cpp - Implements a stat-reducing VFS wrapper ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// We implement a VFS wrapper that uses directory listings to prune status() and
+// open() operations for files that *do not exist*.
+//
+// These operations are common in clang because of include paths.
+// With an include path of {A, B, ...}, an #include <foo/bar> directive results
+// in attempts to open or stat {A/foo/bar, B/foo/bar, ...}.
+//
+// This is expensive because this typically takes one syscall per path, so we
+// have O(NumIncludes * IncludePathLen) syscalls.
+//
+// To optimize this, we can list parent directories such as A/.
+// If A only contains {config.h}, attempts to open A/foo/bar, A/foo/baz, A/qux
+// can all fail instantly.
+// Listing a directory tends to be ~4 syscalls, and in practice most
+// missing files can be recognized by looking at the same few directories.
+// In practice the number of syscalls is O(NumIncludes + IncludePathLen).
+//
+// Our constant factor is higher, but this improves performance for large TUs
+// with many include paths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/VirtualFileSystem.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Path.h"
+#include <mutex>
+#define DEBUG_TYPE "AvoidStatsVFS"
+
+namespace clang {
+namespace vfs {
+namespace {
+using namespace llvm;
+namespace path = llvm::sys::path;
+using llvm::sys::fs::file_type;
+
+class AvoidStatVFS : public ProxyFileSystem {
+public:
+  // In fact we only read a directory once we've wanted its contents several
+  // times. This avoids a common pattern:
+  //   - we read files under /some/long/path/...
+  //   - we don't read anything else under /some/long/...
+  //   - we readdir(/some/long), which tells us that /some/long/path is a dir
+  //   - we still need to readdir(/some/long/path) to get its contents
+  //   - that also tells us it's a directory, so readdir(/some/long) was useless
+  // We refuse to do IO to understand each dir until we reach this threshold.
+  constexpr static unsigned ReadDirThreshold = 2;
+
+  AvoidStatVFS(IntrusiveRefCntPtr<FileSystem> Base)
+      : ProxyFileSystem(std::move(Base)) {}
+
+  llvm::ErrorOr<std::unique_ptr<vfs::File>>
+  openFileForRead(const Twine &Path) override {
+    LLVM_DEBUG(dump("before open " + Path));
+    auto G = make_scope_exit([&] { LLVM_DEBUG(dump("after open " + Path)); });
+
+    auto NormPath = normalizePath(Path);
+    {
+      std::lock_guard<std::mutex> Lock(CacheMu);
+      if (!mayBeFile</*OrDir=*/false>(NormPath))
+        return std::make_error_code(std::errc::no_such_file_or_directory);
+    }
+    LLVM_DEBUG(llvm::dbgs() << "passthru open " << Path << "\n");
+    auto Result = ProxyFileSystem::openFileForRead(Path);
+
+    std::lock_guard<std::mutex> Lock(CacheMu);
+    recordState(NormPath, Result ? File : MissingOrDir);
+    return Result;
+  }
+
+  llvm::ErrorOr<Status> status(const Twine &Path) override {
+    LLVM_DEBUG(dump("before status " + Path));
+    auto G = make_scope_exit([&] { LLVM_DEBUG(dump("after status " + Path)); });
+
+    auto NormPath = normalizePath(Path);
+    {
+      std::lock_guard<std::mutex> Lock(CacheMu);
+      if (!mayBeFile</*OrDir=*/true>(NormPath))
+        return std::make_error_code(std::errc::no_such_file_or_directory);
+    }
+    LLVM_DEBUG(llvm::dbgs() << "passthru status " << Path << "\n");
+    auto Result = ProxyFileSystem::status(Path);
+
+    std::lock_guard<std::mutex> Lock(CacheMu);
+    recordState(NormPath, Result
+                              ? (Result->getType() == file_type::directory_file
+                                     ? DirUnknownChildren
+                                     : File)
+                              : Missing);
+    return Result;
+  }
+
+private:
+  // What we know about a path, which may be partial information.
+  // Values are ordered: higher values have more info and "upgrade" lower ones.
+  enum PathState {
+    // Ambiguous states.
+    Unknown,
+    MissingOrDir,
+    MissingOrFile,
+    // Type known, but not all relevant details.
+    DirUnknownChildren,
+    DirUnenumerable, // unknown children, but don't try to enumerate again!
+    // Complete information.
+    DirKnownChildren,
+    File,
+    Missing,
+  };
+
+  // The cache always uses absolute paths with ./ and ../ segments removed.
+  SmallString<256> normalizePath(const Twine &Path) {
+    SmallString<256> Result;
+    Path.toVector(Result);
+    makeAbsolute(Result);
+    path::remove_dots(Result, /*remove_dot_dot=*/true);
+    return Result;
+  }
+
+  // Helpers used in LLVM_DEBUG() logging.
+#ifndef NDEBUG
+  void dump(const Twine &When) {
+    llvm::dbgs() << "AvoidStatsVFS " << When << ":\n";
+    std::vector<std::pair<StringRef, PathState>> V;
+    for (const auto& E : Cache)
+      V.emplace_back(E.getKey(), E.getValue());
+    llvm::sort(V.begin(), V.end());
+    for (const auto &E : V)
+      llvm::dbgs() << " - " << E.first << " = " << str(E.second) << "\n";
+  }
+
+  const char* str(PathState S) {
+    switch (S) {
+      case Unknown: return "Unknown";
+      case MissingOrDir: return "MissingOrDir";
+      case MissingOrFile: return "MissingOrFile";
+      case DirUnknownChildren: return "DirUnknownChildren";
+      case DirUnenumerable: return "DirUnenumerable";
+      case DirKnownChildren: return "DirKnownChildren";
+      case File: return "File";
+      case Missing: return "Missing";
+    }
+  }
+#endif
+
+  // Returns false if we know NormPath points to a file that, based on cached
+  // information about the path and its parents.
+  // May retrieve extra information about parent directories to better answer.
+  // If the function returns true, NormPath may or may not be a file.
+  // If OrDir is true, both files and directories are acceptable "files".
+  // REQUIRES: CacheMu is held.
+  template <bool OrDir>
+  bool mayBeFile(StringRef NormPath) {
+    // First, just see if we can answer from the cache.
+    switch (Cache.lookup(NormPath)) {
+    case Unknown:
+    case MissingOrFile:
+      break;
+    case MissingOrDir:
+      if (!OrDir)
+        return false;
+      break;
+    case DirUnknownChildren:
+    case DirUnenumerable:
+    case DirKnownChildren:
+      return OrDir;
+    case File:
+      return true;
+    case Missing:
+      return false;
+    }
+
+    // Next, maybe we can get an answer based on the parent directory.
+    auto Parent = path::parent_path(NormPath);
+    if (Parent.empty())
+      return OrDir; // Root is a directory.
+    // We may need to populate its cache entry.
+    if (!populateCacheForDir(Parent))
+      return true; // No more information.
+
+    switch (Cache.lookup(Parent)) {
+    case Unknown:
+    case MissingOrDir:
+    case DirUnknownChildren:
+      llvm_unreachable("populateCacheForDir didn't provide enough info");
+    case MissingOrFile:
+    case File:
+    case Missing:
+      return false;
+    case DirUnenumerable:
+      return true;
+    case DirKnownChildren:
+      // The point: we listed the parent, all children are now in cache.
+      switch (Cache.lookup(NormPath)) {
+      case MissingOrDir:
+      case MissingOrFile:
+        llvm_unreachable("populateCacheForDir didn't provide child info");
+      case Unknown:
+      case Missing:
+        return false;
+      case DirUnknownChildren:
+      case DirUnenumerable:
+      case DirKnownChildren:
+        return OrDir;
+      case File:
+        return true;
+      };
+    }
+  }
+
+  // Record new information about a path's state.
+  // Combines appropriately with existing cached information (won't overwrite
+  // a more specific state with less specific state).
+  // REQUIRES: CacheMu is held.
+  void recordState(StringRef NormPath, PathState State) {
+    auto& Current = Cache[NormPath];
+    // Sherlock Holmes would be proud of this special case.
+    if ((Current == MissingOrDir && State == MissingOrFile) ||
+        (Current == MissingOrFile && State == MissingOrDir)) {
+      Current = Missing;
+      return;
+    }
+    Current = std::max(Current, State);
+    switch (State) {
+      case Unknown:
+      case MissingOrDir:
+      case MissingOrFile:
+      case Missing:
+        break;
+      case DirUnenumerable:
+      case DirUnknownChildren:
+      case DirKnownChildren:
+      case File:
+        auto Parent = path::parent_path(NormPath);
+        if (!Parent.empty())
+          recordState(Parent, DirUnknownChildren);
+        break;
+    }
+  }
+
+  // Attempts to populate the cache with information about a directory.
+  // Roughly: if the directory children are not known, readdir() to fill it.
+  //
+  // Details are messy:
+  //   - If we know the directory doesn't exist, don't read from it.
+  //   - We may need to populate parent caches in order to know that.
+  //   - We may decline to populate the cache because the directory wasn't
+  //     accessed enough yet.
+  //
+  // After returning, exactly one of the following is true:
+  //   - we returned false, and declined to populate the cache
+  //   - Cache[NormPath] indicates NormPath is not a directory
+  //   - Cache[NormPath] == DirKnownChildren, all children are in cache
+  //   - Cache[NormPath] == DirUnenumerable
+  //
+  // If AllowIO is false, we *only* attempt to propagate information already in
+  // cache. Only AllowIO=true calls count towards the ReadDirThreshold.
+  //
+  // REQUIRES: CacheMu is held.
+  bool populateCacheForDir(StringRef NormPath, bool AllowIO = true) {
+    // First, just see if we have any work to do.
+    bool ParentMightHelp = true;
+    switch (Cache.lookup(NormPath)) {
+    case Unknown:
+    case MissingOrDir:
+      break; // Need to populate cache with more info.
+    case DirUnknownChildren:
+      ParentMightHelp = false; // Listing parent won't detail our children.
+      break;
+    case MissingOrFile:
+    case DirUnenumerable:
+    case DirKnownChildren:
+    case File:
+    case Missing:
+      return true; // Cache already satisfies postcondition.
+    }
+    if (AllowIO && ++ReadDirAttempts[NormPath] < ReadDirThreshold)
+      AllowIO = false;
+    // Next, populate parent info and see if that determines our state.
+    auto Parent = path::parent_path(NormPath);
+    if (ParentMightHelp && !Parent.empty() &&
+        populateCacheForDir(Parent, AllowIO)) {
+      switch (Cache.lookup(Parent)) {
+      case Unknown:
+      case MissingOrDir:
+      case DirUnknownChildren:
+        llvm_unreachable("populateCacheForDir didn't provide enough info");
+      case MissingOrFile:
+      case File:
+      case Missing:
+        // Child can't exist if parent is not a directory.
+        recordState(NormPath, Missing);
+        return true;
+      case DirUnenumerable:
+        break; // No info about child, need to read it.
+      case DirKnownChildren:
+        // Parent is a directory, and we can tell whether child is.
+        switch (Cache.lookup(NormPath)) {
+        case MissingOrDir:
+        case MissingOrFile:
+          llvm_unreachable("populateCacheForDir didn't provide child info");
+        case Unknown:
+          recordState(NormPath, Missing);
+          return true;
+        case DirUnknownChildren:
+          break; // Need to list children.
+        case DirUnenumerable:
+        case DirKnownChildren:
+          // populateCacheForDir shouldn't do this, but we might be racing.
+          return true;
+        case File:
+        case Missing:
+          return true; // Cache now satisfies postcondition.
+        }
+        break;
+      }
+    }
+
+    // No other sources of info, we have to list the directory.
+    if (!AllowIO)
+      return false;
+    // Unlock around IO - no accessing the cache!
+    CacheMu.unlock();
+    LLVM_DEBUG(llvm::dbgs() << "synthetic readdir " << NormPath << "\n");
+    std::vector<std::pair<std::string, file_type>> DirListing;
+    std::error_code EC;
+    for (auto It = dir_begin(NormPath, EC);
+         !EC && It != directory_iterator();) {
+      DirListing.emplace_back(It->path(), It->type());
+      It.increment(EC);
+      if (DirListing.size() == 3000)
+        EC = std::make_error_code(std::errc::result_out_of_range);
+    }
+    CacheMu.lock();
+
+    // Record the results of the directory listing.
+    for (const auto& Entry : DirListing)
+      recordState(Entry.first, Entry.second == file_type::directory_file
+                                   ? DirUnknownChildren
+                                   : File);
+    recordState(NormPath,
+                EC ? DirListing.empty() ? MissingOrFile : DirUnenumerable
+                   : DirKnownChildren);
+    return true;
+  }
+
+  std::mutex CacheMu;
+  StringMap<PathState> Cache;
+  StringMap<unsigned> ReadDirAttempts;
+};
+} // namespace
+
+std::unique_ptr<FileSystem>
+avoidStats(llvm::IntrusiveRefCntPtr<FileSystem> FS) {
+  return llvm::make_unique<AvoidStatVFS>(std::move(FS));
+}
+
+} // namespace vfs
+} // namespace clang
Index: include/clang/Basic/VirtualFileSystem.h
===================================================================
--- include/clang/Basic/VirtualFileSystem.h
+++ include/clang/Basic/VirtualFileSystem.h
@@ -461,6 +461,20 @@
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
 };
 
+/// Wrap a filesystem to avoid calling status() and open() on missing files.
+///
+/// When files are accessed, the FS lists ancestor directories and caches their
+/// contents. This allows subsequent status() and open() calls for sibling
+/// nonexistent paths to fail without making any syscalls.
+///
+/// This is likely to improve performance when a large fraction of open()ed or
+/// status()ed files do not exist, and their paths are related.
+/// Notably: resolving #included files with a long include path.
+///
+/// This wrapper never invalidates cached information, so is only suitable when
+/// the underlying FS is assumed to be static.
+std::unique_ptr<FileSystem> avoidStats(llvm::IntrusiveRefCntPtr<FileSystem>);
+
 /// Get a globally unique ID for a virtual file or directory.
 llvm::sys::fs::UniqueID getNextVirtualUniqueID();
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to