https://github.com/jansvoboda11 updated 
https://github.com/llvm/llvm-project/pull/131354

>From 7e5de89fa20b9fe7c07ab0d05a5caaec3ac31255 Mon Sep 17 00:00:00 2001
From: Jan Svoboda <jan_svob...@apple.com>
Date: Mon, 17 Mar 2025 08:11:52 -0700
Subject: [PATCH 1/2] [clang][modules] Fix filesystem races in `ModuleManager`

The `ModuleManager` uses `FileEntry` objects to uniquely identify module files. 
This requires first consulting the `FileManager` (and therefore the file 
system) when loading PCM files. This is problematic, as this may load a 
different PCM file to what's already in the `InMemoryModuleCache` and fail the 
size and mtime checks.

This PR changes things so that module files are identified by their file system 
path. This removes the need of knowing the file entry at the start and allows 
us to fix the race. The downside is that we no longer get the `FileManager` 
inode-based uniquing, meaning symlinks in the module cache no longer work. I 
think this is fine, since Clang itself never creates symlinks in that 
directory. Moreover, we already had to work around filesystems recycling inode 
numbers, so this actually seems like a win to me (and resolves a long-standing 
FIXME-like comment).

Note that this change also requires that all places calling into 
`ModuleManager` from within a single `CompilerInstance` use consistent paths to 
refer to module files. This might be problematic if there are concurrent Clang 
processes operating on the same module cache directory but with different 
spellings - the `IMPORT` records in PCM files will be valid and consistent 
within single process, but may break uniquing in another process. We might need 
to do some canonicalization of the module cache paths to avoid this issue.
---
 .../clang/Serialization/ModuleManager.h       |  25 +--
 clang/lib/Serialization/ModuleManager.cpp     | 165 ++++++------------
 2 files changed, 60 insertions(+), 130 deletions(-)

diff --git a/clang/include/clang/Serialization/ModuleManager.h 
b/clang/include/clang/Serialization/ModuleManager.h
index 1eb74aee9787c..84b0276428dbf 100644
--- a/clang/include/clang/Serialization/ModuleManager.h
+++ b/clang/include/clang/Serialization/ModuleManager.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
@@ -58,7 +59,7 @@ class ModuleManager {
   SmallVector<ModuleFile *, 2> Roots;
 
   /// All loaded modules, indexed by name.
-  llvm::DenseMap<const FileEntry *, ModuleFile *> Modules;
+  llvm::StringMap<ModuleFile *> Modules;
 
   /// FileManager that handles translating between filenames and
   /// FileEntry *.
@@ -179,7 +180,7 @@ class ModuleManager {
   ModuleFile *lookupByModuleName(StringRef ModName) const;
 
   /// Returns the module associated with the given module file.
-  ModuleFile *lookup(const FileEntry *File) const;
+  ModuleFile *lookup(FileEntryRef File) const;
 
   /// Returns the in-memory (virtual file) buffer with the given name
   std::unique_ptr<llvm::MemoryBuffer> lookupBuffer(StringRef Name);
@@ -283,26 +284,6 @@ class ModuleManager {
   void visit(llvm::function_ref<bool(ModuleFile &M)> Visitor,
              llvm::SmallPtrSetImpl<ModuleFile *> *ModuleFilesHit = nullptr);
 
-  /// Attempt to resolve the given module file name to a file entry.
-  ///
-  /// \param FileName The name of the module file.
-  ///
-  /// \param ExpectedSize The size that the module file is expected to have.
-  /// If the actual size differs, the resolver should return \c true.
-  ///
-  /// \param ExpectedModTime The modification time that the module file is
-  /// expected to have. If the actual modification time differs, the resolver
-  /// should return \c true.
-  ///
-  /// \param File Will be set to the file if there is one, or null
-  /// otherwise.
-  ///
-  /// \returns True if a file exists but does not meet the size/
-  /// modification time criteria, false if the file is either available and
-  /// suitable, or is missing.
-  bool lookupModuleFile(StringRef FileName, off_t ExpectedSize,
-                        time_t ExpectedModTime, OptionalFileEntryRef &File);
-
   /// View the graphviz representation of the module graph.
   void viewGraph();
 
diff --git a/clang/lib/Serialization/ModuleManager.cpp 
b/clang/lib/Serialization/ModuleManager.cpp
index 61c4e9ed88e9d..4fb23f92463ac 100644
--- a/clang/lib/Serialization/ModuleManager.cpp
+++ b/clang/lib/Serialization/ModuleManager.cpp
@@ -59,8 +59,8 @@ ModuleFile *ModuleManager::lookupByModuleName(StringRef Name) 
const {
   return nullptr;
 }
 
-ModuleFile *ModuleManager::lookup(const FileEntry *File) const {
-  return Modules.lookup(File);
+ModuleFile *ModuleManager::lookup(FileEntryRef File) const {
+  return Modules.lookup(File.getName());
 }
 
 std::unique_ptr<llvm::MemoryBuffer>
@@ -107,109 +107,51 @@ ModuleManager::addModule(StringRef FileName, ModuleKind 
Type,
                          std::string &ErrorStr) {
   Module = nullptr;
 
-  // Look for the file entry. This only fails if the expected size or
-  // modification time differ.
-  OptionalFileEntryRef Entry;
-  if (Type == MK_ExplicitModule || Type == MK_PrebuiltModule) {
-    // If we're not expecting to pull this file out of the module cache, it
-    // might have a different mtime due to being moved across filesystems in
-    // a distributed build. The size must still match, though. (As must the
-    // contents, but we can't check that.)
-    ExpectedModTime = 0;
-  }
-  // Note: ExpectedSize and ExpectedModTime will be 0 for MK_ImplicitModule
-  // when using an ASTFileSignature.
-  if (lookupModuleFile(FileName, ExpectedSize, ExpectedModTime, Entry)) {
-    ErrorStr = "module file has a different size or mtime than expected";
-    return OutOfDate;
-  }
-
-  if (!Entry) {
-    ErrorStr = "module file not found";
-    return Missing;
-  }
-
-  // The ModuleManager's use of FileEntry nodes as the keys for its map of
-  // loaded modules is less than ideal. Uniqueness for FileEntry nodes is
-  // maintained by FileManager, which in turn uses inode numbers on hosts
-  // that support that. When coupled with the module cache's proclivity for
-  // turning over and deleting stale PCMs, this means entries for different
-  // module files can wind up reusing the same underlying inode. When this
-  // happens, subsequent accesses to the Modules map will disagree on the
-  // ModuleFile associated with a given file. In general, it is not sufficient
-  // to resolve this conundrum with a type like FileEntryRef that stores the
-  // name of the FileEntry node on first access because of path 
canonicalization
-  // issues. However, the paths constructed for implicit module builds are
-  // fully under Clang's control. We *can*, therefore, rely on their structure
-  // being consistent across operating systems and across subsequent accesses
-  // to the Modules map.
-  auto implicitModuleNamesMatch = [](ModuleKind Kind, const ModuleFile *MF,
-                                     FileEntryRef Entry) -> bool {
-    if (Kind != MK_ImplicitModule)
-      return true;
-    return Entry.getName() == MF->FileName;
-  };
-
   // Check whether we already loaded this module, before
-  if (ModuleFile *ModuleEntry = Modules.lookup(*Entry)) {
-    if (implicitModuleNamesMatch(Type, ModuleEntry, *Entry)) {
-      // Check the stored signature.
-      if (checkSignature(ModuleEntry->Signature, ExpectedSignature, ErrorStr))
-        return OutOfDate;
-
-      Module = ModuleEntry;
-      updateModuleImports(*ModuleEntry, ImportedBy, ImportLoc);
-      return AlreadyLoaded;
-    }
+  if (ModuleFile *ModuleEntry = Modules.lookup(FileName)) {
+    // Check the stored signature.
+    if (checkSignature(ModuleEntry->Signature, ExpectedSignature, ErrorStr))
+      return OutOfDate;
+
+    Module = ModuleEntry;
+    updateModuleImports(*ModuleEntry, ImportedBy, ImportLoc);
+    return AlreadyLoaded;
   }
 
-  // Allocate a new module.
-  auto NewModule = std::make_unique<ModuleFile>(Type, *Entry, Generation);
-  NewModule->Index = Chain.size();
-  NewModule->FileName = FileName.str();
-  NewModule->ImportLoc = ImportLoc;
-  NewModule->InputFilesValidationTimestamp = 0;
-
-  if (NewModule->Kind == MK_ImplicitModule) {
-    std::string TimestampFilename =
-        ModuleFile::getTimestampFilename(NewModule->FileName);
-    llvm::vfs::Status Status;
-    // A cached stat value would be fine as well.
-    if (!FileMgr.getNoncachedStatValue(TimestampFilename, Status))
-      NewModule->InputFilesValidationTimestamp =
-          llvm::sys::toTimeT(Status.getLastModificationTime());
-  }
+  OptionalFileEntryRef Entry;
+  llvm::MemoryBuffer *ModuleBuffer = nullptr;
 
   // Load the contents of the module
   if (std::unique_ptr<llvm::MemoryBuffer> Buffer = lookupBuffer(FileName)) {
     // The buffer was already provided for us.
-    NewModule->Buffer = &getModuleCache().getInMemoryModuleCache().addBuiltPCM(
+    ModuleBuffer = &getModuleCache().getInMemoryModuleCache().addBuiltPCM(
         FileName, std::move(Buffer));
-    // Since the cached buffer is reused, it is safe to close the file
-    // descriptor that was opened while stat()ing the PCM in
-    // lookupModuleFile() above, it won't be needed any longer.
-    Entry->closeFile();
   } else if (llvm::MemoryBuffer *Buffer =
                  getModuleCache().getInMemoryModuleCache().lookupPCM(
                      FileName)) {
-    NewModule->Buffer = Buffer;
-    // As above, the file descriptor is no longer needed.
-    Entry->closeFile();
+    ModuleBuffer = Buffer;
   } else if (getModuleCache().getInMemoryModuleCache().shouldBuildPCM(
                  FileName)) {
     // Report that the module is out of date, since we tried (and failed) to
     // import it earlier.
-    Entry->closeFile();
     return OutOfDate;
   } else {
+    Entry = FileName == "-"
+                ? expectedToOptional(FileMgr.getSTDIN())
+                : FileMgr.getOptionalFileRef(FileName, /*OpenFile=*/true,
+                                             /*CacheFailure=*/false);
+    if (!Entry) {
+      ErrorStr = "module file not found";
+      return Missing;
+    }
+
     // Get a buffer of the file and close the file descriptor when done.
     // The file is volatile because in a parallel build we expect multiple
     // compiler processes to use the same module file rebuilding it if needed.
     //
     // RequiresNullTerminator is false because module files don't need it, and
     // this allows the file to still be mmapped.
-    auto Buf = FileMgr.getBufferForFile(NewModule->File,
-                                        /*IsVolatile=*/true,
+    auto Buf = FileMgr.getBufferForFile(*Entry, /*IsVolatile=*/true,
                                         /*RequiresNullTerminator=*/false);
 
     if (!Buf) {
@@ -217,10 +159,40 @@ ModuleManager::addModule(StringRef FileName, ModuleKind 
Type,
       return Missing;
     }
 
-    NewModule->Buffer = &getModuleCache().getInMemoryModuleCache().addPCM(
+    if ((ExpectedSize && ExpectedSize != Entry->getSize()) ||
+        (ExpectedModTime && ExpectedModTime != Entry->getModificationTime())) {
+      ErrorStr = "module file has a different size or mtime than expected";
+      return OutOfDate;
+    }
+
+    ModuleBuffer = &getModuleCache().getInMemoryModuleCache().addPCM(
         FileName, std::move(*Buf));
   }
 
+  // TODO: Make it so that ModuleFile is not tied to a FileEntry.
+  if (!Entry && !((Entry = FileMgr.getVirtualFileRef(FileName, ExpectedSize,
+                                                     ExpectedModTime))))
+    return Missing;
+
+  // Allocate a new module.
+  auto NewModule = std::make_unique<ModuleFile>(Type, *Entry, Generation);
+  NewModule->Index = Chain.size();
+  NewModule->FileName = FileName.str();
+  NewModule->ImportLoc = ImportLoc;
+  NewModule->InputFilesValidationTimestamp = 0;
+
+  if (NewModule->Kind == MK_ImplicitModule) {
+    std::string TimestampFilename =
+        ModuleFile::getTimestampFilename(NewModule->FileName);
+    llvm::vfs::Status Status;
+    // A cached stat value would be fine as well.
+    if (!FileMgr.getNoncachedStatValue(TimestampFilename, Status))
+      NewModule->InputFilesValidationTimestamp =
+          llvm::sys::toTimeT(Status.getLastModificationTime());
+  }
+
+  NewModule->Buffer = ModuleBuffer;
+
   // Initialize the stream.
   NewModule->Data = PCHContainerRdr.ExtractPCH(*NewModule->Buffer);
 
@@ -231,7 +203,7 @@ ModuleManager::addModule(StringRef FileName, ModuleKind 
Type,
     return OutOfDate;
 
   // We're keeping this module.  Store it everywhere.
-  Module = Modules[*Entry] = NewModule.get();
+  Module = Modules[FileName] = NewModule.get();
 
   updateModuleImports(*NewModule, ImportedBy, ImportLoc);
 
@@ -277,7 +249,7 @@ void ModuleManager::removeModules(ModuleIterator First) {
 
   // Delete the modules.
   for (ModuleIterator victim = First; victim != Last; ++victim)
-    Modules.erase(victim->File);
+    Modules.erase(victim->File.getName());
 
   Chain.erase(Chain.begin() + (First - begin()), Chain.end());
 }
@@ -436,29 +408,6 @@ void 
ModuleManager::visit(llvm::function_ref<bool(ModuleFile &M)> Visitor,
   returnVisitState(std::move(State));
 }
 
-bool ModuleManager::lookupModuleFile(StringRef FileName, off_t ExpectedSize,
-                                     time_t ExpectedModTime,
-                                     OptionalFileEntryRef &File) {
-  if (FileName == "-") {
-    File = expectedToOptional(FileMgr.getSTDIN());
-    return false;
-  }
-
-  // Open the file immediately to ensure there is no race between stat'ing and
-  // opening the file.
-  File = FileMgr.getOptionalFileRef(FileName, /*OpenFile=*/true,
-                                    /*CacheFailure=*/false);
-
-  if (File &&
-      ((ExpectedSize && ExpectedSize != File->getSize()) ||
-       (ExpectedModTime && ExpectedModTime != File->getModificationTime())))
-    // Do not destroy File, as it may be referenced. If we need to rebuild it,
-    // it will be destroyed by removeModules.
-    return true;
-
-  return false;
-}
-
 #ifndef NDEBUG
 namespace llvm {
 

>From aec5c1963603e21ef764453f96680035569c08cb Mon Sep 17 00:00:00 2001
From: Jan Svoboda <m...@jansvoboda.dev>
Date: Mon, 17 Mar 2025 16:45:40 -0700
Subject: [PATCH 2/2] Normalize paths to fix Windows bot

---
 clang/lib/Serialization/ASTReader.cpp         | 13 +++++---
 .../lib/Serialization/InMemoryModuleCache.cpp | 32 +++++++++++++++----
 clang/lib/Serialization/ModuleManager.cpp     | 10 ++++--
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Serialization/ASTReader.cpp 
b/clang/lib/Serialization/ASTReader.cpp
index 2c73501821bff..9f5544fbb6321 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -4350,10 +4350,15 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) 
const {
     uint16_t Len = endian::readNext<uint16_t, llvm::endianness::little>(Data);
     StringRef Name = StringRef((const char*)Data, Len);
     Data += Len;
-    ModuleFile *OM = (Kind == MK_PrebuiltModule || Kind == MK_ExplicitModule ||
-                              Kind == MK_ImplicitModule
-                          ? ModuleMgr.lookupByModuleName(Name)
-                          : ModuleMgr.lookupByFileName(Name));
+    ModuleFile *OM;
+    if (Kind == MK_PrebuiltModule || Kind == MK_ExplicitModule ||
+        Kind == MK_ImplicitModule) {
+      OM = ModuleMgr.lookupByModuleName(Name);
+    } else {
+      SmallString<128> NormalizedName = Name;
+      llvm::sys::path::make_preferred(NormalizedName);
+      OM = ModuleMgr.lookupByFileName(NormalizedName);
+    }
     if (!OM) {
       std::string Msg = "refers to unknown module, cannot find ";
       Msg.append(std::string(Name));
diff --git a/clang/lib/Serialization/InMemoryModuleCache.cpp 
b/clang/lib/Serialization/InMemoryModuleCache.cpp
index d35fa2a807f4d..a36a2b69e27f4 100644
--- a/clang/lib/Serialization/InMemoryModuleCache.cpp
+++ b/clang/lib/Serialization/InMemoryModuleCache.cpp
@@ -7,13 +7,18 @@
 
//===----------------------------------------------------------------------===//
 
 #include "clang/Serialization/InMemoryModuleCache.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 
 using namespace clang;
 
 InMemoryModuleCache::State
 InMemoryModuleCache::getPCMState(llvm::StringRef Filename) const {
-  auto I = PCMs.find(Filename);
+  llvm::SmallString<128> NormalizedFilename = Filename;
+  llvm::sys::path::make_preferred(NormalizedFilename);
+
+  auto I = PCMs.find(NormalizedFilename);
   if (I == PCMs.end())
     return Unknown;
   if (I->second.IsFinal)
@@ -24,7 +29,10 @@ InMemoryModuleCache::getPCMState(llvm::StringRef Filename) 
const {
 llvm::MemoryBuffer &
 InMemoryModuleCache::addPCM(llvm::StringRef Filename,
                             std::unique_ptr<llvm::MemoryBuffer> Buffer) {
-  auto Insertion = PCMs.insert(std::make_pair(Filename, std::move(Buffer)));
+  llvm::SmallString<128> NormalizedFilename = Filename;
+  llvm::sys::path::make_preferred(NormalizedFilename);
+
+  auto Insertion = PCMs.insert({NormalizedFilename, std::move(Buffer)});
   assert(Insertion.second && "Already has a PCM");
   return *Insertion.first->second.Buffer;
 }
@@ -32,7 +40,10 @@ InMemoryModuleCache::addPCM(llvm::StringRef Filename,
 llvm::MemoryBuffer &
 InMemoryModuleCache::addBuiltPCM(llvm::StringRef Filename,
                                  std::unique_ptr<llvm::MemoryBuffer> Buffer) {
-  auto &PCM = PCMs[Filename];
+  llvm::SmallString<128> NormalizedFilename = Filename;
+  llvm::sys::path::make_preferred(NormalizedFilename);
+
+  auto &PCM = PCMs[NormalizedFilename];
   assert(!PCM.IsFinal && "Trying to override finalized PCM?");
   assert(!PCM.Buffer && "Trying to override tentative PCM?");
   PCM.Buffer = std::move(Buffer);
@@ -42,7 +53,10 @@ InMemoryModuleCache::addBuiltPCM(llvm::StringRef Filename,
 
 llvm::MemoryBuffer *
 InMemoryModuleCache::lookupPCM(llvm::StringRef Filename) const {
-  auto I = PCMs.find(Filename);
+  llvm::SmallString<128> NormalizedFilename = Filename;
+  llvm::sys::path::make_preferred(NormalizedFilename);
+
+  auto I = PCMs.find(NormalizedFilename);
   if (I == PCMs.end())
     return nullptr;
   return I->second.Buffer.get();
@@ -57,7 +71,10 @@ bool InMemoryModuleCache::shouldBuildPCM(llvm::StringRef 
Filename) const {
 }
 
 bool InMemoryModuleCache::tryToDropPCM(llvm::StringRef Filename) {
-  auto I = PCMs.find(Filename);
+  llvm::SmallString<128> NormalizedFilename = Filename;
+  llvm::sys::path::make_preferred(NormalizedFilename);
+
+  auto I = PCMs.find(NormalizedFilename);
   assert(I != PCMs.end() && "PCM to remove is unknown...");
 
   auto &PCM = I->second;
@@ -71,7 +88,10 @@ bool InMemoryModuleCache::tryToDropPCM(llvm::StringRef 
Filename) {
 }
 
 void InMemoryModuleCache::finalizePCM(llvm::StringRef Filename) {
-  auto I = PCMs.find(Filename);
+  llvm::SmallString<128> NormalizedFilename = Filename;
+  llvm::sys::path::make_preferred(NormalizedFilename);
+
+  auto I = PCMs.find(NormalizedFilename);
   assert(I != PCMs.end() && "PCM to finalize is unknown...");
 
   auto &PCM = I->second;
diff --git a/clang/lib/Serialization/ModuleManager.cpp 
b/clang/lib/Serialization/ModuleManager.cpp
index 4fb23f92463ac..14e43452c97bd 100644
--- a/clang/lib/Serialization/ModuleManager.cpp
+++ b/clang/lib/Serialization/ModuleManager.cpp
@@ -60,7 +60,9 @@ ModuleFile *ModuleManager::lookupByModuleName(StringRef Name) 
const {
 }
 
 ModuleFile *ModuleManager::lookup(FileEntryRef File) const {
-  return Modules.lookup(File.getName());
+  llvm::SmallString<128> NormalizedFileName = File.getName();
+  llvm::sys::path::make_preferred(NormalizedFileName);
+  return Modules.lookup(NormalizedFileName);
 }
 
 std::unique_ptr<llvm::MemoryBuffer>
@@ -108,7 +110,9 @@ ModuleManager::addModule(StringRef FileName, ModuleKind 
Type,
   Module = nullptr;
 
   // Check whether we already loaded this module, before
-  if (ModuleFile *ModuleEntry = Modules.lookup(FileName)) {
+  llvm::SmallString<128> NormalizedFileName = FileName;
+  llvm::sys::path::make_preferred(NormalizedFileName);
+  if (ModuleFile *ModuleEntry = Modules.lookup(NormalizedFileName)) {
     // Check the stored signature.
     if (checkSignature(ModuleEntry->Signature, ExpectedSignature, ErrorStr))
       return OutOfDate;
@@ -203,7 +207,7 @@ ModuleManager::addModule(StringRef FileName, ModuleKind 
Type,
     return OutOfDate;
 
   // We're keeping this module.  Store it everywhere.
-  Module = Modules[FileName] = NewModule.get();
+  Module = Modules[NormalizedFileName] = NewModule.get();
 
   updateModuleImports(*NewModule, ImportedBy, ImportLoc);
 

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to