Author: Abhina Sree Date: 2024-12-11T07:46:51-05:00 New Revision: 04379c98638ac3901257b5fa319f9ece828af767
URL: https://github.com/llvm/llvm-project/commit/04379c98638ac3901257b5fa319f9ece828af767 DIFF: https://github.com/llvm/llvm-project/commit/04379c98638ac3901257b5fa319f9ece828af767.diff LOG: [SystemZ][z/OS] Update autoconversion functions to improve support for UTF-8 (#98652) This fixes the following error when reading source and header files on z/OS: error: source file is not valid UTF-8 Added: Modified: clang/include/clang/Basic/FileEntry.h clang/lib/Basic/SourceManager.cpp llvm/include/llvm/Support/AutoConvert.h llvm/lib/Support/AutoConvert.cpp llvm/lib/Support/MemoryBuffer.cpp Removed: ################################################################################ diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h index 68d4bf60930037..ba6dfa69f214d8 100644 --- a/clang/include/clang/Basic/FileEntry.h +++ b/clang/include/clang/Basic/FileEntry.h @@ -70,6 +70,11 @@ class FileEntryRef { const FileEntry &getFileEntry() const { return *getBaseMapEntry().second->V.get<FileEntry *>(); } + + // This function is used if the buffer size needs to be increased + // due to potential z/OS EBCDIC -> UTF-8 conversion + inline void updateFileEntryBufferSize(unsigned BufferSize); + DirectoryEntryRef getDir() const { return ME->second->Dir; } inline off_t getSize() const; @@ -323,6 +328,8 @@ class FileEntry { StringRef tryGetRealPathName() const { return RealPathName; } off_t getSize() const { return Size; } + // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion. 
+ void setSize(off_t NewSize) { Size = NewSize; } unsigned getUID() const { return UID; } const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; } time_t getModificationTime() const { return ModTime; } @@ -353,6 +360,10 @@ bool FileEntryRef::isNamedPipe() const { return getFileEntry().isNamedPipe(); } void FileEntryRef::closeFile() const { getFileEntry().closeFile(); } +void FileEntryRef::updateFileEntryBufferSize(unsigned BufferSize) { + getBaseMapEntry().second->V.get<FileEntry *>()->setSize(BufferSize); +} + } // end namespace clang #endif // LLVM_CLANG_BASIC_FILEENTRY_H diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 6e588ce63d813f..849c18f171f6e1 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/AutoConvert.h" #include "llvm/Support/Capacity.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" @@ -156,8 +157,11 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // Unless this is a named pipe (in which case we can handle a mismatch), // check that the file's size is the same as in the file entry (which may // have come from a stat cache). + // The buffer will always be larger than the file size on z/OS in the presence + // of characters outside the base character set. 
+ assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); if (!ContentsEntry->isNamedPipe() && - Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) { + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); return std::nullopt; @@ -583,6 +587,18 @@ SourceManager::getOrCreateFileID(FileEntryRef SourceFile, FileCharacter); } +/// Helper function to determine if an input file requires conversion +bool needConversion(StringRef Filename) { +#ifdef __MVS__ + llvm::ErrorOr<bool> NeedConversion = + llvm::needzOSConversion(Filename.str().c_str()); + assert(NeedConversion && "Filename was not found"); + return *NeedConversion; +#else + return false; +#endif +} + /// createFileID - Create a new FileID for the specified ContentCache and /// include position. This works regardless of whether the ContentCache /// corresponds to a file or some other input source. @@ -602,6 +618,20 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename, return FileID::get(LoadedID); } unsigned FileSize = File.getSize(); + bool NeedConversion = needConversion(Filename); + if (NeedConversion) { + // Buffer size may increase due to potential z/OS EBCDIC to UTF-8 + // conversion. 
+ if (std::optional<llvm::MemoryBufferRef> Buffer = + File.getBufferOrNone(Diag, getFileManager())) { + unsigned BufSize = Buffer->getBufferSize(); + if (BufSize > FileSize) { + if (File.ContentsEntry.has_value()) + File.ContentsEntry->updateFileEntryBufferSize(BufSize); + FileSize = BufSize; + } + } + } if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset && NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) { Diag.Report(IncludePos, diag::err_sloc_space_too_large); diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 65ac576ae5676b..5d6d9394ef1d81 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -17,6 +17,7 @@ #ifdef __MVS__ #include <_Ccsid.h> #ifdef __cplusplus +#include "llvm/Support/ErrorOr.h" #include <system_error> #endif /* __cplusplus */ @@ -54,8 +55,14 @@ std::error_code restorezOSStdHandleAutoConversion(int FD); /** \brief Set the tag information for a file descriptor. */ std::error_code setzOSFileTag(int FD, int CCSID, bool Text); -} /* namespace llvm */ -#endif /* __cplusplus */ +// Get the tag ccsid for a file name or a file descriptor. +ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1); + +// Query the file tag to determine if it needs conversion to UTF-8 codepage. 
+ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1); + +} // namespace llvm +#endif // __cplusplus #endif /* __MVS__ */ diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index 66570735f8fc88..f7918548df1d0d 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -20,6 +20,8 @@ #include <sys/stat.h> #include <unistd.h> +using namespace llvm; + static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1}; int disablezOSAutoConversion(int FD) { @@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { return std::error_code(); } -#endif // __MVS__ +ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) { + // If we have a file descriptor, use it to find out file tagging. Otherwise we + // need to use stat() with the file path. + if (FD != -1) { + struct f_cnvrt Query = { + QUERYCVT, // cvtcmd + 0, // pccsid + 0, // fccsid + }; + if (fcntl(FD, F_CONTROL_CVT, &Query) == -1) + return std::error_code(errno, std::generic_category()); + return Query.fccsid; + } + struct stat Attr; + if (stat(FileName, &Attr) == -1) + return std::error_code(errno, std::generic_category()); + return Attr.st_tag.ft_ccsid; +} + +ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) { + ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD); + if (std::error_code EC = Ccsid.getError()) + return EC; + // We don't need conversion for UTF-8 tagged files or binary files. + // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve + // problems related to UTF-8 tagged source files. 
+ switch (*Ccsid) { + case CCSID_UTF_8: + case CCSID_ISO8859_1: + case FT_BINARY: + return false; + default: + return true; + } +} + +#endif //__MVS__ diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 7ea68ee4cafd76..e2044bcc4e4f08 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -361,6 +361,11 @@ static bool shouldUseMmap(sys::fs::file_t FD, bool RequiresNullTerminator, int PageSize, bool IsVolatile) { +#if defined(__MVS__) + // zOS Enhanced ASCII auto convert does not support mmap. + return false; +#endif + // mmap may leave the buffer without null terminator if the file size changed // by the time the last page is mapped in, so avoid it if the file size is // likely to change. @@ -503,9 +508,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - // Set codepage auto-conversion for z/OS. - if (auto EC = llvm::enablezOSAutoConversion(FD)) + ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD); + if (std::error_code EC = NeedConversion.getError()) return EC; + // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we + // cannot trust the file size and we create the memory buffer by copying + // off the stream. + // Note: This only works with the assumption of reading a full file (i.e, + // Offset == 0 and MapSize == FileSize). Reading a file slice does not work. + if (Offset == 0 && MapSize == FileSize && *NeedConversion) + return getMemoryBufferForStream(FD, Filename); #endif auto Buf = _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits