On Thu, Oct 26, 2023 at 01:23:35PM +0200, Pavel Sanda wrote: > Possible collision are easily handled by recursive > hash(hash(..(hash(string))), > see attached patch.
Polished patch attached. I will do some more testing before committing to the tree. Pavel
diff --git a/src/insets/InsetGraphics.cpp b/src/insets/InsetGraphics.cpp index dac61ee449..62577f07db 100644 --- a/src/insets/InsetGraphics.cpp +++ b/src/insets/InsetGraphics.cpp @@ -575,14 +575,14 @@ copyFileIfNeeded(FileName const & file_in, FileName const & file_out) pair<GraphicsCopyStatus, FileName> const -copyToDirIfNeeded(DocFileName const & file, string const & dir) +copyToDirIfNeeded(DocFileName const & file, string const & dir, bool encrypt_path) { string const file_in = file.absFileName(); string const only_path = onlyPath(file_in); if (rtrim(only_path, "/") == rtrim(dir, "/")) return make_pair(IDENTICAL_PATHS, FileName(file_in)); - string mangled = file.mangledFileName(empty_string(), false, true); + string mangled = file.mangledFileName(empty_string(), encrypt_path); if (theFormats().isZippedFile(file)) { // We need to change _eps.gz to .eps.gz. The mangled name is // still unique because of the counter in mangledFileName(). @@ -680,7 +680,7 @@ string InsetGraphics::prepareFile(OutputParams const & runparams) const // we move it to a temp dir or uncompress it. FileName temp_file; GraphicsCopyStatus status; - tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path); + tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path, false); if (status == FAILURE) return orig_file; @@ -997,7 +997,7 @@ string InsetGraphics::prepareHTMLFile(OutputParams const & runparams) const // Copy to temporary directory. FileName temp_file; GraphicsCopyStatus status; - tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path); + tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path, true); if (status == FAILURE) return string(); diff --git a/src/support/FileName.cpp b/src/support/FileName.cpp index 32a5862fdb..7fbd2f4b4c 100644 --- a/src/support/FileName.cpp +++ b/src/support/FileName.cpp @@ -957,69 +957,102 @@ string DocFileName::outputFileName(string const & path) const return save_abs_path_ ? absFileName() : relFileName(path); } - -string DocFileName::mangledFileName(string const & dir) const +// check for an existing value in a map +bool collision_exists(map<string, string> const & db, string const & value) { - return mangledFileName(dir, true, false); + for (auto it = db.begin(); it != db.end(); ++it) + if (it->second == value) + return true; + return false; + +/* replace the above, once we support C++17 + for (const auto& [key, val] : db) + if (val == value) + return true; + return false; */ } -string DocFileName::mangledFileName(string const & dir, bool use_counter, bool encrypt_path) const +string DocFileName::mangledFileName(string const & dir, bool encrypt_path) const { // Concurrent access to these variables is possible. // We need to make sure that every DocFileName instance for a given // filename returns the same mangled name. typedef map<string, string> MangledMap; - static MangledMap mangledNames; + static MangledMap mangledPlainNames; + static MangledMap mangledEncryptNames; //separate table for xhtml() route static Mutex mangledMutex; // this locks both access to mangledNames and counter below Mutex::Locker lock(&mangledMutex); - MangledMap::const_iterator const it = mangledNames.find(absFileName()); - if (it != mangledNames.end()) + MangledMap * mangledNames = encrypt_path ? &mangledEncryptNames : &mangledPlainNames; + MangledMap::const_iterator const it = mangledNames->find(absFileName()); + if (it != mangledNames->end()) return (*it).second; string const name = absFileName(); - // Now the real work. Remove the extension. - string mname = support::changeExtension(name, string()); + string mname; - if (encrypt_path) - mname = "export_" + onlyFileName() + "_" + toHexHash(mname); + // xHTML route + // we use hash instead of counter to get stable filenames in export directory + if (encrypt_path) { + // sanitization probably not neccessary for xhtml, but won't harm + string sanfn = support::changeExtension(onlyFileName(), string()); + sanfn = sanitizeFileName(sanfn); + // Add the extension back on + sanfn = support::changeExtension(sanfn, getExtension(onlyFileName())); - // The mangled name must be a valid LaTeX name. - mname = sanitizeFileName(mname); - // Add the extension back on - mname = support::changeExtension(mname, getExtension(name)); + //various filesystems have filename limit around 2^8 + if (sanfn.length() > 230) + sanfn = ""; - // Prepend a counter to the filename. This is necessary to make - // the mangled name unique. - static int counter = 0; + string enc_name = "e_" + toHexHash(name, true) + "_" + sanfn; + + while (collision_exists(mangledEncryptNames, enc_name)) + enc_name = "e_" + toHexHash(enc_name, true) + "_" + sanfn; + + mname = enc_name; + } + + // LaTeX route + if (!encrypt_path) { + // Now the real work. Remove the extension. + mname = support::changeExtension(name, string()); + // The mangled name must be a valid LaTeX name. + mname = sanitizeFileName(mname); + // Add the extension back on + mname = support::changeExtension(mname, getExtension(name)); + + // Prepend a counter to the filename. This is necessary to make + // the mangled name unique, see truncation below. + // + static int counter = 0; - if (use_counter) { ostringstream s; s << counter++ << mname; mname = s.str(); - } - // MiKTeX's YAP (version 2.4.1803) crashes if the file name - // is longer than about 160 characters. MiKTeX's pdflatex - // is even pickier. A maximum length of 100 has been proven to work. - // If dir.size() > max length, all bets are off for YAP. We truncate - // the filename nevertheless, keeping a minimum of 10 chars. - string::size_type max_length = max(100 - ((int)dir.size() + 1), 10); + // MiKTeX's YAP (version 2.4.1803) crashes if the file name + // is longer than about 160 characters. MiKTeX's pdflatex + // is even pickier. A maximum length of 100 has been proven to work. + // If dir.size() > max length, all bets are off for YAP. We truncate + // the filename nevertheless, keeping a minimum of 10 chars. - // If the mangled file name is too long, hack it to fit. - // We know we're guaranteed to have a unique file name because - // of the counter. - if (mname.size() > max_length) { - int const half = (int(max_length) / 2) - 2; - if (half > 0) { - mname = mname.substr(0, half) + "___" + - mname.substr(mname.size() - half); + string::size_type max_length = max(100 - ((int)dir.size() + 1), 10); + + // If the mangled file name is too long, hack it to fit. + // We know we're guaranteed to have a unique file name because + // of the counter. + if (mname.size() > max_length) { + int const half = (int(max_length) / 2) - 2; + if (half > 0) { + mname = mname.substr(0, half) + "___" + + mname.substr(mname.size() - half); + } } } - mangledNames[absFileName()] = mname; + (*mangledNames)[absFileName()] = mname; return mname; } diff --git a/src/support/FileName.h b/src/support/FileName.h index 95940a9235..35f182e428 100644 --- a/src/support/FileName.h +++ b/src/support/FileName.h @@ -288,19 +288,30 @@ public: * - two FileName instances with the same filename have identical * mangled names. * + * @param encrypt_path will use short hash instead of path + * in the mangled name if set to true. Useful for xHTML export + * so we do not leak e.g. user names contained in the paths. + * Filename itself is stripped if the resulting filename would + * become too long (~250 chars). + * Prefix counter is not used because + * 1) it's hack useful for MikTeX/YAP only + * 2) causes many duplicates within the export directory across different + * LyX sessions as (unlike in LaTeX export) we use mangled names in + * final xHTML export directory. + * An example of hashed mangled case: + * C:/foo bar/baz.png - > e_95a42ec852ea_baz.png + * + * It is guaranteed that + * - two different filenames have different mangled names (modulo hash collision) + * - two FileName instances with the same filename have identical hashed + * mangled names. + * + * * Only the mangled file name is returned. It is not prepended * with @c dir. */ std::string - mangledFileName(std::string const & dir = empty_string()) const; - - /** Identical to mangledFileName, with the following additions: - * - * @encrypt_path allows using hash (SHA-256) instead of full path. - * @use_counter allows disabling the counter in the filename. - */ - std::string - mangledFileName(std::string const & dir, bool use_counter, bool encrypt_path) const; + mangledFileName(std::string const & dir = empty_string(), bool encrypt_path=false) const; /// \return the absolute file name without its .gz, .z, .Z extension std::string unzippedFileName() const; diff --git a/src/support/filetools.cpp b/src/support/filetools.cpp index e8f3aa19ce..a2eba83eef 100644 --- a/src/support/filetools.cpp +++ b/src/support/filetools.cpp @@ -1334,13 +1334,22 @@ void fileUnlock(int fd, const char * /* lock_file*/) } -std::string toHexHash(const std::string & str) +std::string toHexHash(const std::string & str, bool shorten) { - // Use the best available hashing algorithm. auto hashAlgo = QCryptographicHash::Sha256; QByteArray hash = QCryptographicHash::hash(toqstr(str).toLocal8Bit(), hashAlgo); - return fromqstr(QString(hash.toHex())); + QString qshash=QString(hash.toHex()); + + /* For shortened case we take 12 leftmost chars (6 bytes encoded). + * Random experiment shows: + * 8 chars: 16 collisions for 10^5 graphic filenames + * 12 chars: 0 collisions for 10^5 graphic filenames + */ + if (shorten) + qshash=qshash.left(12); + + return fromqstr(qshash); } diff --git a/src/support/filetools.h b/src/support/filetools.h index 205e7ff589..ceb1da62be 100644 --- a/src/support/filetools.h +++ b/src/support/filetools.h @@ -348,14 +348,16 @@ int fileLock(const char * lock_file); void fileUnlock(int fd, const char * lock_file); /** Return the hex-encoded cryptographic hash of a string. - * The hash algorithm is not fixed, but it is determined at compile time. * This function is typically used to create relatively stable file names, * because cryptographic hash functions ensure that very small changes in the * input result in large changes in the output. + * Due to inherent limits of path length in Windows we allow very short version + * (effectively leftmost 6 bytes encoded via 12 chars in hex) while increasing the + * probability of collision (via shorten=true). * There is no limit in the length of the input string: it can be a file name * or the contents of a file, for instance. */ -std::string toHexHash(const std::string & str); +std::string toHexHash(const std::string & str, bool shorten=false); /// Replace non-ASCII characters to ensure that the string can be used as a /// file name on all platforms and as a LaTeX name.
-- lyx-devel mailing list lyx-devel@lists.lyx.org http://lists.lyx.org/mailman/listinfo/lyx-devel