Re: Mangled file names in temp dir

Pavel Sanda Wed, 08 Nov 2023 01:45:18 -0800

On Thu, Oct 26, 2023 at 01:23:35PM +0200, Pavel Sanda wrote:
> Possible collision are easily handled by recursive 
> hash(hash(..(hash(string))),
> see attached patch.


Polished patch attached. I will do some more testing before committing to the 
tree.

Pavel

diff --git a/src/insets/InsetGraphics.cpp b/src/insets/InsetGraphics.cpp
index dac61ee449..62577f07db 100644
--- a/src/insets/InsetGraphics.cpp
+++ b/src/insets/InsetGraphics.cpp
@@ -575,14 +575,14 @@ copyFileIfNeeded(FileName const & file_in, FileName const 
& file_out)
 
 
 pair<GraphicsCopyStatus, FileName> const
-copyToDirIfNeeded(DocFileName const & file, string const & dir)
+copyToDirIfNeeded(DocFileName const & file, string const & dir, bool 
encrypt_path)
 {
        string const file_in = file.absFileName();
        string const only_path = onlyPath(file_in);
        if (rtrim(only_path, "/") == rtrim(dir, "/"))
                return make_pair(IDENTICAL_PATHS, FileName(file_in));
 
-       string mangled = file.mangledFileName(empty_string(), false, true);
+       string mangled = file.mangledFileName(empty_string(), encrypt_path);
        if (theFormats().isZippedFile(file)) {
                // We need to change _eps.gz to .eps.gz. The mangled name is
                // still unique because of the counter in mangledFileName().
@@ -680,7 +680,7 @@ string InsetGraphics::prepareFile(OutputParams const & 
runparams) const
        // we move it to a temp dir or uncompress it.
        FileName temp_file;
        GraphicsCopyStatus status;
-       tie(status, temp_file) = copyToDirIfNeeded(params().filename, 
temp_path);
+       tie(status, temp_file) = copyToDirIfNeeded(params().filename, 
temp_path, false);
 
        if (status == FAILURE)
                return orig_file;
@@ -997,7 +997,7 @@ string InsetGraphics::prepareHTMLFile(OutputParams const & 
runparams) const
        // Copy to temporary directory.
        FileName temp_file;
        GraphicsCopyStatus status;
-       tie(status, temp_file) = copyToDirIfNeeded(params().filename, 
temp_path);
+       tie(status, temp_file) = copyToDirIfNeeded(params().filename, 
temp_path, true);
 
        if (status == FAILURE)
                return string();
diff --git a/src/support/FileName.cpp b/src/support/FileName.cpp
index 32a5862fdb..7fbd2f4b4c 100644
--- a/src/support/FileName.cpp
+++ b/src/support/FileName.cpp
@@ -957,69 +957,102 @@ string DocFileName::outputFileName(string const & path) 
const
        return save_abs_path_ ? absFileName() : relFileName(path);
 }
 
-
-string DocFileName::mangledFileName(string const & dir) const
+// check for an existing value in a map
+bool collision_exists(map<string, string> const & db, string const & value)
 {
-       return mangledFileName(dir, true, false);
+       for (auto it = db.begin(); it != db.end(); ++it)
+               if (it->second == value)
+                       return true;
+       return false;
+
+/* replace the above, once we support C++17
+        for (const auto& [key, val] : db)
+                if (val == value)
+                        return true;
+        return false; */
 }
 
-string DocFileName::mangledFileName(string const & dir, bool use_counter, bool 
encrypt_path) const
+string DocFileName::mangledFileName(string const & dir, bool encrypt_path) 
const
 {
        // Concurrent access to these variables is possible.
 
        // We need to make sure that every DocFileName instance for a given
        // filename returns the same mangled name.
        typedef map<string, string> MangledMap;
-       static MangledMap mangledNames;
+       static MangledMap mangledPlainNames;
+       static MangledMap mangledEncryptNames;  //separate table for xhtml() 
route
        static Mutex mangledMutex;
        // this locks both access to mangledNames and counter below
        Mutex::Locker lock(&mangledMutex);
-       MangledMap::const_iterator const it = mangledNames.find(absFileName());
-       if (it != mangledNames.end())
+       MangledMap * mangledNames = encrypt_path ? &mangledEncryptNames  : 
&mangledPlainNames;
+       MangledMap::const_iterator const it = mangledNames->find(absFileName());
+       if (it != mangledNames->end())
                return (*it).second;
 
        string const name = absFileName();
-       // Now the real work. Remove the extension.
-       string mname = support::changeExtension(name, string());
+       string mname;
 
-       if (encrypt_path)
-               mname = "export_" + onlyFileName() + "_" + toHexHash(mname);
+       // xHTML route
+       // we use hash instead of counter to get stable filenames in export 
directory
+       if (encrypt_path) {
+               // sanitization probably not neccessary for xhtml, but won't 
harm
+               string sanfn = support::changeExtension(onlyFileName(), 
string());
+               sanfn = sanitizeFileName(sanfn);
+               // Add the extension back on
+               sanfn = support::changeExtension(sanfn, 
getExtension(onlyFileName()));
 
-       // The mangled name must be a valid LaTeX name.
-       mname = sanitizeFileName(mname);
-       // Add the extension back on
-       mname = support::changeExtension(mname, getExtension(name));
+               //various filesystems have filename limit around 2^8
+               if (sanfn.length() > 230)
+                       sanfn = "";
 
-       // Prepend a counter to the filename. This is necessary to make
-       // the mangled name unique.
-       static int counter = 0;
+               string enc_name = "e_" + toHexHash(name, true) + "_" + sanfn;
+
+               while (collision_exists(mangledEncryptNames, enc_name))
+                       enc_name = "e_" + toHexHash(enc_name, true) + "_" + 
sanfn;
+
+               mname = enc_name;
+       }
+
+       // LaTeX route
+       if (!encrypt_path) {
+               // Now the real work. Remove the extension.
+               mname = support::changeExtension(name, string());
+               // The mangled name must be a valid LaTeX name.
+               mname = sanitizeFileName(mname);
+               // Add the extension back on
+               mname = support::changeExtension(mname, getExtension(name));
+
+               // Prepend a counter to the filename. This is necessary to make
+               // the mangled name unique, see truncation below.
+               //
+               static int counter = 0;
 
-       if (use_counter) {
                ostringstream s;
                s << counter++ << mname;
                mname = s.str();
-       }
 
-       // MiKTeX's YAP (version 2.4.1803) crashes if the file name
-       // is longer than about 160 characters. MiKTeX's pdflatex
-       // is even pickier. A maximum length of 100 has been proven to work.
-       // If dir.size() > max length, all bets are off for YAP. We truncate
-       // the filename nevertheless, keeping a minimum of 10 chars.
 
-       string::size_type max_length = max(100 - ((int)dir.size() + 1), 10);
+               // MiKTeX's YAP (version 2.4.1803) crashes if the file name
+               // is longer than about 160 characters. MiKTeX's pdflatex
+               // is even pickier. A maximum length of 100 has been proven to 
work.
+               // If dir.size() > max length, all bets are off for YAP. We 
truncate
+               // the filename nevertheless, keeping a minimum of 10 chars.
 
-       // If the mangled file name is too long, hack it to fit.
-       // We know we're guaranteed to have a unique file name because
-       // of the counter.
-       if (mname.size() > max_length) {
-               int const half = (int(max_length) / 2) - 2;
-               if (half > 0) {
-                       mname = mname.substr(0, half) + "___" +
-                               mname.substr(mname.size() - half);
+               string::size_type max_length = max(100 - ((int)dir.size() + 1), 
10);
+
+               // If the mangled file name is too long, hack it to fit.
+               // We know we're guaranteed to have a unique file name because
+               // of the counter.
+               if (mname.size() > max_length) {
+                       int const half = (int(max_length) / 2) - 2;
+                       if (half > 0) {
+                               mname = mname.substr(0, half) + "___" +
+                                       mname.substr(mname.size() - half);
+                       }
                }
        }
 
-       mangledNames[absFileName()] = mname;
+       (*mangledNames)[absFileName()] = mname;
        return mname;
 }
 
diff --git a/src/support/FileName.h b/src/support/FileName.h
index 95940a9235..35f182e428 100644
--- a/src/support/FileName.h
+++ b/src/support/FileName.h
@@ -288,19 +288,30 @@ public:
         *  - two FileName instances with the same filename have identical
         *    mangled names.
         *
+        *  @param encrypt_path will use short hash instead of path
+        *  in the mangled name if set to true. Useful for xHTML export
+        *  so we do not leak e.g. user names contained in the paths.
+        *  Filename itself is stripped if the resulting filename would
+        *  become too long (~250 chars).
+        *  Prefix counter is not used because
+        *  1) it's hack useful for MikTeX/YAP only
+        *  2) causes many duplicates within the export directory across 
different
+        *     LyX sessions as (unlike in LaTeX export) we use mangled names in
+        *     final xHTML export directory.
+        *  An example of hashed mangled case:
+        *  C:/foo bar/baz.png - > e_95a42ec852ea_baz.png
+        *
+        *  It is guaranteed that
+        *  - two different filenames have different mangled names (modulo hash 
collision)
+        *  - two FileName instances with the same filename have identical 
hashed
+        *    mangled names.
+        *
+        *
         *  Only the mangled file name is returned. It is not prepended
         *  with @c dir.
         */
        std::string
-       mangledFileName(std::string const & dir = empty_string()) const;
-
-       /** Identical to mangledFileName, with the following additions:
-       *
-       * @encrypt_path allows using hash (SHA-256) instead of full path.
-       * @use_counter allows disabling the counter in the filename.
-       */
-       std::string
-       mangledFileName(std::string const & dir, bool use_counter, bool 
encrypt_path) const;
+       mangledFileName(std::string const & dir = empty_string(), bool 
encrypt_path=false) const;
 
        /// \return the absolute file name without its .gz, .z, .Z extension
        std::string unzippedFileName() const;
diff --git a/src/support/filetools.cpp b/src/support/filetools.cpp
index e8f3aa19ce..a2eba83eef 100644
--- a/src/support/filetools.cpp
+++ b/src/support/filetools.cpp
@@ -1334,13 +1334,22 @@ void fileUnlock(int fd, const char * /* lock_file*/)
 }
 
 
-std::string toHexHash(const std::string & str)
+std::string toHexHash(const std::string & str, bool shorten)
 {
-       // Use the best available hashing algorithm.
        auto hashAlgo = QCryptographicHash::Sha256;
 
        QByteArray hash = QCryptographicHash::hash(toqstr(str).toLocal8Bit(), 
hashAlgo);
-       return fromqstr(QString(hash.toHex()));
+       QString qshash=QString(hash.toHex());
+
+       /* For shortened case we take 12 leftmost chars (6 bytes encoded).
+        * Random experiment shows:
+        *  8 chars: 16 collisions for 10^5 graphic filenames
+        * 12 chars:  0 collisions for 10^5 graphic filenames
+        */
+       if (shorten)
+               qshash=qshash.left(12);
+
+       return fromqstr(qshash);
 }
 
 
diff --git a/src/support/filetools.h b/src/support/filetools.h
index 205e7ff589..ceb1da62be 100644
--- a/src/support/filetools.h
+++ b/src/support/filetools.h
@@ -348,14 +348,16 @@ int fileLock(const char * lock_file);
 void fileUnlock(int fd, const char * lock_file);
 
 /** Return the hex-encoded cryptographic hash of a string.
- * The hash algorithm is not fixed, but it is determined at compile time.
  * This function is typically used to create relatively stable file names,
  * because cryptographic hash functions ensure that very small changes in the
  * input result in large changes in the output.
+ * Due to inherent limits of path length in Windows we allow very short version
+ * (effectively leftmost 6 bytes encoded via 12 chars in hex) while increasing 
the
+ * probability of collision (via shorten=true).
  * There is no limit in the length of the input string: it can be a file name
  * or the contents of a file, for instance.
  */
-std::string toHexHash(const std::string & str);
+std::string toHexHash(const std::string & str, bool shorten=false);
 
 /// Replace non-ASCII characters to ensure that the string can be used as a
 /// file name on all platforms and as a LaTeX name.

-- 
lyx-devel mailing list
lyx-devel@lists.lyx.org
http://lists.lyx.org/mailman/listinfo/lyx-devel

Re: Mangled file names in temp dir

Reply via email to