On 2024/7/19 16:19, Gao Xiang wrote:


On 2024/7/18 13:40, Hongzhen Luo wrote:
Add a bloom filter to exclude entries that are not in `pwd->i_subdirs`,
thereby improving the performance of `erofs_rebuild_get_dentry`. Below
are the results for different # of files in the same directory:

+---------+--------------------+
| # files | time reduction (%) |
+---------+--------------------+
|   1e4   |         55%        |
+---------+--------------------+
|   1e5   |         98%        |
+---------+--------------------+
|   2e5   |         98%        |
+---------+--------------------+
|   3e5   |         99%        |
+---------+--------------------+

Signed-off-by: Hongzhen Luo <hongz...@linux.alibaba.com>
---
  lib/rebuild.c | 61 ++++++++++++++++++++++++++++++++++++++++++++-------
  lib/super.c   |  2 ++
  mkfs/main.c   |  4 ++++
  3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/lib/rebuild.c b/lib/rebuild.c
index 9e8bf8f..3fd3ea0 100644
--- a/lib/rebuild.c
+++ b/lib/rebuild.c
@@ -15,6 +15,7 @@
  #include "erofs/xattr.h"
  #include "erofs/blobchunk.h"
  #include "erofs/internal.h"
+#include "erofs/bloom_filter.h"
  #include "liberofs_uuid.h"
    #ifdef HAVE_LINUX_AUFS_TYPE_H
@@ -62,14 +63,48 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
          char *path, bool aufs, bool *whout, bool *opq, bool to_head)
  {
      struct erofs_dentry *d = NULL;
-    unsigned int len = strlen(path);
-    char *s = path;
+    unsigned int len, p = 0;
+    char *s;
+    struct erofs_sb_info *sbi = pwd->sbi;
+    char buf[4096];
        *whout = false;
      *opq = false;
  +    s = pwd->i_srcpath;
+    len = strlen(pwd->i_srcpath);
+    /* normalize the pwd path, e.g., /./../xxx/yyy -> /xxx/yyy */
+    buf[p++] = '/';
+    while (s < pwd->i_srcpath + len) {

Why should we need that?

We need that because in the incremental build scenario, the `i_srcpath` of the inodes in tarerofs_parse_tar() could be "denormalized" (e.g., ./data/1.txt) while the path in erofs_rebuild_load_basedir() is "normalized" (e.g., /data/1.txt). This inconsistency might cause the bloom filter to determine that the path does not exist, thereby preventing it from traversing the `/data` directory, which would result in an error.
+        char *slash = memchr(s, '/', pwd->i_srcpath + len - s);
+        if (slash) {
+            if (s == slash) {
+                while(*++s == '/');
+                continue;;
+            }
+            *slash = '\0';
+        }
+        if (memcmp(s, ".", 2) && memcmp(s, "..", 3)) {
+            memcpy(buf + p, s, strlen(s));
+            p += strlen(s);
+            buf[p++] = '/';
+
+        }
+        if (slash) {
+            *slash = '/';
+            s = slash + 1;
+        } else{
+            break;
+        }
+    }
+    if (buf[p - 1] != '/')
+        buf[p++] = '/';
+
+    s = path;
+    len = strlen(path);
      while (s < path + len) {
          char *slash = memchr(s, '/', path + len - s);
+        int bloom, slen;
            if (slash) {
              if (s == slash) {
@@ -97,13 +132,19 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
                  }
              }
  -            list_for_each_entry(d, &pwd->i_subdirs, d_child) {
-                if (!strcmp(d->name, s)) {
-                    if (d->type != EROFS_FT_DIR && slash)
-                        return ERR_PTR(-EIO);
-                    inode = d->inode;
-                    break;
+            slen = strlen(s);
+            memcpy(buf + p, s, slen);
+            p += slen;
+            if ((bloom = erofs_bloom_test(sbi, buf, p)) > 0) {
+                list_for_each_entry(d, &pwd->i_subdirs, d_child) {
+                    if (!strcmp(d->name, s)) {
+                        if (d->type != EROFS_FT_DIR && slash)
+                            return ERR_PTR(-EIO);
+                        inode = d->inode;
+                        break;
+                    }
                  }
+
              }
                if (inode) {
@@ -124,6 +165,10 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,
                      return d;
                  pwd = d->inode;
              }
+            if (!bloom && erofs_bloom_add(sbi, buf, p))
+                return ERR_PTR(-EINVAL);
+            if (slash)
+                buf[p++] = '/';
          }
          if (slash) {
              *slash = '/';
diff --git a/lib/super.c b/lib/super.c
index 45233c4..7289236 100644
--- a/lib/super.c
+++ b/lib/super.c
@@ -7,6 +7,7 @@
  #include "erofs/print.h"
  #include "erofs/xattr.h"
  #include "erofs/cache.h"
+#include "erofs/bloom_filter.h"
    static bool check_layout_compatibility(struct erofs_sb_info *sbi,
                         struct erofs_super_block *dsb)
@@ -153,6 +154,7 @@ void erofs_put_super(struct erofs_sb_info *sbi)
          erofs_buffer_exit(sbi->bmgr);
          sbi->bmgr = NULL;
      }
+    erofs_bloom_exit(sbi);
  }
    int erofs_writesb(struct erofs_sb_info *sbi, struct erofs_buffer_head *sb_bh,
diff --git a/mkfs/main.c b/mkfs/main.c
index 20f12fc..27a2ea0 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -31,6 +31,7 @@
  #include "../lib/liberofs_private.h"
  #include "../lib/liberofs_uuid.h"
  #include "../lib/compressor.h"
+#include "erofs/bloom_filter.h"
    static struct option long_options[] = {
      {"version", no_argument, 0, 'V'},
@@ -1344,6 +1345,9 @@ int main(int argc, char **argv)
      }
        erofs_inode_manager_init();
+    srand(time(NULL));
+    /* 1000000 should be enough */
+    erofs_bloom_init(&g_sbi, 5, 1000000, rand());

I don't want to add any new init function anymore, please wrap
all init functions into a common one.

Sure. I plan to use an erofs_init_super() function to initialize the relevant fields of sbi. However, other initialization functions like erofs_buffer_init() are coupled with the main() function, so it might not be easy to wrap them into this function.
Also what's the meaning of hard-coded 1000000?

How about adding an option to mkfs to allow configuring the number of hash functions and the capacity of the bloom filter? If the user does not specify, a default value will be set. Below is the implementation:

struct erofs_configure {

// ...

u32 c_bloom_nrfuc;

unsigned long c_bloom_bitsize;

 };

Thanks,

Hongzhen Luo


Thanks,
Gao Xiang

Reply via email to