On 2024/7/19 16:19, Gao Xiang wrote:
We need that because in the incremental build scenario, the `i_srcpath` of the inodes in tarerofs_parse_tar() could be "denormalized" (e.g., ./data/1.txt) while the path in erofs_rebuild_load_basedir() is "normalized" (e.g., /data/1.txt). This inconsistency might cause the bloom filter to determine that the path does not exist, thereby preventing it from traversing the `/data` directory, which would result in an error.On 2024/7/18 13:40, Hongzhen Luo wrote:Add a bloom filter to exclude entries that are not in `pwd->i_subdirs`, thereby improving the performance of `erofs_rebuild_get_dentry`. Below are the results for different # of files in the same directory: +---------+--------------------+ | # files | time reduction (%) | +---------+--------------------+ | 1e4 | 55% | +---------+--------------------+ | 1e5 | 98% | +---------+--------------------+ | 2e5 | 98% | +---------+--------------------+ | 3e5 | 99% | +---------+--------------------+ Signed-off-by: Hongzhen Luo <hongz...@linux.alibaba.com> --- lib/rebuild.c | 61 ++++++++++++++++++++++++++++++++++++++++++++------- lib/super.c | 2 ++ mkfs/main.c | 4 ++++ 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/lib/rebuild.c b/lib/rebuild.c index 9e8bf8f..3fd3ea0 100644 --- a/lib/rebuild.c +++ b/lib/rebuild.c @@ -15,6 +15,7 @@ #include "erofs/xattr.h" #include "erofs/blobchunk.h" #include "erofs/internal.h" +#include "erofs/bloom_filter.h" #include "liberofs_uuid.h" #ifdef HAVE_LINUX_AUFS_TYPE_H@@ -62,14 +63,48 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,char *path, bool aufs, bool *whout, bool *opq, bool to_head) { struct erofs_dentry *d = NULL; - unsigned int len = strlen(path); - char *s = path; + unsigned int len, p = 0; + char *s; + struct erofs_sb_info *sbi = pwd->sbi; + char buf[4096]; *whout = false; *opq = false; + s = pwd->i_srcpath; + len = strlen(pwd->i_srcpath); + /* normalize the pwd path, e.g., /./../xxx/yyy -> /xxx/yyy */ + buf[p++] = '/'; + while (s < pwd->i_srcpath + len) {Why should we need that?
Sure. I plan to use an erofs_init_super() function to initialize the relevant fields of sbi. However, other initialization functions like erofs_buffer_init() are coupled with the main() function, so it might not be easy to wrap them into this function.+ char *slash = memchr(s, '/', pwd->i_srcpath + len - s); + if (slash) { + if (s == slash) { + while(*++s == '/'); + continue;; + } + *slash = '\0'; + } + if (memcmp(s, ".", 2) && memcmp(s, "..", 3)) { + memcpy(buf + p, s, strlen(s)); + p += strlen(s); + buf[p++] = '/'; + + } + if (slash) { + *slash = '/'; + s = slash + 1; + } else{ + break; + } + } + if (buf[p - 1] != '/') + buf[p++] = '/'; + + s = path; + len = strlen(path); while (s < path + len) { char *slash = memchr(s, '/', path + len - s); + int bloom, slen; if (slash) { if (s == slash) {@@ -97,13 +132,19 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,} } - list_for_each_entry(d, &pwd->i_subdirs, d_child) { - if (!strcmp(d->name, s)) { - if (d->type != EROFS_FT_DIR && slash) - return ERR_PTR(-EIO); - inode = d->inode; - break; + slen = strlen(s); + memcpy(buf + p, s, slen); + p += slen; + if ((bloom = erofs_bloom_test(sbi, buf, p)) > 0) { + list_for_each_entry(d, &pwd->i_subdirs, d_child) { + if (!strcmp(d->name, s)) { + if (d->type != EROFS_FT_DIR && slash) + return ERR_PTR(-EIO); + inode = d->inode; + break; + } } + } if (inode) {@@ -124,6 +165,10 @@ struct erofs_dentry *erofs_rebuild_get_dentry(struct erofs_inode *pwd,return d; pwd = d->inode; } + if (!bloom && erofs_bloom_add(sbi, buf, p)) + return ERR_PTR(-EINVAL); + if (slash) + buf[p++] = '/'; } if (slash) { *slash = '/'; diff --git a/lib/super.c b/lib/super.c index 45233c4..7289236 100644 --- a/lib/super.c +++ b/lib/super.c @@ -7,6 +7,7 @@ #include "erofs/print.h" #include "erofs/xattr.h" #include "erofs/cache.h" +#include "erofs/bloom_filter.h" static bool check_layout_compatibility(struct erofs_sb_info *sbi, struct erofs_super_block *dsb) @@ -153,6 +154,7 @@ void erofs_put_super(struct erofs_sb_info *sbi) erofs_buffer_exit(sbi->bmgr); sbi->bmgr = NULL; } + erofs_bloom_exit(sbi); }int erofs_writesb(struct erofs_sb_info *sbi, struct erofs_buffer_head *sb_bh,diff --git a/mkfs/main.c b/mkfs/main.c index 20f12fc..27a2ea0 100644 --- a/mkfs/main.c +++ b/mkfs/main.c @@ -31,6 +31,7 @@ #include "../lib/liberofs_private.h" #include "../lib/liberofs_uuid.h" #include "../lib/compressor.h" +#include "erofs/bloom_filter.h" static struct option long_options[] = { {"version", no_argument, 0, 'V'}, @@ -1344,6 +1345,9 @@ int main(int argc, char **argv) } erofs_inode_manager_init(); + srand(time(NULL)); + /* 1000000 should be enough */ + erofs_bloom_init(&g_sbi, 5, 1000000, rand());I don't want to add any new init function anymore, please wrap all init functions into a common one.
How about adding an option to mkfs to allow configuring the number of hash functions and the capacity of the bloom filter? If the user does not specify, a default value will be set. Below is the implementation:Also what's the meaning of hard-coded 1000000?
struct erofs_configure { // ... u32 c_bloom_nrfuc; unsigned long c_bloom_bitsize; }; Thanks, Hongzhen Luo
Thanks, Gao Xiang