On 2025/2/18 15:12, Hongbo Li wrote:
On 2025/2/18 8:39, Kent Overstreet wrote:
Hongbo - please go over this thoroughly. It turns out (and I should've
caught this before I merged it) your check code in fsck.c was completely
fubar, and wouldn't ever log an error when i_size was off, so bugs
elsewhere would have been masked.
-- >8 --
Error handling was wrong, causing unhandled transaction restart errors.
check_directory_size() was also inefficient, since keys in multiple
snapshots would be iterated over once for every snapshot. Convert it to
the same scheme used for i_sectors and subdir count checking.
I think the efficiency is the same. When dealing with size, the snapshot
has already been updated. Moreover, although there are more items in the
inodes btree, the traversal process is reused.
Cc: Hongbo Li <[email protected]>
Signed-off-by: Kent Overstreet <[email protected]>
---
fs/bcachefs/fsck.c | 78 ++++++++++++++++++----------------------------
1 file changed, 31 insertions(+), 47 deletions(-)
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 53a421ff136d..d406336e9a0a 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -823,6 +823,7 @@ struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
u64 count;
+ u64 i_size;
};
struct inode_walker {
@@ -910,8 +911,9 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct
inode_walker *w, struct bkey_
if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- new.snapshot = k.k->p.snapshot;
- new.count = 0;
+ new.snapshot = k.k->p.snapshot;
+ new.count = 0;
+ new.i_size = 0;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
@@ -1116,37 +1118,6 @@ static int get_snapshot_root_inode(struct
btree_trans *trans,
return ret;
}
-static int check_directory_size(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- struct bkey_s_c inode_k, bool *write_inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 new_size = 0;
- int ret;
-
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot),
- POS(inode_k.k->p.offset, U64_MAX),
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_name(dirent);
-
- new_size += dirent_occupied_size(&name);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && inode_u->bi_size != new_size) {
- inode_u->bi_size = new_size;
- *write_inode = true;
- }
-
- return ret;
-}
-
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_journal_seq = journal_cur_seq(&c->journal);
do_update = true;
}
-
- if (S_ISDIR(u.bi_mode)) {
- ret = check_directory_size(trans, &u, k, &do_update);
-
- fsck_err_on(ret,
- trans, directory_size_mismatch,
- "directory inode %llu:%u with the mismatch directory
size",
- u.bi_inum, k.k->p.snapshot);
It only tries to fix the mismatched i_size, and only logs the error
during recalculating. And why the i_size will be off?
- ret = 0;
- }
do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u);
@@ -2017,10 +1978,31 @@ static int check_subdir_count_notnested(struct
btree_trans *trans, struct inode_
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct
inode_walker *w)
+static int check_dir_i_size_notnested(struct btree_trans *trans,
struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ darray_for_each(w->inodes, i)
+ if (fsck_err_on(i->inode.bi_size != i->i_size,
+ trans, inode_dir_wrong_nlink,
+ "directory %llu:%u with wrong i_nlink: got %u, should
be %llu",
+ w->last_pos.inode, i->snapshot, i->inode.bi_nlink,
i->count)) {
This shows the wrong message. It should use directory_size_mismatch
error and corresponding values.
+ i->inode.bi_size = i->i_size;
+ ret = bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ break;
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_subdir_dirents_count(struct btree_trans *trans,
struct inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
+ check_dir_i_size_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}
@@ -2367,7 +2349,7 @@ static int check_dirent(struct btree_trans
*trans, struct btree_iter *iter,
goto out;
if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
- ret = check_subdir_count(trans, dir);
+ ret = check_subdir_dirents_count(trans, dir);
if (ret)
goto err;
}
@@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans
*trans, struct btree_iter *iter,
if (ret)
goto err;
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
+ if (d.v->d_type == DT_DIR)
i->count++;
+ i->i_size += bkey_bytes(d.k);
We cannot rely on the value of bkey_bytes of the older version (the
older is 0 for directory). The traversal order will affect the accounts
of i_size. We should recalculate the sub-entries.
Got, the last item to be handled was missed. That's :
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index d406336e9a0a..8769ca332619 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2470,7 +2470,8 @@ int bch2_check_dirents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
check_dirent(trans, &iter, k, &hash_info, &dir,
&target, &s)) ?:
- check_subdir_count_notnested(trans, &dir));
+ check_subdir_count_notnested(trans, &dir) ?:
+ check_dir_i_size_notnested(trans, &dir));
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
Thanks,
Hongbo
Also I did the following test:
older version without dir i_size:
├── foo
│ ├── MAINTAINERS
│ ├── foo-2
│ │ ├── MAINTAINERS
│ │ └── profile
│ └── profile
└── lost+found
After fsck, it shows the i_size of foo-2 is still 0.
Thanks,
Hongbo
+ }
out:
err:
fsck_err: