Hi Gang, It looks good to me.
On 2019/1/15 16:23, Gang He wrote: > Hello ChangWei, > >>>> On 2019/1/15 at 16:00, in message > <63adc13fd55d6546b7dece290d39e3730127826...@h3cmlb12-ex.srv.huawei-3com.com>, > Changwei Ge <ge.chang...@h3c.com> wrote: >> On 2019/1/15 13:49, Gang He wrote: >>> Hello Changwei, >>> >>>>>> On 2019/1/15 at 11:50, in message >>> <63adc13fd55d6546b7dece290d39e3730127825...@h3cmlb12-ex.srv.huawei-3com.com>, >>> Changwei Ge <ge.chang...@h3c.com> wrote: >>>> Hi Gang, >>>> >>>> Most parts of this patch look sane to me, just a tiny question... >>>> >>>> On 2019/1/11 17:01, Gang He wrote: >>>>> The user reported this problem, the upper application IO >>>>> timed out when fstrim was running on this ocfs2 partition. The >>>>> application monitoring resource agent considered that this >>>>> application did not work, then this node was fenced by the cluster >>>>> brain (e.g. pacemaker). >>>>> The root cause is that fstrim thread always holds main_bm meta-file >>>>> related locks until all the cluster groups are trimmed. >>>>> This patch will make fstrim thread release main_bm meta-file >>>>> related locks when each cluster group is trimmed, this will let >>>>> the current application IO have a chance to claim the clusters from >>>>> main_bm meta-file.
>>>>> >>>>> Signed-off-by: Gang He <g...@suse.com> Reviewed-by: Changwei Ge <ge.chang...@h3c.com> >>>>> --- >>>>> fs/ocfs2/alloc.c | 159 +++++++++++++++++++++++++---------------- >>>>> fs/ocfs2/dlmglue.c | 5 ++ >>>>> fs/ocfs2/ocfs2.h | 1 + >>>>> fs/ocfs2/ocfs2_trace.h | 2 + >>>>> fs/ocfs2/super.c | 2 + >>>>> 5 files changed, 106 insertions(+), 63 deletions(-) >>>>> >>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c >>>>> index d1cbb27808e2..6f0999015a44 100644 >>>>> --- a/fs/ocfs2/alloc.c >>>>> +++ b/fs/ocfs2/alloc.c >>>>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block >>>>> *sb, >>>>> return count; >>>>> } >>>>> >>>>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) >>>>> +static >>>>> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) >>>>> { >>>>> struct ocfs2_super *osb = OCFS2_SB(sb); >>>>> - u64 start, len, trimmed, first_group, last_group, group; >>>>> + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; >>>>> int ret, cnt; >>>>> u32 first_bit, last_bit, minlen; >>>>> struct buffer_head *main_bm_bh = NULL; >>>>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct >>>> fstrim_range *range) >>>>> struct buffer_head *gd_bh = NULL; >>>>> struct ocfs2_dinode *main_bm; >>>>> struct ocfs2_group_desc *gd = NULL; >>>>> - struct ocfs2_trim_fs_info info, *pinfo = NULL; >>>>> >>>>> start = range->start >> osb->s_clustersize_bits; >>>>> len = range->len >> osb->s_clustersize_bits; >>>>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct >>>> fstrim_range *range) >>>>> if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) >>>>> return -EINVAL; >>>>> >>>>> + trace_ocfs2_trim_mainbm(start, len, minlen); >>>>> + >>>>> +next_group: >>>>> main_bm_inode = ocfs2_get_system_file_inode(osb, >>>>> >>>>> GLOBAL_BITMAP_SYSTEM_INODE, >>>>> OCFS2_INVALID_SLOT); >>>>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct >>>> 
fstrim_range *range) >>>>> } >>>>> main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; >>>>> >>>>> - if (start >= le32_to_cpu(main_bm->i_clusters)) { >>>>> - ret = -EINVAL; >>>>> - goto out_unlock; >>>>> - } >>>>> - >>>>> - len = range->len >> osb->s_clustersize_bits; >>>>> - if (start + len > le32_to_cpu(main_bm->i_clusters)) >>>>> - len = le32_to_cpu(main_bm->i_clusters) - start; >>>>> - >>>>> - trace_ocfs2_trim_fs(start, len, minlen); >>>>> - >>>>> - ocfs2_trim_fs_lock_res_init(osb); >>>>> - ret = ocfs2_trim_fs_lock(osb, NULL, 1); >>>>> - if (ret < 0) { >>>>> - if (ret != -EAGAIN) { >>>>> - mlog_errno(ret); >>>>> - ocfs2_trim_fs_lock_res_uninit(osb); >>>>> + /* >>>>> + * Do some check before trim the first group. >>>>> + */ >>>>> + if (!group) { >>>>> + if (start >= le32_to_cpu(main_bm->i_clusters)) { >>>>> + ret = -EINVAL; >>>>> goto out_unlock; >>>>> } >>>>> >>>>> - mlog(ML_NOTICE, "Wait for trim on device (%s) to " >>>>> - "finish, which is running from another node.\n", >>>>> - osb->dev_str); >>>>> - ret = ocfs2_trim_fs_lock(osb, &info, 0); >>>>> - if (ret < 0) { >>>>> - mlog_errno(ret); >>>>> - ocfs2_trim_fs_lock_res_uninit(osb); >>>>> - goto out_unlock; >>>>> - } >>>>> + if (start + len > le32_to_cpu(main_bm->i_clusters)) >>>>> + len = le32_to_cpu(main_bm->i_clusters) - start; >>>>> >>>>> - if (info.tf_valid && info.tf_success && >>>>> - info.tf_start == start && info.tf_len == len && >>>>> - info.tf_minlen == minlen) { >>>>> - /* Avoid sending duplicated trim to a shared device */ >>>>> - mlog(ML_NOTICE, "The same trim on device (%s) was " >>>>> - "just done from node (%u), return.\n", >>>>> - osb->dev_str, info.tf_nodenum); >>>>> - range->len = info.tf_trimlen; >>>>> - goto out_trimunlock; >>>>> - } >>>>> + /* >>>>> + * Determine first and last group to examine based on >>>>> + * start and len >>>>> + */ >>>>> + first_group = ocfs2_which_cluster_group(main_bm_inode, start); >>>>> + if (first_group == osb->first_cluster_group_blkno) >>>>> + first_bit = 
start; >>>>> + else >>>>> + first_bit = start - ocfs2_blocks_to_clusters(sb, >>>>> + first_group); >>>>> + last_group = ocfs2_which_cluster_group(main_bm_inode, >>>>> + start + len - 1); >>>>> + group = first_group; >>>>> } >>>>> >>>>> - info.tf_nodenum = osb->node_num; >>>>> - info.tf_start = start; >>>>> - info.tf_len = len; >>>>> - info.tf_minlen = minlen; >>>>> - >>>>> - /* Determine first and last group to examine based on start and len */ >>>>> - first_group = ocfs2_which_cluster_group(main_bm_inode, start); >>>>> - if (first_group == osb->first_cluster_group_blkno) >>>>> - first_bit = start; >>>>> - else >>>>> - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); >>>>> - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); >>>>> - last_bit = osb->bitmap_cpg; >>>>> - >>>>> - trimmed = 0; >>>>> - for (group = first_group; group <= last_group;) { >>>>> + do { >>>>> if (first_bit + len >= osb->bitmap_cpg) >>>>> last_bit = osb->bitmap_cpg; >>>>> else >>>>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct >>>> fstrim_range *range) >>>>> group = ocfs2_clusters_to_blocks(sb, >>>>> osb->bitmap_cpg); >>>>> else >>>>> group += ocfs2_clusters_to_blocks(sb, >>>>> osb->bitmap_cpg); >>>>> - } >>>>> - range->len = trimmed * sb->s_blocksize; >>>>> + } while (0); >>>>> >>>>> - info.tf_trimlen = range->len; >>>>> - info.tf_success = (ret ? 
0 : 1); >>>>> - pinfo = &info; >>>>> -out_trimunlock: >>>>> - ocfs2_trim_fs_unlock(osb, pinfo); >>>>> - ocfs2_trim_fs_lock_res_uninit(osb); >>>>> out_unlock: >>>>> ocfs2_inode_unlock(main_bm_inode, 0); >>>>> brelse(main_bm_bh); >>>>> + main_bm_bh = NULL; >>>>> out_mutex: >>>>> inode_unlock(main_bm_inode); >>>>> iput(main_bm_inode); >>>>> + >>>>> + /* >>>>> + * If all the groups trim are not done or failed, but we should release >>>>> + * main_bm related locks for avoiding the current IO starve, then go to >>>>> + * trim the next group >>>>> + */ >>>>> + if (ret >= 0 && group <= last_group) >>>>> + goto next_group; >>>>> out: >>>>> + range->len = trimmed * sb->s_blocksize; >>>>> + return ret; >>>>> +} >>>>> + >>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) >>>>> +{ >>>>> + int ret; >>>>> + struct ocfs2_super *osb = OCFS2_SB(sb); >>>>> + struct ocfs2_trim_fs_info info, *pinfo = NULL; >>>>> + >>>>> + ocfs2_trim_fs_lock_res_init(osb); >>>>> + >>>>> + trace_ocfs2_trim_fs(range->start, range->len, range->minlen); >>>>> + >>>>> + ret = ocfs2_trim_fs_lock(osb, NULL, 1); >>>>> + if (ret < 0) { >>>>> + if (ret != -EAGAIN) { >>>>> + mlog_errno(ret); >>>>> + ocfs2_trim_fs_lock_res_uninit(osb); >>>>> + return ret; >>>>> + } >>>>> + >>>>> + mlog(ML_NOTICE, "Wait for trim on device (%s) to " >>>>> + "finish, which is running from another node.\n", >>>>> + osb->dev_str); >>>>> + ret = ocfs2_trim_fs_lock(osb, &info, 0); >>>>> + if (ret < 0) { >>>>> + mlog_errno(ret); >>>>> + ocfs2_trim_fs_lock_res_uninit(osb); >>>>> + return ret; >>>>> + } >>>>> + >>>>> + if (info.tf_valid && info.tf_success && >>>>> + info.tf_start == range->start && >>>>> + info.tf_len == range->len && >>>>> + info.tf_minlen == range->minlen) { >>>>> + /* Avoid sending duplicated trim to a shared device */ >>>>> + mlog(ML_NOTICE, "The same trim on device (%s) was " >>>>> + "just done from node (%u), return.\n", >>>>> + osb->dev_str, info.tf_nodenum); >>>>> + range->len = 
info.tf_trimlen; >>>>> + goto out; >>>>> + } >>>>> + } >>>>> + >>>>> + info.tf_nodenum = osb->node_num; >>>>> + info.tf_start = range->start; >>>>> + info.tf_len = range->len; >>>>> + info.tf_minlen = range->minlen; >>>>> + >>>>> + ret = ocfs2_trim_mainbm(sb, range); >>>>> + >>>>> + info.tf_trimlen = range->len; >>>>> + info.tf_success = (ret < 0 ? 0 : 1); >>>>> + pinfo = &info; >>>>> +out: >>>>> + ocfs2_trim_fs_unlock(osb, pinfo); >>>>> + ocfs2_trim_fs_lock_res_uninit(osb); >>>>> return ret; >>>>> } >>>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c >>>>> index 7c835824247e..af405586c5b1 100644 >>>>> --- a/fs/ocfs2/dlmglue.c >>>>> +++ b/fs/ocfs2/dlmglue.c >>>>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super >>>>> *osb) >>>>> { >>>>> struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; >>>>> >>>>> + /* Only one trimfs thread are allowed to work at the same time. */ >>>>> + mutex_lock(&osb->obs_trim_fs_mutex); >>>>> + >>>> >>>> Cluster lock of fstrim have a trylock behavior, will it be better if we >>>> trylock here? >>> Here, I prefer to just serialize fstrim threads on the local node to >> simplify the code logic, >>> maybe the user want to do like that, although this behavior is not >> recommended. >>> You know, on one node, ideally, the user should call fstrim command once >> regularly. >>> If he calls fstrim command more times in a very short time, >>> the code will not make each fstrim command return to failure, just do the >> fstrim task one by one. >> >> I have a thought having nothing to do with your patch. >> Do you think it's possible for us to implement *discard on unlink file* like >> ext4 does. >> So the application doesn't have to invoke fstrim periodically. > Yes, we can do some investigation for adding a mount option "discard" to > support discard a file when it is deleted. > This can be considered as another feature, since its discard occasion and > granularity is different with fstrim. 
> The scheduled fstrim command can be considered as a traditional file system > level trim. > If the file system supports the on-demand discard when some blocks were > released, that's better. > Of course, these two features can coexist. Good to hear this. :-) > > Thanks > Gang > >> >> Thanks, >> Changwei >> >>> >>> Thanks >>> Gang >>> >>>> >>>> Thanks, >>>> Changwei >>>> >>>>> ocfs2_lock_res_init_once(lockres); >>>>> ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, >>>>> lockres->l_name); >>>>> ocfs2_lock_res_init_common(osb, lockres, >>>>> OCFS2_LOCK_TYPE_TRIM_FS, >>>>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super >>>> *osb) >>>>> >>>>> ocfs2_simple_drop_lockres(osb, lockres); >>>>> ocfs2_lock_res_free(lockres); >>>>> + >>>>> + mutex_unlock(&osb->obs_trim_fs_mutex); >>>>> } >>>>> >>>>> static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res >>>>> *res, >>>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h >>>>> index 4f86ac0027b5..1f029fbe8b8d 100644 >>>>> --- a/fs/ocfs2/ocfs2.h >>>>> +++ b/fs/ocfs2/ocfs2.h >>>>> @@ -407,6 +407,7 @@ struct ocfs2_super >>>>> struct ocfs2_lock_res osb_rename_lockres; >>>>> struct ocfs2_lock_res osb_nfs_sync_lockres; >>>>> struct ocfs2_lock_res osb_trim_fs_lockres; >>>>> + struct mutex obs_trim_fs_mutex; >>>>> struct ocfs2_dlm_debug *osb_dlm_debug; >>>>> >>>>> struct dentry *osb_debug_root; >>>>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h >>>>> index 2ee76a90ba8f..dc4bce1649c1 100644 >>>>> --- a/fs/ocfs2/ocfs2_trace.h >>>>> +++ b/fs/ocfs2/ocfs2_trace.h >>>>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent, >>>>> >>>>> DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); >>>>> >>>>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); >>>>> + >>>>> DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); >>>>> >>>>> /* End of trace events for fs/ocfs2/alloc.c.
*/ >>>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c >>>>> index 3415e0b09398..96ae7cedd487 100644 >>>>> --- a/fs/ocfs2/super.c >>>>> +++ b/fs/ocfs2/super.c >>>>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block >>>>> *sb) >>>>> if (ocfs2_is_hard_readonly(osb)) >>>>> goto leave; >>>>> >>>>> + mutex_init(&osb->obs_trim_fs_mutex); >>>>> + >>>>> status = ocfs2_dlm_init(osb); >>>>> if (status < 0) { >>>>> mlog_errno(status); >>>>> >>> >