Hi Gang,

It looks good to me.

On 2019/1/15 16:23, Gang He wrote:
> Hello ChangWei,
> 
>>>> On 2019/1/15 at 16:00, in message
> <63adc13fd55d6546b7dece290d39e3730127826...@h3cmlb12-ex.srv.huawei-3com.com>,
> Changwei Ge <ge.chang...@h3c.com> wrote:
>> On 2019/1/15 13:49, Gang He wrote:
>>> Hello Changwei,
>>>
>>>>>> On 2019/1/15 at 11:50, in message
>>> <63adc13fd55d6546b7dece290d39e3730127825...@h3cmlb12-ex.srv.huawei-3com.com>,
>>> Changwei Ge <ge.chang...@h3c.com> wrote:
>>>> Hi Gang,
>>>>
>>>> Most parts of this patch look sane to me, just a tiny question...
>>>>
>>>> On 2019/1/11 17:01, Gang He wrote:
>>>>> A user reported this problem: the upper application's IO timed
>>>>> out while fstrim was running on this ocfs2 partition. The
>>>>> application monitoring resource agent considered that the
>>>>> application was not working, so this node was fenced by the cluster
>>>>> brain (e.g. pacemaker).
>>>>> The root cause is that the fstrim thread always holds the main_bm
>>>>> meta-file related locks until all the cluster groups are trimmed.
>>>>> This patch makes the fstrim thread release the main_bm meta-file
>>>>> related locks after each cluster group is trimmed, which gives
>>>>> the current application IO a chance to claim clusters from the
>>>>> main_bm meta-file.
>>>>>
>>>>> Signed-off-by: Gang He <g...@suse.com>

Reviewed-by: Changwei Ge <ge.chang...@h3c.com>

>>>>> ---
>>>>>     fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>>>>>     fs/ocfs2/dlmglue.c     |   5 ++
>>>>>     fs/ocfs2/ocfs2.h       |   1 +
>>>>>     fs/ocfs2/ocfs2_trace.h |   2 +
>>>>>     fs/ocfs2/super.c       |   2 +
>>>>>     5 files changed, 106 insertions(+), 63 deletions(-)
>>>>>
>>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>>> index d1cbb27808e2..6f0999015a44 100644
>>>>> --- a/fs/ocfs2/alloc.c
>>>>> +++ b/fs/ocfs2/alloc.c
>>>>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block 
>>>>> *sb,
>>>>>           return count;
>>>>>     }
>>>>>     
>>>>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>>> +static
>>>>> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>>>>>     {
>>>>>           struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>> - u64 start, len, trimmed, first_group, last_group, group;
>>>>> + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>>>>>           int ret, cnt;
>>>>>           u32 first_bit, last_bit, minlen;
>>>>>           struct buffer_head *main_bm_bh = NULL;
>>>>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>           struct buffer_head *gd_bh = NULL;
>>>>>           struct ocfs2_dinode *main_bm;
>>>>>           struct ocfs2_group_desc *gd = NULL;
>>>>> - struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>>>     
>>>>>           start = range->start >> osb->s_clustersize_bits;
>>>>>           len = range->len >> osb->s_clustersize_bits;
>>>>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>           if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>>>>>                   return -EINVAL;
>>>>>     
>>>>> + trace_ocfs2_trim_mainbm(start, len, minlen);
>>>>> +
>>>>> +next_group:
>>>>>           main_bm_inode = ocfs2_get_system_file_inode(osb,
>>>>>                                                       
>>>>> GLOBAL_BITMAP_SYSTEM_INODE,
>>>>>                                                       OCFS2_INVALID_SLOT);
>>>>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>           }
>>>>>           main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>>>>     
>>>>> - if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>>> -         ret = -EINVAL;
>>>>> -         goto out_unlock;
>>>>> - }
>>>>> -
>>>>> - len = range->len >> osb->s_clustersize_bits;
>>>>> - if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>>> -         len = le32_to_cpu(main_bm->i_clusters) - start;
>>>>> -
>>>>> - trace_ocfs2_trim_fs(start, len, minlen);
>>>>> -
>>>>> - ocfs2_trim_fs_lock_res_init(osb);
>>>>> - ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>>> - if (ret < 0) {
>>>>> -         if (ret != -EAGAIN) {
>>>>> -                 mlog_errno(ret);
>>>>> -                 ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> + /*
>>>>> +  * Do some check before trim the first group.
>>>>> +  */
>>>>> + if (!group) {
>>>>> +         if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>>> +                 ret = -EINVAL;
>>>>>                           goto out_unlock;
>>>>>                   }
>>>>>     
>>>>> -         mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>>> -              "finish, which is running from another node.\n",
>>>>> -              osb->dev_str);
>>>>> -         ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>>> -         if (ret < 0) {
>>>>> -                 mlog_errno(ret);
>>>>> -                 ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> -                 goto out_unlock;
>>>>> -         }
>>>>> +         if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>>> +                 len = le32_to_cpu(main_bm->i_clusters) - start;
>>>>>     
>>>>> -         if (info.tf_valid && info.tf_success &&
>>>>> -             info.tf_start == start && info.tf_len == len &&
>>>>> -             info.tf_minlen == minlen) {
>>>>> -                 /* Avoid sending duplicated trim to a shared device */
>>>>> -                 mlog(ML_NOTICE, "The same trim on device (%s) was "
>>>>> -                      "just done from node (%u), return.\n",
>>>>> -                      osb->dev_str, info.tf_nodenum);
>>>>> -                 range->len = info.tf_trimlen;
>>>>> -                 goto out_trimunlock;
>>>>> -         }
>>>>> +         /*
>>>>> +          * Determine first and last group to examine based on
>>>>> +          * start and len
>>>>> +          */
>>>>> +         first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>>>> +         if (first_group == osb->first_cluster_group_blkno)
>>>>> +                 first_bit = start;
>>>>> +         else
>>>>> +                 first_bit = start - ocfs2_blocks_to_clusters(sb,
>>>>> +                                                         first_group);
>>>>> +         last_group = ocfs2_which_cluster_group(main_bm_inode,
>>>>> +                                                start + len - 1);
>>>>> +         group = first_group;
>>>>>           }
>>>>>     
>>>>> - info.tf_nodenum = osb->node_num;
>>>>> - info.tf_start = start;
>>>>> - info.tf_len = len;
>>>>> - info.tf_minlen = minlen;
>>>>> -
>>>>> - /* Determine first and last group to examine based on start and len */
>>>>> - first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>>>> - if (first_group == osb->first_cluster_group_blkno)
>>>>> -         first_bit = start;
>>>>> - else
>>>>> -         first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>>>>> - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
>>>>> - last_bit = osb->bitmap_cpg;
>>>>> -
>>>>> - trimmed = 0;
>>>>> - for (group = first_group; group <= last_group;) {
>>>>> + do {
>>>>>                   if (first_bit + len >= osb->bitmap_cpg)
>>>>>                           last_bit = osb->bitmap_cpg;
>>>>>                   else
>>>>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>>> fstrim_range *range)
>>>>>                           group = ocfs2_clusters_to_blocks(sb, 
>>>>> osb->bitmap_cpg);
>>>>>                   else
>>>>>                           group += ocfs2_clusters_to_blocks(sb, 
>>>>> osb->bitmap_cpg);
>>>>> - }
>>>>> - range->len = trimmed * sb->s_blocksize;
>>>>> + } while (0);
>>>>>     
>>>>> - info.tf_trimlen = range->len;
>>>>> - info.tf_success = (ret ? 0 : 1);
>>>>> - pinfo = &info;
>>>>> -out_trimunlock:
>>>>> - ocfs2_trim_fs_unlock(osb, pinfo);
>>>>> - ocfs2_trim_fs_lock_res_uninit(osb);
>>>>>     out_unlock:
>>>>>           ocfs2_inode_unlock(main_bm_inode, 0);
>>>>>           brelse(main_bm_bh);
>>>>> + main_bm_bh = NULL;
>>>>>     out_mutex:
>>>>>           inode_unlock(main_bm_inode);
>>>>>           iput(main_bm_inode);
>>>>> +
>>>>> + /*
>>>>> +  * If all the groups trim are not done or failed, but we should release
>>>>> +  * main_bm related locks for avoiding the current IO starve, then go to
>>>>> +  * trim the next group
>>>>> +  */
>>>>> + if (ret >= 0 && group <= last_group)
>>>>> +         goto next_group;
>>>>>     out:
>>>>> + range->len = trimmed * sb->s_blocksize;
>>>>> + return ret;
>>>>> +}
>>>>> +
>>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>>>> +{
>>>>> + int ret;
>>>>> + struct ocfs2_super *osb = OCFS2_SB(sb);
>>>>> + struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>>> +
>>>>> + ocfs2_trim_fs_lock_res_init(osb);
>>>>> +
>>>>> + trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
>>>>> +
>>>>> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>>> + if (ret < 0) {
>>>>> +         if (ret != -EAGAIN) {
>>>>> +                 mlog_errno(ret);
>>>>> +                 ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> +                 return ret;
>>>>> +         }
>>>>> +
>>>>> +         mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>>> +              "finish, which is running from another node.\n",
>>>>> +              osb->dev_str);
>>>>> +         ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>>> +         if (ret < 0) {
>>>>> +                 mlog_errno(ret);
>>>>> +                 ocfs2_trim_fs_lock_res_uninit(osb);
>>>>> +                 return ret;
>>>>> +         }
>>>>> +
>>>>> +         if (info.tf_valid && info.tf_success &&
>>>>> +             info.tf_start == range->start &&
>>>>> +             info.tf_len == range->len &&
>>>>> +             info.tf_minlen == range->minlen) {
>>>>> +                 /* Avoid sending duplicated trim to a shared device */
>>>>> +                 mlog(ML_NOTICE, "The same trim on device (%s) was "
>>>>> +                      "just done from node (%u), return.\n",
>>>>> +                      osb->dev_str, info.tf_nodenum);
>>>>> +                 range->len = info.tf_trimlen;
>>>>> +                 goto out;
>>>>> +         }
>>>>> + }
>>>>> +
>>>>> + info.tf_nodenum = osb->node_num;
>>>>> + info.tf_start = range->start;
>>>>> + info.tf_len = range->len;
>>>>> + info.tf_minlen = range->minlen;
>>>>> +
>>>>> + ret = ocfs2_trim_mainbm(sb, range);
>>>>> +
>>>>> + info.tf_trimlen = range->len;
>>>>> + info.tf_success = (ret < 0 ? 0 : 1);
>>>>> + pinfo = &info;
>>>>> +out:
>>>>> + ocfs2_trim_fs_unlock(osb, pinfo);
>>>>> + ocfs2_trim_fs_lock_res_uninit(osb);
>>>>>           return ret;
>>>>>     }
>>>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>>>> index 7c835824247e..af405586c5b1 100644
>>>>> --- a/fs/ocfs2/dlmglue.c
>>>>> +++ b/fs/ocfs2/dlmglue.c
>>>>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super 
>>>>> *osb)
>>>>>     {
>>>>>           struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>>>>>     
>>>>> + /* Only one trimfs thread are allowed to work at the same time. */
>>>>> + mutex_lock(&osb->obs_trim_fs_mutex);
>>>>> +
>>>>
>>>> The fstrim cluster lock has trylock behavior; would it be better if we
>>>> used trylock here as well?
>>> Here, I prefer to simply serialize fstrim threads on the local node, to
>>> keep the code logic simple.
>>> Maybe the user wants to do that, although this behavior is not
>>> recommended.
>>> Ideally, on one node, the user should run the fstrim command once at
>>> regular intervals.
>>> If the user runs the fstrim command several times within a very short
>>> period, the code will not make each fstrim command fail; it will just
>>> perform the fstrim tasks one by one.
>>
>> I have a thought that has nothing to do with your patch.
>> Do you think it would be possible for us to implement *discard on file
>> unlink*, like ext4 does?
>> Then the application wouldn't have to invoke fstrim periodically.
> Yes, we can investigate adding a "discard" mount option to
> support discarding a file when it is deleted.
> This can be considered a separate feature, since its discard timing and
> granularity are different from those of fstrim.
> The scheduled fstrim command can be considered a traditional
> file-system-level trim.
> If the file system also supports on-demand discard when blocks are
> released, that's even better.
> Of course, these two features can coexist.

Good to hear this. :-)

> 
> Thanks
> Gang
> 
>>
>> Thanks,
>> Changwei
>>
>>>
>>> Thanks
>>> Gang
>>>
>>>>
>>>> Thanks,
>>>> Changwei
>>>>
>>>>>           ocfs2_lock_res_init_once(lockres);
>>>>>           ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, 
>>>>> lockres->l_name);
>>>>>           ocfs2_lock_res_init_common(osb, lockres, 
>>>>> OCFS2_LOCK_TYPE_TRIM_FS,
>>>>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super
>>>> *osb)
>>>>>     
>>>>>           ocfs2_simple_drop_lockres(osb, lockres);
>>>>>           ocfs2_lock_res_free(lockres);
>>>>> +
>>>>> + mutex_unlock(&osb->obs_trim_fs_mutex);
>>>>>     }
>>>>>     
>>>>>     static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res 
>>>>> *res,
>>>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>>>> index 4f86ac0027b5..1f029fbe8b8d 100644
>>>>> --- a/fs/ocfs2/ocfs2.h
>>>>> +++ b/fs/ocfs2/ocfs2.h
>>>>> @@ -407,6 +407,7 @@ struct ocfs2_super
>>>>>           struct ocfs2_lock_res osb_rename_lockres;
>>>>>           struct ocfs2_lock_res osb_nfs_sync_lockres;
>>>>>           struct ocfs2_lock_res osb_trim_fs_lockres;
>>>>> + struct mutex obs_trim_fs_mutex;
>>>>>           struct ocfs2_dlm_debug *osb_dlm_debug;
>>>>>     
>>>>>           struct dentry *osb_debug_root;
>>>>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
>>>>> index 2ee76a90ba8f..dc4bce1649c1 100644
>>>>> --- a/fs/ocfs2/ocfs2_trace.h
>>>>> +++ b/fs/ocfs2/ocfs2_trace.h
>>>>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>>>>>     
>>>>>     DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>>>>>     
>>>>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
>>>>> +
>>>>>     DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>>>>>     
>>>>>     /* End of trace events for fs/ocfs2/alloc.c. */
>>>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>>>> index 3415e0b09398..96ae7cedd487 100644
>>>>> --- a/fs/ocfs2/super.c
>>>>> +++ b/fs/ocfs2/super.c
>>>>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block 
>>>>> *sb)
>>>>>           if (ocfs2_is_hard_readonly(osb))
>>>>>                   goto leave;
>>>>>     
>>>>> + mutex_init(&osb->obs_trim_fs_mutex);
>>>>> +
>>>>>           status = ocfs2_dlm_init(osb);
>>>>>           if (status < 0) {
>>>>>                   mlog_errno(status);
>>>>>
>>>
> 

Reply via email to