Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaeg...@kernel.org]
> Sent: Tuesday, September 23, 2014 12:53 PM
> To: linux-kernel@vger.kernel.org; linux-fsde...@vger.kernel.org;
> linux-f2fs-de...@lists.sourceforge.net
> Cc: Jaegeuk Kim
> Subject: [f2fs-dev] [PATCH 3/3] f2fs: refactor flush_nat_entries to remove 
> costly reorganizing
> ops
> 
> Previously, f2fs tries to reorganize the dirty nat entries into multiple sets
> according to its nid ranges. This can improve the flushing nat pages, however,
> if there are a lot of cached nat entries, it becomes a bottleneck.
> 
> This patch introduces a new set management flow by removing dirty nat list and
> adding a series of set operations when the nat entry becomes dirty.
> 
> Signed-off-by: Jaegeuk Kim <jaeg...@kernel.org>
> ---
>  fs/f2fs/f2fs.h |  13 +--
>  fs/f2fs/node.c | 299 
> +++++++++++++++++++++++++++++----------------------------
>  fs/f2fs/node.h |   9 +-
>  3 files changed, 162 insertions(+), 159 deletions(-)
> 
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 7b1e1d2..94cfdc4 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -164,6 +164,9 @@ struct fsync_inode_entry {
>  #define sit_in_journal(sum, i)               (sum->sit_j.entries[i].se)
>  #define segno_in_journal(sum, i)     (sum->sit_j.entries[i].segno)
> 
> +#define MAX_NAT_JENTRIES(sum)        (NAT_JOURNAL_ENTRIES - 
> nats_in_cursum(sum))
> +#define MAX_SIT_JENTRIES(sum)        (SIT_JOURNAL_ENTRIES - 
> sits_in_cursum(sum))
> +
>  static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
>  {
>       int before = nats_in_cursum(rs);
> @@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct 
> f2fs_summary_block *sum, int
> size,
>                                                               int type)
>  {
>       if (type == NAT_JOURNAL)
> -             return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES;
> -
> -     return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES;
> +             return size <= MAX_NAT_JENTRIES(sum);
> +     return size <= MAX_SIT_JENTRIES(sum);
>  }
> 
>  /*
> @@ -292,11 +294,10 @@ struct f2fs_nm_info {
> 
>       /* NAT cache management */
>       struct radix_tree_root nat_root;/* root of the nat entry cache */
> +     struct radix_tree_root nat_set_root;/* root of the nat set cache */
>       rwlock_t nat_tree_lock;         /* protect nat_tree_lock */
> -     unsigned int nat_cnt;           /* the # of cached nat entries */
>       struct list_head nat_entries;   /* cached nat entry list (clean) */
> -     struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
> -     struct list_head nat_entry_set; /* nat entry set list */
> +     unsigned int nat_cnt;           /* the # of cached nat entries */
>       unsigned int dirty_nat_cnt;     /* total num of nat entries in set */
> 
>       /* free node ids management */
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 21ed91b..f5a21f4 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info 
> *nm_i, struct
> nat_entry *e)
>       kmem_cache_free(nat_entry_slab, e);
>  }
> 
> +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> +                                             struct nat_entry *ne)
> +{
> +     nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;

nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);

> +     struct nat_entry_set *head;
> +
> +     if (get_nat_flag(ne, IS_DIRTY))
> +             return;
> +retry:
> +     head = radix_tree_lookup(&nm_i->nat_set_root, set);
> +     if (!head) {
> +             head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
> +
> +             INIT_LIST_HEAD(&head->entry_list);
> +             INIT_LIST_HEAD(&head->set_list);
> +             head->set = set;
> +             head->entry_cnt = 0;
> +
> +             if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
> +                     cond_resched();
> +                     goto retry;
> +             }
> +     }
> +     list_move_tail(&ne->list, &head->entry_list);
> +     nm_i->dirty_nat_cnt++;
> +     head->entry_cnt++;
> +     set_nat_flag(ne, IS_DIRTY, true);
> +}
> +
> +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> +                                             struct nat_entry *ne)
> +{
> +     nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
> +     struct nat_entry_set *head;
> +
> +     head = radix_tree_lookup(&nm_i->nat_set_root, set);
> +     if (head) {
> +             list_move_tail(&ne->list, &nm_i->nat_entries);
> +             set_nat_flag(ne, IS_DIRTY, false);
> +             head->entry_cnt--;
> +             nm_i->dirty_nat_cnt--;
> +     }
> +}
> +
> +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
> +             nid_t start, unsigned int nr, struct nat_entry_set **ep)
> +{
> +     return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
> +                                                     start, nr);
> +}
> +
>  bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
>  {
>       struct f2fs_nm_info *nm_i = NM_I(sbi);
> @@ -1739,79 +1790,6 @@ skip:
>       return err;
>  }
> 
> -static struct nat_entry_set *grab_nat_entry_set(void)
> -{
> -     struct nat_entry_set *nes =
> -                     f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
> -
> -     nes->entry_cnt = 0;
> -     INIT_LIST_HEAD(&nes->set_list);
> -     INIT_LIST_HEAD(&nes->entry_list);
> -     return nes;
> -}
> -
> -static void release_nat_entry_set(struct nat_entry_set *nes,
> -                                             struct f2fs_nm_info *nm_i)
> -{
> -     nm_i->dirty_nat_cnt -= nes->entry_cnt;
> -     list_del(&nes->set_list);
> -     kmem_cache_free(nat_entry_set_slab, nes);
> -}
> -
> -static void adjust_nat_entry_set(struct nat_entry_set *nes,
> -                                             struct list_head *head)
> -{
> -     struct nat_entry_set *next = nes;
> -
> -     if (list_is_last(&nes->set_list, head))
> -             return;
> -
> -     list_for_each_entry_continue(next, head, set_list)
> -             if (nes->entry_cnt <= next->entry_cnt)
> -                     break;
> -
> -     list_move_tail(&nes->set_list, &next->set_list);
> -}
> -
> -static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
> -{
> -     struct nat_entry_set *nes;
> -     nid_t start_nid = START_NID(ne->ni.nid);
> -
> -     list_for_each_entry(nes, head, set_list) {
> -             if (nes->start_nid == start_nid) {
> -                     list_move_tail(&ne->list, &nes->entry_list);
> -                     nes->entry_cnt++;
> -                     adjust_nat_entry_set(nes, head);
> -                     return;
> -             }
> -     }
> -
> -     nes = grab_nat_entry_set();
> -
> -     nes->start_nid = start_nid;
> -     list_move_tail(&ne->list, &nes->entry_list);
> -     nes->entry_cnt++;
> -     list_add(&nes->set_list, head);
> -}
> -
> -static void merge_nats_in_set(struct f2fs_sb_info *sbi)
> -{
> -     struct f2fs_nm_info *nm_i = NM_I(sbi);
> -     struct list_head *dirty_list = &nm_i->dirty_nat_entries;
> -     struct list_head *set_list = &nm_i->nat_entry_set;
> -     struct nat_entry *ne, *tmp;
> -
> -     write_lock(&nm_i->nat_tree_lock);
> -     list_for_each_entry_safe(ne, tmp, dirty_list, list) {
> -             if (nat_get_blkaddr(ne) == NEW_ADDR)
> -                     continue;

Shouldn't we move this condition judgment into __flush_nat_entry_set?

> -             add_nat_entry(ne, set_list);
> -             nm_i->dirty_nat_cnt++;
> -     }
> -     write_unlock(&nm_i->nat_tree_lock);
> -}
> -
>  static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>  {
>       struct f2fs_nm_info *nm_i = NM_I(sbi);
> @@ -1846,101 +1824,129 @@ found:
>       mutex_unlock(&curseg->curseg_mutex);
>  }
> 
> -/*
> - * This function is called during the checkpointing process.
> - */
> -void flush_nat_entries(struct f2fs_sb_info *sbi)
> +static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> +                                             struct list_head *head, int max)
>  {
> -     struct f2fs_nm_info *nm_i = NM_I(sbi);
> -     struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> -     struct f2fs_summary_block *sum = curseg->sum_blk;
> -     struct nat_entry_set *nes, *tmp;
> -     struct list_head *head = &nm_i->nat_entry_set;
> -     bool to_journal = true;
> +     struct nat_entry_set *cur;
> +     nid_t dirty_cnt = 0;
> 
> -     /* merge nat entries of dirty list to nat entry set temporarily */
> -     merge_nats_in_set(sbi);
> +     if (nes->entry_cnt >= max)
> +             goto add_out;
> 
> -     /*
> -      * if there are no enough space in journal to store dirty nat
> -      * entries, remove all entries from journal and merge them
> -      * into nat entry set.
> -      */
> -     if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) {
> -             remove_nats_in_journal(sbi);
> -
> -             /*
> -              * merge nat entries of dirty list to nat entry set temporarily
> -              */
> -             merge_nats_in_set(sbi);
> +     list_for_each_entry(cur, head, set_list) {
> +             dirty_cnt += cur->entry_cnt;
> +             if (dirty_cnt > max)
> +                     break;

Seems a problem here, For example:
set no:         1       2       3       4       5       6       7
nat number:     4       3       2       1       1       1       1
max = 6
set list will be: 3--4--2--1--1--1--1
Then we will prior flush nat set with more entries in the set list, result in
a degradation.
We'd better not break the adjustment until we finish to traverse the whole
sequential sorted set list to avoid above condition, so how about removing the
codes relate to dirty_cnt?

Thanks,
Yu

> +             if (cur->entry_cnt >= nes->entry_cnt) {
> +                     list_add(&nes->set_list, cur->set_list.prev);
> +                     return;
> +             }
>       }
> +add_out:
> +     list_add_tail(&nes->set_list, head);
> +}
> 
> -     if (!nm_i->dirty_nat_cnt)
> -             return;
> +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> +                                     struct nat_entry_set *set)
> +{
> +     struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> +     struct f2fs_summary_block *sum = curseg->sum_blk;
> +     nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
> +     bool to_journal = true;
> +     struct f2fs_nat_block *nat_blk;
> +     struct nat_entry *ne, *cur;
> +     struct page *page = NULL;
> 
>       /*
>        * there are two steps to flush nat entries:
>        * #1, flush nat entries to journal in current hot data summary block.
>        * #2, flush nat entries to nat page.
>        */
> -     list_for_each_entry_safe(nes, tmp, head, set_list) {
> -             struct f2fs_nat_block *nat_blk;
> -             struct nat_entry *ne, *cur;
> -             struct page *page;
> -             nid_t start_nid = nes->start_nid;
> +     if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
> +             to_journal = false;
> 
> -             if (to_journal &&
> -                     !__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL))
> -                     to_journal = false;
> +     if (to_journal) {
> +             mutex_lock(&curseg->curseg_mutex);
> +     } else {
> +             page = get_next_nat_page(sbi, start_nid);
> +             nat_blk = page_address(page);
> +             f2fs_bug_on(sbi, !nat_blk);
> +     }
> +
> +     /* flush dirty nats in nat entry set */
> +     list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
> +             struct f2fs_nat_entry *raw_ne;
> +             nid_t nid = nat_get_nid(ne);
> +             int offset;
> 
>               if (to_journal) {
> -                     mutex_lock(&curseg->curseg_mutex);
> +                     offset = lookup_journal_in_cursum(sum,
> +                                                     NAT_JOURNAL, nid, 1);
> +                     f2fs_bug_on(sbi, offset < 0);
> +                     raw_ne = &nat_in_journal(sum, offset);
> +                     nid_in_journal(sum, offset) = cpu_to_le32(nid);
>               } else {
> -                     page = get_next_nat_page(sbi, start_nid);
> -                     nat_blk = page_address(page);
> -                     f2fs_bug_on(sbi, !nat_blk);
> +                     raw_ne = &nat_blk->entries[nid - start_nid];
>               }
> +             raw_nat_from_node_info(raw_ne, &ne->ni);
> 
> -             /* flush dirty nats in nat entry set */
> -             list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
> -                     struct f2fs_nat_entry *raw_ne;
> -                     nid_t nid = nat_get_nid(ne);
> -                     int offset;
> +             write_lock(&NM_I(sbi)->nat_tree_lock);
> +             nat_reset_flag(ne);
> +             __clear_nat_cache_dirty(NM_I(sbi), ne);
> +             write_unlock(&NM_I(sbi)->nat_tree_lock);
> 
> -                     if (to_journal) {
> -                             offset = lookup_journal_in_cursum(sum,
> -                                                     NAT_JOURNAL, nid, 1);
> -                             f2fs_bug_on(sbi, offset < 0);
> -                             raw_ne = &nat_in_journal(sum, offset);
> -                             nid_in_journal(sum, offset) = cpu_to_le32(nid);
> -                     } else {
> -                             raw_ne = &nat_blk->entries[nid - start_nid];
> -                     }
> -                     raw_nat_from_node_info(raw_ne, &ne->ni);
> +             if (nat_get_blkaddr(ne) == NULL_ADDR)
> +                     add_free_nid(sbi, nid, false);
> +     }
> 
> -                     if (nat_get_blkaddr(ne) == NULL_ADDR &&
> -                             add_free_nid(sbi, nid, false) <= 0) {
> -                             write_lock(&nm_i->nat_tree_lock);
> -                             __del_from_nat_cache(nm_i, ne);
> -                             write_unlock(&nm_i->nat_tree_lock);
> -                     } else {
> -                             write_lock(&nm_i->nat_tree_lock);
> -                             nat_reset_flag(ne);
> -                             __clear_nat_cache_dirty(nm_i, ne);
> -                             write_unlock(&nm_i->nat_tree_lock);
> -                     }
> -             }
> +     if (to_journal)
> +             mutex_unlock(&curseg->curseg_mutex);
> +     else
> +             f2fs_put_page(page, 1);
> 
> -             if (to_journal)
> -                     mutex_unlock(&curseg->curseg_mutex);
> -             else
> -                     f2fs_put_page(page, 1);
> +     f2fs_bug_on(sbi, set->entry_cnt);
> +     radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> +     kmem_cache_free(nat_entry_set_slab, set);
> +}
> 
> -             f2fs_bug_on(sbi, !list_empty(&nes->entry_list));
> -             release_nat_entry_set(nes, nm_i);
> +/*
> + * This function is called during the checkpointing process.
> + */
> +void flush_nat_entries(struct f2fs_sb_info *sbi)
> +{
> +     struct f2fs_nm_info *nm_i = NM_I(sbi);
> +     struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> +     struct f2fs_summary_block *sum = curseg->sum_blk;
> +     struct nat_entry_set *setvec[NATVEC_SIZE];
> +     struct nat_entry_set *set, *tmp;
> +     unsigned int found;
> +     nid_t set_idx = 0;
> +     LIST_HEAD(sets);
> +
> +     /*
> +      * if there are no enough space in journal to store dirty nat
> +      * entries, remove all entries from journal and merge them
> +      * into nat entry set.
> +      */
> +     if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
> +             remove_nats_in_journal(sbi);
> +
> +     if (!nm_i->dirty_nat_cnt)
> +             return;
> +
> +     while ((found = __gang_lookup_nat_set(nm_i,
> +                                     set_idx, NATVEC_SIZE, setvec))) {
> +             unsigned idx;
> +             set_idx = setvec[found - 1]->set + 1;
> +             for (idx = 0; idx < found; idx++)
> +                     __adjust_nat_entry_set(setvec[idx], &sets,
> +                                                     MAX_NAT_JENTRIES(sum));
>       }
> 
> -     f2fs_bug_on(sbi, !list_empty(head));
> +     /* flush dirty nats in nat entry set */
> +     list_for_each_entry_safe(set, tmp, &sets, set_list)
> +             __flush_nat_entry_set(sbi, set);
> +
>       f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
>  }
> 
> @@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>       INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
>       INIT_LIST_HEAD(&nm_i->free_nid_list);
>       INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
> +     INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
>       INIT_LIST_HEAD(&nm_i->nat_entries);
> -     INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
> -     INIT_LIST_HEAD(&nm_i->nat_entry_set);
> 
>       mutex_init(&nm_i->build_lock);
>       spin_lock_init(&nm_i->free_nid_list_lock);
> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> index b8ba63c..bd826d9 100644
> --- a/fs/f2fs/node.h
> +++ b/fs/f2fs/node.h
> @@ -43,6 +43,7 @@ enum {
>       IS_CHECKPOINTED,        /* is it checkpointed before? */
>       HAS_FSYNCED_INODE,      /* is the inode fsynced before? */
>       HAS_LAST_FSYNC,         /* has the latest node fsync mark? */
> +     IS_DIRTY,               /* this nat entry is dirty? */
>  };
> 
>  struct nat_entry {
> @@ -60,10 +61,6 @@ struct nat_entry {
>  #define nat_get_version(nat)         (nat->ni.version)
>  #define nat_set_version(nat, v)              (nat->ni.version = v)
> 
> -#define __set_nat_cache_dirty(nm_i, ne)                                      
> \
> -             list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
> -#define __clear_nat_cache_dirty(nm_i, ne)                            \
> -             list_move_tail(&ne->list, &nm_i->nat_entries);
>  #define inc_node_version(version)    (++version)
> 
>  static inline void set_nat_flag(struct nat_entry *ne,
> @@ -113,9 +110,9 @@ enum mem_type {
>  };
> 
>  struct nat_entry_set {
> -     struct list_head set_list;      /* link with all nat sets */
> +     struct list_head set_list;      /* link with other nat sets */
>       struct list_head entry_list;    /* link with dirty nat entries */
> -     nid_t start_nid;                /* start nid of nats in set */
> +     nid_t set;                      /* set number*/
>       unsigned int entry_cnt;         /* the # of nat entries in set */
>  };
> 
> --
> 1.8.5.2 (Apple Git-48)
> 
> 
> ------------------------------------------------------------------------------
> Meet PCI DSS 3.0 Compliance Requirements with EventLog Analyzer
> Achieve PCI DSS 3.0 Compliant Status with Out-of-the-box PCI DSS Reports
> Are you Audit-Ready for PCI DSS 3.0 Compliance? Download White paper
> Comply to PCI DSS 3.0 Requirement 10 and 11.5 with EventLog Analyzer
> http://pubads.g.doubleclick.net/gampad/clk?id=154622311&iu=/4140/ostg.clktrk
> _______________________________________________
> Linux-f2fs-devel mailing list
> linux-f2fs-de...@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to