On Mon, Jan 20, 2014 at 12:23:40PM +0000, Steven Whitehouse wrote:
> Prior to this patch, GFS2 kept all the quotas for each
> super block in a single linked list. This is rather slow
> when there are large numbers of quotas.
> 
> This patch introduces a hlist_bl based hash table, similar
> to the one used for glocks. The initial look up of the quota
> is now lockless in the case where it is already cached,
> although we still have to take the per quota spinlock in
> order to bump the ref count. Either way though, this is a
> big improvement on what was there before.
> 
> The qd_lock and the per super block list is preserved, for
> the time being. However it is intended that since this is no
> longer used for its original role, it should be possible to
> shrink the number of items on that list in due course and
> remove the requirement to take qd_lock in qd_get.
> 
> Signed-off-by: Steven Whitehouse <swhit...@redhat.com>
> Cc: Abhijith Das <a...@redhat.com>
> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
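(For context on the quoted commit message: hlist_bl is the bit-locked
hlist variant from <linux/list_bl.h>, in which bit 0 of each bucket head
pointer doubles as a spinlock, so a large bucket array pays no extra
per-bucket lock storage.  A minimal usage sketch, with hypothetical
names and not code from the patch:

	#include <linux/list_bl.h>
	#include <linux/rculist_bl.h>

	static struct hlist_bl_head example_table[256];

	static void example_insert(struct hlist_bl_node *node, unsigned int bucket)
	{
		hlist_bl_lock(&example_table[bucket]);	/* bit_spin_lock() on bit 0 */
		hlist_bl_add_head_rcu(node, &example_table[bucket]);
		hlist_bl_unlock(&example_table[bucket]);
	}

This is the same shape the patch below uses for its quota buckets.)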
Interesting!  I thought that Sasha Levin had a hash table in the works,
but I don't see it, so CCing him.  A few questions and comments below.

							Thanx, Paul

> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index a99f60c..59d99ec 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -428,10 +428,13 @@ enum {
>  };
>  
>  struct gfs2_quota_data {
> +	struct hlist_bl_node qd_hlist;
>  	struct list_head qd_list;
>  	struct kqid qd_id;
> +	struct gfs2_sbd *qd_sbd;
>  	struct lockref qd_lockref;
>  	struct list_head qd_lru;
> +	unsigned qd_hash;
>  
>  	unsigned long qd_flags;		/* QDF_... */
>  
> @@ -450,6 +453,7 @@ struct gfs2_quota_data {
>  
>  	u64 qd_sync_gen;
>  	unsigned long qd_last_warn;
> +	struct rcu_head qd_rcu;
>  };
>  
>  struct gfs2_trans {
> diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
> index 0650db2..c272e73 100644
> --- a/fs/gfs2/main.c
> +++ b/fs/gfs2/main.c
> @@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
>  
>  	gfs2_str2qstr(&gfs2_qdot, ".");
>  	gfs2_str2qstr(&gfs2_qdotdot, "..");
> +	gfs2_quota_hash_init();
>  
>  	error = gfs2_sys_init();
>  	if (error)
> diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
> index 1b6b367..a1df01d 100644
> --- a/fs/gfs2/quota.c
> +++ b/fs/gfs2/quota.c
> @@ -52,6 +52,10 @@
>  #include <linux/dqblk_xfs.h>
>  #include <linux/lockref.h>
>  #include <linux/list_lru.h>
> +#include <linux/rcupdate.h>
> +#include <linux/rculist_bl.h>
> +#include <linux/bit_spinlock.h>
> +#include <linux/jhash.h>
>  
>  #include "gfs2.h"
>  #include "incore.h"
> @@ -67,10 +71,43 @@
>  #include "inode.h"
>  #include "util.h"
>  
> -/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */
> +#define GFS2_QD_HASH_SHIFT      12

Should this be a function of the number of CPUs?  (Might not be an
issue if the really big systems don't use GFS.)

> +#define GFS2_QD_HASH_SIZE       (1 << GFS2_QD_HASH_SHIFT)
> +#define GFS2_QD_HASH_MASK       (GFS2_QD_HASH_SIZE - 1)
> +
> +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
>  static DEFINE_SPINLOCK(qd_lock);
>  struct list_lru gfs2_qd_lru;
>  
> +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
> +
> +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
> +				 const struct kqid qid)
> +{
> +	unsigned int h;
> +
> +	h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
> +	h = jhash(&qid, sizeof(struct kqid), h);
> +
> +	return h & GFS2_QD_HASH_MASK;
> +}
> +
> +static inline void spin_lock_bucket(unsigned int hash)
> +{
> +	hlist_bl_lock(&qd_hash_table[hash]);
> +}
> +
> +static inline void spin_unlock_bucket(unsigned int hash)
> +{
> +	hlist_bl_unlock(&qd_hash_table[hash]);
> +}
> +
> +static void gfs2_qd_dealloc(struct rcu_head *rcu)
> +{
> +	struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
> +	kmem_cache_free(gfs2_quotad_cachep, qd);
> +}
> +
>  static void gfs2_qd_dispose(struct list_head *list)
>  {
>  	struct gfs2_quota_data *qd;
> @@ -87,6 +124,10 @@ static void gfs2_qd_dispose(struct list_head *list)
>  		list_del(&qd->qd_list);
>  		spin_unlock(&qd_lock);
>  
> +		spin_lock_bucket(qd->qd_hash);
> +		hlist_bl_del_rcu(&qd->qd_hlist);
> +		spin_unlock_bucket(qd->qd_hash);
> +

Good, removed from the RCU-traversed list before invoking call_rcu().
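(To spell out the ordering being approved of here, a hedged sketch of
the teardown pattern; qd_unlink_and_free() is a hypothetical name, and
the real patch open-codes this in gfs2_qd_dispose() and, further down,
in gfs2_quota_cleanup():

	static void qd_unlink_and_free(struct gfs2_quota_data *qd)
	{
		/* Publish the removal under the bucket bit lock first,
		 * so that no new RCU reader can find the object... */
		spin_lock_bucket(qd->qd_hash);
		hlist_bl_del_rcu(&qd->qd_hlist);
		spin_unlock_bucket(qd->qd_hash);

		/* ...then defer the actual free until all pre-existing
		 * readers have left their read-side critical sections. */
		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
	}

Freeing before the unlink, or without the grace period, would let a
concurrent bucket walk dereference freed memory.)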
>  		gfs2_assert_warn(sdp, !qd->qd_change);
>  		gfs2_assert_warn(sdp, !qd->qd_slot_count);
>  		gfs2_assert_warn(sdp, !qd->qd_bh_count);
> @@ -95,7 +136,7 @@ static void gfs2_qd_dispose(struct list_head *list)
>  		atomic_dec(&sdp->sd_quota_count);
>  
>  		/* Delete it from the common reclaim list */
> -		kmem_cache_free(gfs2_quotad_cachep, qd);
> +		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
>  	}
>  }
>  
> @@ -165,83 +206,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
>  	return offset;
>  }
>  
> -static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
> -		    struct gfs2_quota_data **qdp)
> +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
>  {
>  	struct gfs2_quota_data *qd;
>  	int error;
>  
>  	qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
>  	if (!qd)
> -		return -ENOMEM;
> +		return NULL;
>  
> +	qd->qd_sbd = sdp;
>  	qd->qd_lockref.count = 1;
>  	spin_lock_init(&qd->qd_lockref.lock);
>  	qd->qd_id = qid;
>  	qd->qd_slot = -1;
>  	INIT_LIST_HEAD(&qd->qd_lru);
> +	qd->qd_hash = hash;
>  
>  	error = gfs2_glock_get(sdp, qd2index(qd),
>  			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
>  	if (error)
>  		goto fail;
>  
> -	*qdp = qd;
> -
> -	return 0;
> +	return qd;
>  
>  fail:
>  	kmem_cache_free(gfs2_quotad_cachep, qd);
> -	return error;
> +	return NULL;
>  }
>  
> -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
> -		  struct gfs2_quota_data **qdp)
> +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
> +						     const struct gfs2_sbd *sdp,
> +						     struct kqid qid)
>  {
> -	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
> -	int error, found;
> -
> -	*qdp = NULL;
> +	struct gfs2_quota_data *qd;
> +	struct hlist_bl_node *h;
>  
> -	for (;;) {
> -		found = 0;
> -		spin_lock(&qd_lock);
> -		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
> -			if (qid_eq(qd->qd_id, qid) &&
> -			    lockref_get_not_dead(&qd->qd_lockref)) {
> -				list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
> -				found = 1;
> -				break;
> -			}
> +	hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {

All callers of gfs2_qd_search_bucket() either hold the update-side lock
(acquired via spin_lock_bucket()) or have invoked rcu_read_lock(), so
this is good.

> +		if (!qid_eq(qd->qd_id, qid))
> +			continue;
> +		if (qd->qd_sbd != sdp)
> +			continue;
> +		if (lockref_get_not_dead(&qd->qd_lockref)) {
> +			list_lru_del(&gfs2_qd_lru, &qd->qd_lru);

list_lru_del() acquires a lock, but it is from an array whose size
depends on the NODES_SHIFT Kconfig variable.  The array size seems to
vary from 8 to 16 to 64 to 1024, depending on other configuration info,
so should be OK.
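(To make the read-side contract concrete, here is a hedged sketch of the
lockless fast path as qd_get() below ends up using it; qd_lookup_fast()
is a hypothetical name.  The RCU read-side section protects only the
bucket walk itself; it is the reference taken via lockref_get_not_dead()
inside gfs2_qd_search_bucket() that keeps the object alive once
rcu_read_unlock() has been called:

	static struct gfs2_quota_data *qd_lookup_fast(struct gfs2_sbd *sdp,
						      struct kqid qid)
	{
		unsigned int hash = gfs2_qd_hash(sdp, qid);
		struct gfs2_quota_data *qd;

		rcu_read_lock();
		qd = gfs2_qd_search_bucket(hash, sdp, qid); /* takes a ref on success */
		rcu_read_unlock();

		return qd;	/* NULL, or referenced and safe outside RCU */
	}

An object whose lockref has been marked dead fails the
lockref_get_not_dead() test, so a quota on its way to being freed cannot
be resurrected by a concurrent lookup.)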
> +			return qd;
>  		}
> +	}
>  
> -		if (!found)
> -			qd = NULL;
> +	return NULL;
> +}
>  
> -		if (!qd && new_qd) {
> -			qd = new_qd;
> -			list_add(&qd->qd_list, &sdp->sd_quota_list);
> -			atomic_inc(&sdp->sd_quota_count);
> -			new_qd = NULL;
> -		}
>  
> -		spin_unlock(&qd_lock);
> +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
> +		  struct gfs2_quota_data **qdp)
> +{
> +	struct gfs2_quota_data *qd, *new_qd;
> +	unsigned int hash = gfs2_qd_hash(sdp, qid);
>  
> -		if (qd) {
> -			if (new_qd) {
> -				gfs2_glock_put(new_qd->qd_gl);
> -				kmem_cache_free(gfs2_quotad_cachep, new_qd);
> -			}
> -			*qdp = qd;
> -			return 0;
> -		}
> +	rcu_read_lock();
> +	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
> +	rcu_read_unlock();
>  
> -		error = qd_alloc(sdp, qid, &new_qd);
> -		if (error)
> -			return error;
> +	if (qd)
> +		return 0;
> +
> +	new_qd = qd_alloc(hash, sdp, qid);
> +	if (!new_qd)
> +		return -ENOMEM;
> +
> +	spin_lock(&qd_lock);
> +	spin_lock_bucket(hash);
> +	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
> +	if (qd == NULL) {
> +		*qdp = new_qd;
> +		list_add(&new_qd->qd_list, &sdp->sd_quota_list);
> +		hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
> +		atomic_inc(&sdp->sd_quota_count);
> +	}
> +	spin_unlock_bucket(hash);
> +	spin_unlock(&qd_lock);
> +
> +	if (qd) {
> +		gfs2_glock_put(new_qd->qd_gl);
> +		kmem_cache_free(gfs2_quotad_cachep, new_qd);
>  	}
> +
> +	return 0;
>  }
>  
> +
>  static void qd_hold(struct gfs2_quota_data *qd)
>  {
>  	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
> @@ -1215,6 +1268,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
>  	unsigned int x, slot = 0;
>  	unsigned int found = 0;
> +	unsigned int hash;
>  	u64 dblock;
>  	u32 extlen = 0;
>  	int error;
> @@ -1272,8 +1326,9 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  		if (!qc_change)
>  			continue;
>  
> -		error = qd_alloc(sdp, qc_id, &qd);
> -		if (error) {
> +		hash = gfs2_qd_hash(sdp, qc_id);
> +		qd = qd_alloc(hash, sdp, qc_id);
> +		if (qd == NULL) {
>  			brelse(bh);
>  			goto fail;
>  		}
> @@ -1289,6 +1344,10 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  		atomic_inc(&sdp->sd_quota_count);
>  		spin_unlock(&qd_lock);
>  
> +		spin_lock_bucket(hash);
> +		hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
> +		spin_unlock_bucket(hash);
> +
>  		found++;
>  	}
>  
> @@ -1335,11 +1394,16 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
>  		spin_unlock(&qd->qd_lockref.lock);
>  
>  		list_del(&qd->qd_list);
> +
>  		/* Also remove if this qd exists in the reclaim list */
>  		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
>  		atomic_dec(&sdp->sd_quota_count);
>  		spin_unlock(&qd_lock);
>  
> +		spin_lock_bucket(qd->qd_hash);
> +		hlist_bl_del_rcu(&qd->qd_hlist);

Might just be my unfamiliarity with this code, but it took me a bit
to see the difference between ->qd_hlist and ->qd_list.  Of course,
until I spotted the difference, I was wondering why you were removing
the item twice.  ;-)
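(For anyone else who trips over the two similarly named fields, an
annotated excerpt of the struct from the incore.h hunk above; the roles
follow from the patch's lock-order comment:

	struct gfs2_quota_data {
		struct hlist_bl_node qd_hlist;	/* hash bucket chain: RCU walk
						 * for lookup, per-bucket bit
						 * lock for updates */
		struct list_head qd_list;	/* per-sb sd_quota_list, still
						 * protected by qd_lock */
		/* ... */
	};

Teardown therefore unlinks twice, once from each structure.)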
> +		spin_unlock_bucket(qd->qd_hash);
> +
>  		if (!qd->qd_lockref.count) {
>  			gfs2_assert_warn(sdp, !qd->qd_change);
>  			gfs2_assert_warn(sdp, !qd->qd_slot_count);
> @@ -1348,7 +1412,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
>  			gfs2_assert_warn(sdp, !qd->qd_bh_count);
>  
>  			gfs2_glock_put(qd->qd_gl);
> -			kmem_cache_free(gfs2_quotad_cachep, qd);
> +			call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
>  
>  			spin_lock(&qd_lock);
>  		}
> @@ -1643,3 +1707,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
>  	.get_dqblk = gfs2_get_dqblk,
>  	.set_dqblk = gfs2_set_dqblk,
>  };
> +
> +void __init gfs2_quota_hash_init(void)
> +{
> +	unsigned i;
> +
> +	for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
> +		INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
> +}
> diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
> index 96e4f34..55d506e 100644
> --- a/fs/gfs2/quota.h
> +++ b/fs/gfs2/quota.h
> @@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
>  extern const struct quotactl_ops gfs2_quotactl_ops;
>  extern struct shrinker gfs2_qd_shrinker;
>  extern struct list_lru gfs2_qd_lru;
> +extern void __init gfs2_quota_hash_init(void);
>  
>  #endif /* __QUOTA_DOT_H__ */
> -- 
> 1.8.3.1
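(Returning to the earlier question about GFS2_QD_HASH_SHIFT: if the
table were ever sized as a function of the number of CPUs,
gfs2_quota_hash_init() above would be the natural place.  A hedged
sketch, not part of this patch: the static array becomes a pointer, the
mask becomes a variable that gfs2_qd_hash() would use in place of
GFS2_QD_HASH_MASK, and init_gfs2_fs() would have to check the return
value.  Very large tables might instead want vmalloc() or
alloc_large_system_hash(), which is omitted here for brevity:

	static struct hlist_bl_head *qd_hash_table;	/* was a fixed array */
	static unsigned int qd_hash_mask;		/* replaces GFS2_QD_HASH_MASK */

	int __init gfs2_quota_hash_init(void)
	{
		/* Roughly 256 buckets per possible CPU, keeping the
		 * patch's 4096 (shift of 12) as a floor. */
		unsigned int shift = max(GFS2_QD_HASH_SHIFT,
					 ilog2(num_possible_cpus()) + 8);
		unsigned int size = 1U << shift;
		unsigned int i;

		qd_hash_mask = size - 1;
		qd_hash_table = kcalloc(size, sizeof(*qd_hash_table), GFP_KERNEL);
		if (!qd_hash_table)
			return -ENOMEM;

		for (i = 0; i < size; i++)
			INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
		return 0;
	}

Whether the extra complexity is worth it depends on whether the really
big systems run GFS2, as noted above.)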