On Mon, Jan 20, 2014 at 12:23:40PM +0000, Steven Whitehouse wrote:
> Prior to this patch, GFS2 kept all the quotas for each
> super block in a single linked list. This is rather slow
> when there are large numbers of quotas.
> 
> This patch introduces a hlist_bl based hash table, similar
> to the one used for glocks. The initial look up of the quota
> is now lockless in the case where it is already cached,
> although we still have to take the per quota spinlock in
> order to bump the ref count. Either way though, this is a
> big improvement on what was there before.
> 
> The qd_lock and the per super block list is preserved, for
> the time being. However it is intended that since this is no
> longer used for its original role, it should be possible to
> shrink the number of items on that list in due course and
> remove the requirement to take qd_lock in qd_get.
> 
> Signed-off-by: Steven Whitehouse <swhit...@redhat.com>
> Cc: Abhijith Das <a...@redhat.com>
> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
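(For context on the quoted commit message: hlist_bl is the bit-locked
hlist variant from <linux/list_bl.h>, in which bit 0 of each bucket head
pointer doubles as a spinlock, so a large bucket array pays no extra
per-bucket lock storage.  A minimal usage sketch, with hypothetical
names and not code from the patch:

	#include <linux/list_bl.h>
	#include <linux/rculist_bl.h>

	static struct hlist_bl_head example_table[256];

	static void example_insert(struct hlist_bl_node *node, unsigned int bucket)
	{
		hlist_bl_lock(&example_table[bucket]);	/* bit_spin_lock() on bit 0 */
		hlist_bl_add_head_rcu(node, &example_table[bucket]);
		hlist_bl_unlock(&example_table[bucket]);
	}

This is the same shape the patch below uses for its quota buckets.)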
Interesting!  I thought that Sasha Levin had a hash table in the works,
but I don't see it, so CCing him.  A few questions and comments below.

							Thanx, Paul

> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index a99f60c..59d99ec 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -428,10 +428,13 @@ enum {
>  };
>  
>  struct gfs2_quota_data {
> +	struct hlist_bl_node qd_hlist;
>  	struct list_head qd_list;
>  	struct kqid qd_id;
> +	struct gfs2_sbd *qd_sbd;
>  	struct lockref qd_lockref;
>  	struct list_head qd_lru;
> +	unsigned qd_hash;
>  
>  	unsigned long qd_flags;		/* QDF_... */
>  
> @@ -450,6 +453,7 @@ struct gfs2_quota_data {
>  
>  	u64 qd_sync_gen;
>  	unsigned long qd_last_warn;
> +	struct rcu_head qd_rcu;
>  };
>  
>  struct gfs2_trans {
> diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
> index 0650db2..c272e73 100644
> --- a/fs/gfs2/main.c
> +++ b/fs/gfs2/main.c
> @@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
>  
>  	gfs2_str2qstr(&gfs2_qdot, ".");
>  	gfs2_str2qstr(&gfs2_qdotdot, "..");
> +	gfs2_quota_hash_init();
>  
>  	error = gfs2_sys_init();
>  	if (error)
> diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
> index 1b6b367..a1df01d 100644
> --- a/fs/gfs2/quota.c
> +++ b/fs/gfs2/quota.c
> @@ -52,6 +52,10 @@
>  #include <linux/dqblk_xfs.h>
>  #include <linux/lockref.h>
>  #include <linux/list_lru.h>
> +#include <linux/rcupdate.h>
> +#include <linux/rculist_bl.h>
> +#include <linux/bit_spinlock.h>
> +#include <linux/jhash.h>
>  
>  #include "gfs2.h"
>  #include "incore.h"
> @@ -67,10 +71,43 @@
>  #include "inode.h"
>  #include "util.h"
>  
> -/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */
> +#define GFS2_QD_HASH_SHIFT      12

Should this be a function of the number of CPUs?  (Might not be an
issue if the really big systems don't use GFS.)

> +#define GFS2_QD_HASH_SIZE       (1 << GFS2_QD_HASH_SHIFT)
> +#define GFS2_QD_HASH_MASK       (GFS2_QD_HASH_SIZE - 1)
> +
> +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
>  static DEFINE_SPINLOCK(qd_lock);
>  struct list_lru gfs2_qd_lru;
>  
> +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
> +
> +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
> +				 const struct kqid qid)
> +{
> +	unsigned int h;
> +
> +	h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
> +	h = jhash(&qid, sizeof(struct kqid), h);
> +
> +	return h & GFS2_QD_HASH_MASK;
> +}
> +
> +static inline void spin_lock_bucket(unsigned int hash)
> +{
> +	hlist_bl_lock(&qd_hash_table[hash]);
> +}
> +
> +static inline void spin_unlock_bucket(unsigned int hash)
> +{
> +	hlist_bl_unlock(&qd_hash_table[hash]);
> +}
> +
> +static void gfs2_qd_dealloc(struct rcu_head *rcu)
> +{
> +	struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
> +	kmem_cache_free(gfs2_quotad_cachep, qd);
> +}
> +
>  static void gfs2_qd_dispose(struct list_head *list)
>  {
>  	struct gfs2_quota_data *qd;
> @@ -87,6 +124,10 @@ static void gfs2_qd_dispose(struct list_head *list)
>  		list_del(&qd->qd_list);
>  		spin_unlock(&qd_lock);
>  
> +		spin_lock_bucket(qd->qd_hash);
> +		hlist_bl_del_rcu(&qd->qd_hlist);
> +		spin_unlock_bucket(qd->qd_hash);
> +

Good, removed from the RCU-traversed list before invoking call_rcu().
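(To spell out the ordering being approved of here, a hedged sketch of
the teardown pattern; qd_unlink_and_free() is a hypothetical name, and
the real patch open-codes this in gfs2_qd_dispose() and, further down,
in gfs2_quota_cleanup():

	static void qd_unlink_and_free(struct gfs2_quota_data *qd)
	{
		/* Publish the removal under the bucket bit lock first,
		 * so that no new RCU reader can find the object... */
		spin_lock_bucket(qd->qd_hash);
		hlist_bl_del_rcu(&qd->qd_hlist);
		spin_unlock_bucket(qd->qd_hash);

		/* ...then defer the actual free until all pre-existing
		 * readers have left their read-side critical sections. */
		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
	}

Freeing before the unlink, or without the grace period, would let a
concurrent bucket walk dereference freed memory.)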
>  		gfs2_assert_warn(sdp, !qd->qd_change);
>  		gfs2_assert_warn(sdp, !qd->qd_slot_count);
>  		gfs2_assert_warn(sdp, !qd->qd_bh_count);
> @@ -95,7 +136,7 @@ static void gfs2_qd_dispose(struct list_head *list)
>  		atomic_dec(&sdp->sd_quota_count);
>  
>  		/* Delete it from the common reclaim list */
> -		kmem_cache_free(gfs2_quotad_cachep, qd);
> +		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
>  	}
>  }
>  
> @@ -165,83 +206,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
>  	return offset;
>  }
>  
> -static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
> -		    struct gfs2_quota_data **qdp)
> +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
>  {
>  	struct gfs2_quota_data *qd;
>  	int error;
>  
>  	qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
>  	if (!qd)
> -		return -ENOMEM;
> +		return NULL;
>  
> +	qd->qd_sbd = sdp;
>  	qd->qd_lockref.count = 1;
>  	spin_lock_init(&qd->qd_lockref.lock);
>  	qd->qd_id = qid;
>  	qd->qd_slot = -1;
>  	INIT_LIST_HEAD(&qd->qd_lru);
> +	qd->qd_hash = hash;
>  
>  	error = gfs2_glock_get(sdp, qd2index(qd),
>  			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
>  	if (error)
>  		goto fail;
>  
> -	*qdp = qd;
> -
> -	return 0;
> +	return qd;
>  
>  fail:
>  	kmem_cache_free(gfs2_quotad_cachep, qd);
> -	return error;
> +	return NULL;
>  }
>  
> -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
> -		  struct gfs2_quota_data **qdp)
> +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
> +						     const struct gfs2_sbd *sdp,
> +						     struct kqid qid)
>  {
> -	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
> -	int error, found;
> -
> -	*qdp = NULL;
> +	struct gfs2_quota_data *qd;
> +	struct hlist_bl_node *h;
>  
> -	for (;;) {
> -		found = 0;
> -		spin_lock(&qd_lock);
> -		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
> -			if (qid_eq(qd->qd_id, qid) &&
> -			    lockref_get_not_dead(&qd->qd_lockref)) {
> -				list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
> -				found = 1;
> -				break;
> -			}
> +	hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {

All callers of gfs2_qd_search_bucket() either hold the update-side lock
(acquired via spin_lock_bucket()) or have invoked rcu_read_lock(), so
this is good.

> +		if (!qid_eq(qd->qd_id, qid))
> +			continue;
> +		if (qd->qd_sbd != sdp)
> +			continue;
> +		if (lockref_get_not_dead(&qd->qd_lockref)) {
> +			list_lru_del(&gfs2_qd_lru, &qd->qd_lru);

list_lru_del() acquires a lock, but it is from an array whose size
depends on the NODES_SHIFT Kconfig variable.  The array size seems to
vary from 8 to 16 to 64 to 1024, depending on other configuration info,
so should be OK.
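(To make the read-side contract concrete, here is a hedged sketch of the
lockless fast path as qd_get() below ends up using it; qd_lookup_fast()
is a hypothetical name.  The RCU read-side section protects only the
bucket walk itself; it is the reference taken via lockref_get_not_dead()
inside gfs2_qd_search_bucket() that keeps the object alive once
rcu_read_unlock() has been called:

	static struct gfs2_quota_data *qd_lookup_fast(struct gfs2_sbd *sdp,
						      struct kqid qid)
	{
		unsigned int hash = gfs2_qd_hash(sdp, qid);
		struct gfs2_quota_data *qd;

		rcu_read_lock();
		qd = gfs2_qd_search_bucket(hash, sdp, qid); /* takes a ref on success */
		rcu_read_unlock();

		return qd;	/* NULL, or referenced and safe outside RCU */
	}

An object whose lockref has been marked dead fails the
lockref_get_not_dead() test, so a quota on its way to being freed cannot
be resurrected by a concurrent lookup.)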
> +			return qd;
>  		}
> +	}
>  
> -		if (!found)
> -			qd = NULL;
> +	return NULL;
> +}
>  
> -		if (!qd && new_qd) {
> -			qd = new_qd;
> -			list_add(&qd->qd_list, &sdp->sd_quota_list);
> -			atomic_inc(&sdp->sd_quota_count);
> -			new_qd = NULL;
> -		}
>  
> -		spin_unlock(&qd_lock);
> +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
> +		  struct gfs2_quota_data **qdp)
> +{
> +	struct gfs2_quota_data *qd, *new_qd;
> +	unsigned int hash = gfs2_qd_hash(sdp, qid);
>  
> -		if (qd) {
> -			if (new_qd) {
> -				gfs2_glock_put(new_qd->qd_gl);
> -				kmem_cache_free(gfs2_quotad_cachep, new_qd);
> -			}
> -			*qdp = qd;
> -			return 0;
> -		}
> +	rcu_read_lock();
> +	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
> +	rcu_read_unlock();
>  
> -		error = qd_alloc(sdp, qid, &new_qd);
> -		if (error)
> -			return error;
> +	if (qd)
> +		return 0;
> +
> +	new_qd = qd_alloc(hash, sdp, qid);
> +	if (!new_qd)
> +		return -ENOMEM;
> +
> +	spin_lock(&qd_lock);
> +	spin_lock_bucket(hash);
> +	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
> +	if (qd == NULL) {
> +		*qdp = new_qd;
> +		list_add(&new_qd->qd_list, &sdp->sd_quota_list);
> +		hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
> +		atomic_inc(&sdp->sd_quota_count);
> +	}
> +	spin_unlock_bucket(hash);
> +	spin_unlock(&qd_lock);
> +
> +	if (qd) {
> +		gfs2_glock_put(new_qd->qd_gl);
> +		kmem_cache_free(gfs2_quotad_cachep, new_qd);
>  	}
> +
> +	return 0;
>  }
>  
> +
>  static void qd_hold(struct gfs2_quota_data *qd)
>  {
>  	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
> @@ -1215,6 +1268,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
>  	unsigned int x, slot = 0;
>  	unsigned int found = 0;
> +	unsigned int hash;
>  	u64 dblock;
>  	u32 extlen = 0;
>  	int error;
> @@ -1272,8 +1326,9 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  		if (!qc_change)
>  			continue;
>  
> -		error = qd_alloc(sdp, qc_id, &qd);
> -		if (error) {
> +		hash = gfs2_qd_hash(sdp, qc_id);
> +		qd = qd_alloc(hash, sdp, qc_id);
> +		if (qd == NULL) {
>  			brelse(bh);
>  			goto fail;
>  		}
> @@ -1289,6 +1344,10 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
>  		atomic_inc(&sdp->sd_quota_count);
>  		spin_unlock(&qd_lock);
>  
> +		spin_lock_bucket(hash);
> +		hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
> +		spin_unlock_bucket(hash);
> +
>  		found++;
>  	}
>  
> @@ -1335,11 +1394,16 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
>  		spin_unlock(&qd->qd_lockref.lock);
>  
>  		list_del(&qd->qd_list);
> +
>  		/* Also remove if this qd exists in the reclaim list */
>  		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
>  		atomic_dec(&sdp->sd_quota_count);
>  		spin_unlock(&qd_lock);
>  
> +		spin_lock_bucket(qd->qd_hash);
> +		hlist_bl_del_rcu(&qd->qd_hlist);

Might just be my unfamiliarity with this code, but it took me a bit
to see the difference between ->qd_hlist and ->qd_list.  Of course,
until I spotted the difference, I was wondering why you were removing
the item twice.  ;-)
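(For anyone else who trips over the two similarly named fields, an
annotated excerpt of the struct from the incore.h hunk above; the roles
follow from the patch's lock-order comment:

	struct gfs2_quota_data {
		struct hlist_bl_node qd_hlist;	/* hash bucket chain: RCU walk
						 * for lookup, per-bucket bit
						 * lock for updates */
		struct list_head qd_list;	/* per-sb sd_quota_list, still
						 * protected by qd_lock */
		/* ... */
	};

Teardown therefore unlinks twice, once from each structure.)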
> +		spin_unlock_bucket(qd->qd_hash);
> +
>  		if (!qd->qd_lockref.count) {
>  			gfs2_assert_warn(sdp, !qd->qd_change);
>  			gfs2_assert_warn(sdp, !qd->qd_slot_count);
> @@ -1348,7 +1412,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
>  			gfs2_assert_warn(sdp, !qd->qd_bh_count);
>  
>  			gfs2_glock_put(qd->qd_gl);
> -			kmem_cache_free(gfs2_quotad_cachep, qd);
> +			call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
>  
>  			spin_lock(&qd_lock);
>  		}
> @@ -1643,3 +1707,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
>  	.get_dqblk = gfs2_get_dqblk,
>  	.set_dqblk = gfs2_set_dqblk,
>  };
> +
> +void __init gfs2_quota_hash_init(void)
> +{
> +	unsigned i;
> +
> +	for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
> +		INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
> +}
> diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
> index 96e4f34..55d506e 100644
> --- a/fs/gfs2/quota.h
> +++ b/fs/gfs2/quota.h
> @@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
>  extern const struct quotactl_ops gfs2_quotactl_ops;
>  extern struct shrinker gfs2_qd_shrinker;
>  extern struct list_lru gfs2_qd_lru;
> +extern void __init gfs2_quota_hash_init(void);
>  
>  #endif /* __QUOTA_DOT_H__ */
> -- 
> 1.8.3.1
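(Returning to the earlier question about GFS2_QD_HASH_SHIFT: if the
table were ever sized as a function of the number of CPUs,
gfs2_quota_hash_init() above would be the natural place.  A hedged
sketch, not part of this patch: the static array becomes a pointer, the
mask becomes a variable that gfs2_qd_hash() would use in place of
GFS2_QD_HASH_MASK, and init_gfs2_fs() would have to check the return
value.  Very large tables might instead want vmalloc() or
alloc_large_system_hash(), which is omitted here for brevity:

	static struct hlist_bl_head *qd_hash_table;	/* was a fixed array */
	static unsigned int qd_hash_mask;		/* replaces GFS2_QD_HASH_MASK */

	int __init gfs2_quota_hash_init(void)
	{
		/* Roughly 256 buckets per possible CPU, keeping the
		 * patch's 4096 (shift of 12) as a floor. */
		unsigned int shift = max(GFS2_QD_HASH_SHIFT,
					 ilog2(num_possible_cpus()) + 8);
		unsigned int size = 1U << shift;
		unsigned int i;

		qd_hash_mask = size - 1;
		qd_hash_table = kcalloc(size, sizeof(*qd_hash_table), GFP_KERNEL);
		if (!qd_hash_table)
			return -ENOMEM;

		for (i = 0; i < size; i++)
			INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
		return 0;
	}

Whether the extra complexity is worth it depends on whether the really
big systems run GFS2, as noted above.)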