On Sat, 4 Feb 2007 [EMAIL PROTECTED] wrote:
I noticed a mention in an LCA talk that apparently extensible hashing
with RCU access is an unsolved problem. Here's an idea for solving it.
I'm assuming the table is a power of 2 in size with open chaining
for collisions. When the chains get too long, the table is doubled.
When the chains get too short, the table size is halved.
.....
For purposes of discussion, I've attached a "toy" implementation
for doing dynamic resizing of a hashtable. It is useless, except
as a proof of concept.
I think this is very similar to what you are describing, no?
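For reference, here's the arithmetic the whole scheme hangs off of (a
sketch only, not part of the module below): with a power-of-2 table,
doubling from n to 2*n buckets moves an entry either nowhere or exactly
n buckets up, so each old chain splits into two new chains and nothing
else moves.

/* illustrative sketch: n is the old (power-of-2) table size */
static unsigned long new_bucket(unsigned long hashval, unsigned long n)
{
/* result is either (hashval & (n-1)) or (hashval & (n-1)) + n */
return hashval & (2 * n - 1);
}

Read in reverse, the same property says that halving a 2*n-bucket table
only ever merges bucket i with bucket i+n.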
--
Arthur
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/seqlock.h>
#include <linux/rcupdate.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <asm/uaccess.h>
#define _NETHASH_BUFSZ_ 16
#define MODULE_NAME "nethash"
#define _DEBUG_IT_
#ifdef _DEBUG_IT_
#define nprintk(x...) printk(KERN_ALERT x)
#else /* !_DEBUG_IT_ */
#define nprintk(x...) do { } while (0)
#endif /* _DEBUG_IT_ */
static struct proc_dir_entry *nh_proc_dir;
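/*
* During a resize a chain temporarily contains the bucket heads of
* *both* the old and the new table. nh_item is the common prefix of
* nh_entry and nh_head, and readers use nh_type to skip over the head
* nodes embedded in a chain.
*/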
enum nh_type { NH_ENTRY, NH_HEAD };
struct nh_item {
/* the list head must be first followed by nh_type */
struct list_head nh_list;
enum nh_type nh_type;
};
struct nh_entry {
/* the list head must be first followed by nh_type */
struct list_head nh_list;
enum nh_type nh_type;
unsigned long data;
struct rcu_head rcu_head;
};
struct nh_head {
/* the list head must be first followed by nh_type */
struct list_head list;
enum nh_type nh_type;
spinlock_t lock;
};
struct nh {
unsigned long nentries;
struct nh_head* hash;
struct rcu_head rcu_head;
};
static struct nh * __nh;
static DEFINE_SEQLOCK(nethash_seq);
static DEFINE_MUTEX(nethash_resize_mutex);
extern void * system_hash_alloc(unsigned long sz);
extern void system_hash_free(void * v, unsigned long sz);
/* XXX nh_dump() is for debug only
* it must be called under rcu_read_lock() */
static int nh_dump(struct nh * nh, const char * debug_str);
static struct nh * get_nh(void);
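/* identity "hash" - fine for a toy; a real user would plug in a
* proper hash function here */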
static unsigned long nh_hashval(unsigned long data)
{
return data;
}
static void nh_entry_free(struct rcu_head * head)
{
struct nh_entry * entry = container_of(head, struct nh_entry, rcu_head);
nprintk("%s: freeing entry with data %lu\n", __FUNCTION__, entry->data);
kfree(entry);
}
static void nh_free(struct rcu_head * head)
{
struct nh * nh = container_of(head, struct nh, rcu_head);
unsigned long nentries = nh->nentries;
unsigned long size = sizeof (struct nh_head) * nentries;
nprintk("%s: freeing nh of size %lu\n", __FUNCTION__, size);
system_hash_free((void *)nh->hash, size);
nprintk("%s: freeing nh\n", __FUNCTION__);
kfree(nh);
}
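/*
* Chains are kept partitioned: entries whose hash has the "nentries"
* bit clear come before entries that have it set. That way a chain
* can later be split at a single point when the table doubles.
*/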
/* called with head's spin_lock held */
static int __nh_insert(struct nh_entry * entry,
struct nh_head * head,
unsigned long nentries)
{
struct list_head * prev; /* insert entry after prev */
struct list_head * list;
nprintk("%s: linking entry with data %lu\n", __FUNCTION__,
entry->data);
/* find the appropriate spot to place entry */
prev = &head->list;
if ( nh_hashval(entry->data) & nentries ) {
list_for_each_rcu(list, &head->list) {
struct nh_entry * tmp;
struct nh_item * item = (struct nh_item *)list;
if (item->nh_type != NH_ENTRY)
continue;
tmp = list_entry(list, struct nh_entry, nh_list);
prev = &tmp->nh_list;
if (nh_hashval(tmp->data) & nentries) {
/* put entry after nh */
break;
}
}
}
list_add_rcu(&entry->nh_list, prev);
return 0;
}
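/* re-partition a chain in place: pull every node off onto a private
* list and re-insert it, so low-bit entries again come before
* high-bit entries for the given table size */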
/* called with head's lock held */
static void __nh_sort_chain(struct nh_head * head, unsigned long nentries)
{
struct list_head tmp, *list = &head->list;
struct nh_entry* entry;
INIT_LIST_HEAD(&tmp);
list_splice_init(list, &tmp);
while ( !list_empty(&tmp) ) {
struct list_head * first = tmp.next;
list_del(first);
entry = (struct nh_entry*)first;
__nh_insert(entry, head, nentries);
}
}
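/*
* RCU callback: runs once no reader can still hold the old nh.
* Under the new heads' locks, unlink the old bucket head from each
* combined chain, split the chain between the two new heads, and
* re-sort both halves so the partition invariant holds for the new
* table size.
*/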
static void nh_fixup_grow(struct rcu_head * head)
{
struct nh * nh;
struct nh * oh = container_of(head, struct nh, rcu_head);
unsigned long i, oentries = oh->nentries;
rcu_read_lock();
nh = get_nh();
/* this is an rcu_callback - only the "new" nh is
* visible elsewhere now, so make sure to access
* the hashtable using the proper (new) locks,... */
for (i = 0; i < oentries; i++) {
struct nh_head * nhead1 = &nh->hash[i];
struct nh_head * nhead2 = &nh->hash[i + oentries];
struct list_head *olist, *tail1, *tail2;
/* grab locks in order */
spin_lock(&nhead1->lock);
spin_lock(&nhead2->lock);
olist = &oh->hash[i].list;
/* readers that entered via the new table can still be walking
* this node; plain list_del() would poison ->next under them */
list_del_rcu(olist);
tail1 = nhead2->list.prev;
tail2 = nhead1->list.prev;
nhead1->list.prev = tail1;
nhead2->list.prev = tail2;
tail1->next = &nhead1->list;
tail2->next = &nhead2->list;
__nh_sort_chain(nhead1, oentries << 1);
__nh_sort_chain(nhead2, oentries << 1);
spin_unlock(&nhead2->lock);
spin_unlock(&nhead1->lock);
}
rcu_read_unlock();
nh_free(head);
}
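/*
* RCU callback, shrink flavour: the two old bucket heads are still
* threaded through each merged chain, so unlink them and re-sort the
* chain for the new (smaller) table size.
*/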
static void nh_fixup_shrink(struct rcu_head * head)
{
struct nh * nh;
struct nh * oh = container_of(head, struct nh, rcu_head);
unsigned long i, nentries;
rcu_read_lock();
nh = get_nh();
nentries = nh->nentries;
/* this is an rcu_callback - only the "new" nh is
* visible elsewhere now, so make sure to access
* the hashtable using the proper (new) locks,... */
for (i = 0; i < nh->nentries; i++) {
struct nh_head * nhead = &nh->hash[i];
struct list_head * olist1 = &oh->hash[i].list;
struct list_head * olist2 = &oh->hash[i + nentries].list;
spin_lock(&nhead->lock);
list_del_rcu(olist1); /* new-table readers may be mid-chain, */
list_del_rcu(olist2); /* so avoid poisoning ->next */
__nh_sort_chain(nhead, nentries);
spin_unlock(&nhead->lock);
}
rcu_read_unlock();
nh_free(head);
}
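/* snapshot the current table pointer; the seqlock guarantees a
* consistent read across the pointer swap in nh_replace(). An
* rcu_assign_pointer()/rcu_dereference() pair would arguably be the
* more idiomatic way to publish __nh. */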
/* called under rcu_read_lock */
static struct nh * get_nh(void)
{
unsigned long seq;
struct nh * nh = NULL;
do {
seq = read_seqbegin(&nethash_seq);
nh = __nh;
} while (read_seqretry(&nethash_seq, seq));
return nh;
}
/* use returned value only under rcu_read_lock */
static struct nh * nh_replace(struct nh * new)
{
struct nh * old = NULL;
write_seqlock(&nethash_seq);
old = __nh;
__nh = new;
write_sequnlock(&nethash_seq);
return old;
}
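/* swap in a NULL table so new lookups find nothing, then free every
* entry (and finally the table itself) after a grace period */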
static void nh_destroy(void)
{
unsigned long i;
struct nh * nh;
rcu_read_lock();
nh = nh_replace(NULL);
if (!nh)
goto out_unlock;
for (i = 0; i < nh->nentries; i++) {
struct nh_head * head = &nh->hash[i];
struct list_head *list;
spin_lock(&head->lock);
list_for_each_rcu(list, &head->list) {
struct nh_item * item = (struct nh_item *)list;
struct nh_entry *entry;
if (item->nh_type != NH_ENTRY)
continue;
entry = list_entry(list, struct nh_entry, nh_list);
list_del_rcu(&entry->nh_list);
nprintk("%s: delinking entry with data %lu\n",
__FUNCTION__, entry->data)
call_rcu(&entry->rcu_head, nh_entry_free);
}
spin_unlock(&head->lock);
}
call_rcu(&nh->rcu_head, nh_free);
out_unlock:
rcu_read_unlock();
}
static void nh_init(struct nh* nh,
struct nh_head * hash,
unsigned long nentries)
{
unsigned long i;
for (i = 0; i < nentries; i++) {
INIT_LIST_HEAD(&hash[i].list);
hash[i].nh_type = NH_HEAD;
spin_lock_init(&hash[i].lock);
}
nh->hash = hash;
nh->nentries = nentries;
}
static struct nh * nh_alloc(unsigned long nentries)
{
struct nh * nh;
unsigned long sz;
struct nh_head * hash;
/* nentries must be a power of 2 */
nentries = roundup_pow_of_two(nentries);
sz = nentries * sizeof(struct nh_head);
nprintk("%s: creating hash of %lu entries, size %lu bytes\n",
__FUNCTION__, nentries, sz);
nh = kmalloc(sizeof(struct nh), GFP_KERNEL);
if (!nh) {
nprintk("%s: failed to allocate hash of size %lu\n",
__FUNCTION__, sz);
return NULL;
}
hash = system_hash_alloc(sz);
if (!hash) {
nprintk("%s: failed to allocate hash of size %lu\n",
__FUNCTION__, sz);
kfree(nh);
return NULL;
}
nh_init(nh, hash, nentries);
return nh;
}
static int nh_create(unsigned long nentries)
{
struct nh * nh;
nh_destroy();
nh = nh_alloc(nentries);
if (!nh)
return -ENOMEM;
nh_replace(nh);
return 0;
}
static unsigned long nh_bucket(unsigned long hashval, unsigned long nentries)
{
/* nentries is a power of 2 */
return (hashval & (nentries - 1));
}
static int nh_add(unsigned long data)
{
struct nh_entry * entry;
struct nh * nh = NULL;
int ret = 0;
entry = kmalloc(sizeof(struct nh_entry), GFP_KERNEL);
if (!entry) return -ENOMEM;
entry->data = data;
entry->nh_type = NH_ENTRY;
rcu_read_lock();
nh = get_nh();
if (nh) {
struct nh_head *hash = nh->hash;
unsigned long nentries = nh->nentries;
unsigned long hashval = nh_hashval(data);
unsigned long bucket = nh_bucket(hashval, nentries);
struct nh_head * head = &hash[bucket];
spin_lock(&head->lock);
ret = __nh_insert(entry, head, nentries);
spin_unlock(&head->lock);
} else {
/* no table to insert into - don't leak the entry */
kfree(entry);
ret = -ENOENT;
}
rcu_read_unlock();
return ret;
}
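/*
* Pre-publication step when growing: thread both new bucket heads
* into each old chain (the head for bucket i right after the old
* head, the head for bucket i+oentries at the split point), so
* readers see every entry whether they enter via the old or the new
* table.
*/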
static void nh_migrate_grow(struct nh * old, struct nh * new)
{
unsigned long i, oentries = old->nentries;
/* this is called prior to calling replace_nh(), so
* only the "old" nh is visible elsewhere - only
* access the hashtable under the appropriate "old"
* locks */
for ( i = 0; i < oentries; i++ ) {
struct nh_head * ohead = &old->hash[i];
struct list_head * olist = &ohead->list;
struct list_head * nlist1 = &new->hash[i].list;
struct list_head * nlist2 = &new->hash[i + oentries].list;
struct nh_entry *entry;
struct list_head * list, * prev = NULL;
nprintk("%s: migrating bucket %lu\n", __FUNCTION__, i);
spin_lock(&ohead->lock);
list_add_rcu(nlist1, olist);
/* find where to split chain */
list_for_each_rcu(list, olist) {
struct nh_item * item = (struct nh_item *)list;
if (item->nh_type != NH_ENTRY) {
prev = list;
continue;
}
entry = list_entry(list, struct nh_entry, nh_list);
if (nh_hashval(entry->data) & oentries) {
break;
}
prev = list;
}
BUG_ON(prev == NULL);
list_add_rcu(nlist2, prev);
spin_unlock(&ohead->lock);
}
}
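/*
* Pre-publication step when shrinking: concatenate the two old
* chains that fold into new bucket i and thread the new bucket head
* into the combined chain, again keeping every entry visible from
* both tables. XXX the stack node tmp is briefly reader-visible
* between the list_add_tail_rcu() and the splice.
*/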
static void nh_migrate_shrink(struct nh * old, struct nh * new)
{
unsigned long i, oentries = old->nentries;
/* this is called prior to calling replace_nh(), so
* only the "old" nh is visible elsewhere - only
* access the hashtable under the appropriate "old"
* locks */
for ( i = 0; i < oentries >> 1; i++ ) {
struct list_head * nlist = &new->hash[i].list;
struct nh_head * ohead1 = &old->hash[i];
struct nh_head * ohead2 = &old->hash[i + (oentries>>1)];
struct list_head * olist1, * olist2, tmp;
nprintk("%s: migrating bucket %lu\n", __FUNCTION__, i);
/* acquire locks in order */
spin_lock(&ohead1->lock);
spin_lock(&ohead2->lock);
olist1 = &ohead1->list;
olist2 = &ohead2->list;
INIT_LIST_HEAD(&tmp);
list_add_tail_rcu(&tmp, olist2);
list_splice_init(&tmp, olist1->prev);
list_add_tail_rcu(nlist, olist1);
spin_unlock(&ohead2->lock);
spin_unlock(&ohead1->lock);
}
}
unsigned long nh_nentries_grow(void) {
unsigned long nentries = 0;
/* we're called under rcu_read_lock */
struct nh * nh = get_nh();
if (nh)
nentries = nh->nentries << 1;
return nentries;
}
unsigned long nh_nentries_shrink(void) {
unsigned long nentries = 0;
/* we're called under rcu_read_lock */
struct nh * nh = get_nh();
if (nh)
nentries = nh->nentries >> 1;
return nentries;
}
enum { NH_GROW_OPS, NH_SHRINK_OPS, NH_MAX_OPS };
/* all of the nh_resize_ops must be called under the
* nethash_resize_mutex and rcu_read_lock
*/
struct nh_resize_ops {
unsigned long (*nh_nentries_op) (void);
void (*nh_migrate_op) (struct nh *old, struct nh *new);
void (*nh_fixup_op) (struct rcu_head *head);
} nh_resize_ops [NH_MAX_OPS] = {
{
nh_nentries_grow,
nh_migrate_grow,
nh_fixup_grow,
},
{
nh_nentries_shrink,
nh_migrate_shrink,
nh_fixup_shrink,
},
};
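/*
* The resize protocol, in order:
* 1. allocate the new table;
* 2. migrate: thread the new bucket heads into the live chains;
* 3. publish the new nh with nh_replace();
* 4. after a grace period, the fixup callback detaches the old
* heads and frees the old table.
*/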
static int nh_resize(long dir)
{
struct nh *nh, *new_nh;
unsigned long new_nentries;
int err = 0;
struct nh_resize_ops * nh_ops;
if (dir > 0)
nh_ops = &nh_resize_ops[NH_GROW_OPS];
else if (dir < 0)
nh_ops = &nh_resize_ops[NH_SHRINK_OPS];
else
return -EINVAL;
mutex_lock(&nethash_resize_mutex);
rcu_read_lock();
nh = get_nh();
if (!nh)
goto out_unlock;
new_nentries = nh_ops->nh_nentries_op();
nprintk("%s: resizing nh to %lu buckets\n", __FUNCTION__,
new_nentries);
new_nh = nh_alloc(new_nentries);
if (!new_nh) {
err = -ENOMEM;
goto out_unlock;
}
nh_ops->nh_migrate_op(nh, new_nh);
nh_dump(nh, "old nh [post migration]");
nh_dump(new_nh, "new nh [newborn]");
nh = nh_replace(new_nh);
/* now callers of get_nh() will get the updated
* version, but references to the old nh can still
* be in use. It must be possible for users to see
* all elements of the table using either the old
* or new nh */
call_rcu(&nh->rcu_head, nh_ops->nh_fixup_op);
rcu_read_unlock();
synchronize_net();
/* by holding the nethash_resize_mutex until after
* synchronize_net() we are sure that there are at
* most 2 nh structures that we need to be
* concerned with */
mutex_unlock(&nethash_resize_mutex);
rcu_read_lock();
nh_dump(new_nh, "new nh [done]");
rcu_read_unlock();
return err;
out_unlock:
rcu_read_unlock();
mutex_unlock(&nethash_resize_mutex);
return err;
}
static int nh_getalong(const char __user *buf, size_t count, long *res)
{
char kbuf[_NETHASH_BUFSZ_+1];
if (count > _NETHASH_BUFSZ_)
return -EINVAL;
if (copy_from_user(kbuf, buf, count))
return -EFAULT;
kbuf[count] = '\0';
if (sscanf(kbuf, "%ld", res) != 1)
return -EINVAL;
return 0;
}
ssize_t nh_create_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
long res;
int ret;
if ((ret = nh_getalong(buf, count, &res)) != 0)
return ret;
if (res <= 0)
return -EINVAL;
if ((ret = nh_create((unsigned long)res)) != 0)
return ret;
return count;
}
static struct file_operations nh_create_fops = {
.write = nh_create_write,
};
ssize_t nh_add_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
long res;
int ret;
if ((ret = nh_getalong(buf, count, &res)) != 0)
return ret;
if (res <= 0)
return -EINVAL;
if ((ret = nh_add((unsigned long)res)) != 0)
return ret;
return count;
}
static struct file_operations nh_add_fops = {
.write = nh_add_write,
};
ssize_t nh_resize_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
long res;
int ret;
if ((ret = nh_getalong(buf, count, &res)) != 0)
return ret;
if ((ret = nh_resize(res)) != 0)
return ret;
return count;
}
static struct file_operations nh_resize_fops = {
.write = nh_resize_write,
};
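/* seq_file iterator for /proc/nethash/dump: d_start() takes
* rcu_read_lock() and d_stop() drops it, so the whole walk is safe
* against a concurrent resize */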
static void * d_start (struct seq_file *m, loff_t *pos)
{
struct nh * nh;
rcu_read_lock();
nh = get_nh();
if (nh) {
struct nh_head *hash = nh->hash;
unsigned long nentries = nh->nentries;
if (nentries > *pos) {
seq_printf(m, "%lu: ", (unsigned long)*pos);
return (&hash[*pos]);
}
}
return NULL;
}
static void * d_next (struct seq_file *m, void *v, loff_t *pos)
{
struct nh * nh;
(*pos)++;
/* rcu_read_lock is still held */
nh = get_nh();
if (nh) {
struct nh_head *hash = nh->hash;
unsigned long nentries = nh->nentries;
if (nentries > *pos) {
seq_printf(m, "%lu: ", (unsigned long)*pos);
return (&hash[*pos]);
}
}
return NULL;
}
static void d_stop (struct seq_file *m, void *v)
{
rcu_read_unlock();
}
static int d_show (struct seq_file *m, void *v)
{
struct nh_head *head = v;
struct nh_entry *entry;
struct list_head * list;
rcu_read_lock();
list_for_each_rcu(list, &head->list) {
struct nh_item * item = (struct nh_item *)list;
switch (item->nh_type) {
case NH_ENTRY:
entry = list_entry(list, struct nh_entry, nh_list);
seq_printf(m, "%lu", entry->data);
seq_printf(m, "%s", " -> ");
break;
case NH_HEAD:
seq_printf(m, "%s", " .. ");
break;
default:
seq_printf(m, "%s", "bad nh_type!\n");
}
}
rcu_read_unlock();
seq_printf(m, "%s", "\n");
return 0;
}
static struct seq_operations nh_dump_op = {
.start = d_start,
.next = d_next,
.stop = d_stop,
.show = d_show,
};
static int nh_dump_open(struct inode *inode, struct file *file)
{
return seq_open(file, &nh_dump_op);
}
static struct file_operations proc_nh_dump_operations = {
.open = nh_dump_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
/* XXX nh_dump() is for debug only
* it must be called under rcu_read_lock() */
static int nh_dump(struct nh * nh, const char * debug_str)
{
unsigned long i;
struct nh_head * head;
struct nh_entry *entry;
struct list_head * list;
nprintk("%s: %s\n", __FUNCTION__, debug_str);
for (i = 0; i < nh->nentries; i++ ) {
head = &nh->hash[i];
nprintk("[%lu]: (%p)", i, head);
list_for_each_rcu(list, &head->list) {
struct nh_item * item = (struct nh_item *)list;
nprintk(" %p ", list);
switch (item->nh_type) {
case NH_ENTRY:
entry = list_entry(list, struct nh_entry,
nh_list);
nprintk("%lu -> ", entry->data);
break;
case NH_HEAD:
nprintk(" .. ");
break;
default:
nprintk("bad nh_type!\n");
}
}
nprintk("\n");
}
return 0;
}
static int nethash_module_init(void)
{
struct proc_dir_entry *entry;
nh_proc_dir = proc_mkdir(MODULE_NAME, NULL);
if (!nh_proc_dir) return -ENODEV;
/* first the entry to create the hashtable */
entry = create_proc_entry("create", 0644, nh_proc_dir);
if (!entry) return -ENODEV;
entry->proc_fops = &nh_create_fops;
/* and the entry to add an element to the hashtable */
entry = create_proc_entry("add", 0644, nh_proc_dir);
if (!entry) return -ENODEV;
entry->proc_fops = &nh_add_fops;
/* the entry to dump the hashtable */
entry = create_proc_entry("dump", 0444, nh_proc_dir);
if (!entry) return -ENODEV;
entry->proc_fops = &proc_nh_dump_operations;
/* and to resize the hashtable */
entry = create_proc_entry("resize", 0644, nh_proc_dir);
if (!entry) return -ENODEV;
entry->proc_fops = &nh_resize_fops;
return 0;
}
static void nethash_module_exit(void)
{
nh_destroy();
rcu_barrier(); /* don't unload until rcu callbacks are done
* XXX is this really safe? */
if (nh_proc_dir) {
remove_proc_entry("resize", nh_proc_dir);
remove_proc_entry("dump", nh_proc_dir);
remove_proc_entry("add", nh_proc_dir);
remove_proc_entry("create", nh_proc_dir);
}
remove_proc_entry(MODULE_NAME, NULL);
printk(KERN_INFO "%s removed\n", MODULE_NAME);
}
module_init(nethash_module_init);
module_exit(nethash_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("[EMAIL PROTECTED]");