Thanks for the review.

Overnight, I found myself thinking about an alternative approach that
might perform even better.  I am going to do a few experiments to find
out before I make any specific responses to your comments.  (But I
imagine that many of them will apply anyway.)

On Fri, May 02, 2014 at 11:23:34AM -0700, Jarno Rajahalme wrote:
> On May 1, 2014, at 5:14 PM, Ben Pfaff <b...@nicira.com> wrote:
> 
> ...
> 
> > The results show:
> > 
> >    - Insertion is generally 3x to 5x faster in an hmap.
> > 
> >    - Iteration is generally about 3x faster in a cmap.
> > 
> >    - Search and mutation is 4x faster with .1% mutations and the
> >      advantage grows as the fraction of mutations grows.  This is
> >      because a cmap does not require locking for read operations,
> >      even in the presence of a writer.
> > 
> >      With no mutations, however, no locking is required in the hmap
> >      case, and the hmap is somewhat faster.  This is because raw hmap
> >      search is somewhat simpler and faster than raw cmap search.
> > 
> 
> The "no mutations" applies to "runtime constant" data only, so this means 
> that we should keep using hmap for maps that are initialized when OVS starts, 
> and remain unchanged after that?
> 
> ...
> 
> > diff --git a/lib/cmap.c b/lib/cmap.c
> > new file mode 100644
> > index 0000000..02d6b8f
> > --- /dev/null
> > +++ b/lib/cmap.c
> > @@ -0,0 +1,765 @@
> > +/*
> > + * Copyright (c) 2014 Nicira, Inc.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#include <config.h>
> > +#include "cmap.h"
> > +#include "hash.h"
> > +#include "ovs-rcu.h"
> > +#include "random.h"
> > +#include "util.h"
> > +
> > +/* Optimistic Concurrent Cuckoo Hash
> > + * =================================
> > + *
> > + * A "cuckoo hash" is an open addressing hash table schema, designed such 
> > that
> > + * a given element can be in one of only a small number of buckets 'd', 
> > each of
> > + * which each holds up to a small number 'k' elements.  Thus, the expected 
> > and
> 
> "which holds"?
> 
> > + * worst-case lookup times are O(1) because they require comparing no more than
> > + * a fixed number of elements (k * d).  Inserting a new element can require
> > + * moving around existing elements, but it is also O(1) amortized expected
> > + * time.
> > + *
> 
> ...
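
(As an aside for anyone reading along: the lookup that the comment above
describes boils down to roughly the sketch below.  Illustrative only; it
ignores the per-bucket counter described further down and is not the patch's
actual code.)

    /* Sketch: a lookup probes at most d=2 buckets of CMAP_K slots each, so it
     * compares no more than k*d hashes no matter how large the table is. */
    static struct cmap_node *
    lookup_sketch(const struct cmap_impl *impl, uint32_t hash)
    {
        uint32_t h1 = rehash(impl, hash);   /* First candidate bucket. */
        uint32_t h2 = other_hash(h1);       /* Second candidate bucket. */
        uint32_t hs[2] = { h1, h2 };
        int d, i;

        for (d = 0; d < 2; d++) {
            const struct cmap_bucket *b = &impl->buckets[hs[d] & impl->mask];

            for (i = 0; i < CMAP_K; i++) {
                if (b->nodes[i] && b->hashes[i] == hs[d]) {
                    return b->nodes[i];     /* Caller still compares keys. */
                }
            }
        }
        return NULL;
    }
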
> 
> > +/* A cuckoo hash bucket.  Designed to be cache-aligned and exactly one cache
> > + * line long. */
> > +struct cmap_bucket {
> > +    /* Allows readers to track in-progress changes.  Initially zero, each
> > +     * writer increments this value just before and just after each change (see
> > +     * cmap_set_bucket()).  Thus, a reader can ensure that it gets a consistent
> > +     * snapshot by waiting for the counter to become even (see
> > +     * read_even_counter()), then checking that its value does not change while
> > +     * examining the bucket (see cmap_find()). */
> > +    atomic_uint32_t counter;
> 
> Do the atomic operations generate CPU instructions, or only compiler barriers?
> 
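
(For what it's worth, the read protocol that comment describes comes out
roughly like the sketch below.  The memory orders are approximate and this is
not the patch's actual code, just the shape of it.)

    /* Sketch: take a consistent snapshot of a bucket without locking.
     * A writer makes 'counter' odd while it is mid-update, so a reader waits
     * for an even value, copies the slots, and retries if the counter moved. */
    static void
    read_bucket_snapshot(struct cmap_bucket *b,
                         uint32_t hashes[CMAP_K],
                         struct cmap_node *nodes[CMAP_K])
    {
        uint32_t c0, c1;
        int i;

        do {
            /* read_even_counter(): spin until no write is in progress. */
            do {
                atomic_read_explicit(&b->counter, &c0, memory_order_acquire);
            } while (c0 & 1);

            for (i = 0; i < CMAP_K; i++) {
                hashes[i] = b->hashes[i];
                nodes[i] = b->nodes[i];
            }

            /* Keep the slot reads above from drifting past the re-check. */
            atomic_thread_fence(memory_order_acquire);
            atomic_read_explicit(&b->counter, &c1, memory_order_relaxed);
        } while (c0 != c1);     /* A writer touched the bucket; retry. */
    }
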
> > +
> > +    /* (hash, node) slots.  They are parallel arrays instead of an array of
> > +     * structs to reduce the amount of space lost to padding.
> > +     *
> > +     * The slots are in no particular order.  A null pointer indicates that a
> > +     * pair is unused.  In-use slots are not necessarily in the earliest
> > +     * slots. */
> > +    uint32_t hashes[CMAP_K];
> > +    struct cmap_node *nodes[CMAP_K];
> > +
> > +    /* Padding to make cmap_bucket exactly one cache line long. */
> > +#if CMAP_PADDING > 0
> > +    uint8_t pad[CMAP_PADDING];
> > +#endif
> > +};
> > +BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE);
> > +
> > +/* Default maximum load factor (as a fraction of UINT32_MAX + 1) before
> > + * enlarging a cmap.  Reasonable values lie between about 75% and 93%.  Smaller
> > + * values waste memory; larger values increase the average insertion time. */
> > +#define CMAP_DEFAULT_MAX_LOAD ((uint32_t) (UINT32_MAX * .85))
> > +
> > +/* The implementation of a concurrent hash map. */
> > +struct cmap_impl {
> > +    unsigned int n;             /* Number of in-use elements. */
> > +    unsigned int max_n;         /* Max elements before enlarging. */
> > +    unsigned int max_load;      /* Max load as fraction of UINT32_MAX. */
> > +    uint32_t mask;              /* Number of 'buckets', minus one. */
> > +    uint32_t basis;             /* Basis for rehashing client's hash values. */
> > +
> > +    /* Padding to make cmap_impl exactly one cache line long. */
> > +    uint8_t pad[CACHE_LINE_SIZE - sizeof(unsigned int) * 5];
> > +
> > +    struct cmap_bucket buckets[];
> > +};
> > +BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE);
> > +
> > +static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask);
> > +
> > +/* Given a rehashed value 'hash', returns the other hash for that rehashed
> > + * value.  This is symmetric: other_hash(other_hash(x)) == x.  (See also "Hash
> > + * Functions" at the top of this file.) */
> > +static uint32_t
> > +other_hash(uint32_t hash)
> > +{
> > +    return (hash << 16) | (hash >> 16);
> > +}
> > +
> > +/* Returns the rehashed value for 'hash' within 'impl'.  (See also "Hash
> > + * Functions" at the top of this file.) */
> > +static uint32_t
> > +rehash(const struct cmap_impl *impl, uint32_t hash)
> > +{
> > +    return mhash_finish(mhash_add(impl->basis, hash), 4);
> > +}
> 
> Just refinishing might work as well.  Consider the user-computed hash as 
> "unfinished", and rehash() finishing it with 'impl->basis' as 'n_bytes'.  The 
> interpretation of 'n_bytes' depends on the application of the hash function. 
> Since we are always rehashing a fixed-size value, we need not interpret 
> 'n_bytes' as the real length of the data being hashed.  For mhash_finish(), 
> 'n_bytes' is just a 32-bit number.  Given this, the following might work just 
> as well, and should be about twice as fast:
> 
> static uint32_t
> rehash(const struct cmap_impl *impl, uint32_t hash)
> {
>     return mhash_finish(hash, impl->basis);
> }
> 
> ...
> 
> > +
> > +/* Changes the maximum load factor before enlarging 'cmap' to 'max_load', which
> > + * is a fraction of UINT32_MAX + 1.  For example, a 'max_load' of UINT32_MAX/2
> > + * would cause 'cmap' to be enlarged when it is half full.
> > + *
> > + * 'cmap' must be empty.
> > + *
> > + * Reasonable load factors lie between about 75% and 93%.
> > + *
> > + * This function is intended for testing.  Otherwise, it is best to use the
> > + * default load factor (see CMAP_DEFAULT_MAX_LOAD). */
> > +void
> > +cmap_set_max_load_factor(struct cmap *cmap, uint32_t max_load)
> > +{
> > +    struct cmap_impl *impl = cmap_get_impl(cmap);
> > +
> > +    /* Not worth implementing rehashing for changing the max load factor. */
> > +    ovs_assert(cmap_is_empty(cmap));
> > +
> > +    /* Force the max load factor into a reasonable range:
> > +     *
> > +     *    - At least one element per bucket.
> > +     *
> > +     *    - At most 93%, based on Fig. 1 in Erlingsson [4].
> > +     */
> > +    max_load = MAX(max_load, UINT32_MAX / CMAP_K + 1);
> > +    max_load = MIN(max_load, (uint32_t) (UINT32_MAX * .93));
> 
> Since this is intended for testing, how about asserting instead?  You might 
> get strange test results if 'max_load' is silently adjusted.
> 
> > +    impl->max_load = max_load;
> > +    impl->max_n = calc_max_n(impl->mask, max_load);
> > +}
> > +
> 
> ...
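
(On the clamping above: I read the suggestion as replacing the MAX/MIN
adjustment with asserts, along these lines.  Sketch only.)

    /* Sketch: reject unreasonable values loudly instead of silently
     * adjusting them, since this function is only meant for tests. */
    ovs_assert(max_load >= UINT32_MAX / CMAP_K + 1);        /* >= 1 per bucket. */
    ovs_assert(max_load <= (uint32_t) (UINT32_MAX * .93));  /* <= 93%. */
    impl->max_load = max_load;
    impl->max_n = calc_max_n(impl->mask, max_load);
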
> 
> > +static bool
> > +cmap_insert_dup(struct cmap_impl *impl, struct cmap_node *new_node, uint32_t h)
> > +{
> > +    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
> > +    int i;
> > +
> > +    for (i = 0; i < CMAP_K; i++) {
> > +        struct cmap_node *node = b->nodes[i];
> > +        if (node && b->hashes[i] == h && node->hash == new_node->hash) {
> > +            struct cmap_node *p;
> > +
> > +            p = new_node;
> > +            while (p->next) {
> > +                p = p->next;
> > +            }
> 
> In what cases does the new_node have a chain of nodes hanging on it?
> 
> > +            p->next = node;
> > +            cmap_set_bucket(b, i, new_node, h);
> > +
> 
> I guess this is cheaper than scanning to the end of the duplicates and 
> changing the NULL pointer to point to the 'new_node'.
> 
> > +            return true;
> > +        }
> > +    }
> > +    return false;
> > +}
> > +
> > +static void
> > +cmap_check_hashes(struct cmap_impl *impl)
> > +{
> > +    struct cmap_bucket *b;
> > +
> > +    return;
> > +    for (b = impl->buckets; b <= &impl->buckets[impl->mask]; b++) {
> > +        int i;
> > +
> > +        for (i = 0; i < CMAP_K; i++) {
> > +            struct cmap_node *node = b->nodes[i];
> > +            uint32_t h = b->hashes[i];
> > +
> > +            ovs_assert(!node || (h & impl->mask) == b - impl->buckets);
> > +        }
> > +    }
> > +}
> > +
> 
> This is only for debugging? Might the calls to this affect the performance 
> numbers you reported?
> 
> > +static bool
> > +cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
> > +{
> > +    enum { MAX_PATH = 4 };
> > +    struct cmap_path {
> > +        struct cmap_bucket *end;
> > +        int start;
> 
> Is this 0 or 1 only?
> 
> > +        int n;
> 
> And this is in [0, MAX_PATH)?
> 
> > +        uint8_t slots[MAX_PATH];
> > +    };
> > +
> 
> Whose slots do the 'slots' point to?
> 
> > +    uint32_t h1 = rehash(impl, hash);
> > +    uint32_t h2 = other_hash(h1);
> > +
> > +    enum { MAX_QUEUE = 512 };
> > +    BUILD_ASSERT_DECL(IS_POW2(MAX_QUEUE));
> > +    struct cmap_path queue[MAX_QUEUE];
> > +    unsigned int head = 0, tail = 0;
> > +
> > +    /* Handle the easy cases. */
> > +    if (cmap_insert_dup(impl, node, h1) ||
> > +        cmap_insert_dup(impl, node, h2) ||
> > +        cmap_insert_bucket(impl, node, h1) ||
> > +        cmap_insert_bucket(impl, node, h2)) {
> > +        return true;
> > +    }
> > +
> 
> Do you know how much faster this would be if we did not allow duplicates, or 
> if we knew there are no duplicates?
> 
> > +    queue[head].end = &impl->buckets[h1 & impl->mask];
> > +    queue[head].start = 0;
> > +    queue[head].n = 0;
> > +    head++;
> > +
> > +    if ((h1 & impl->mask) != (h2 & impl->mask)) {
> > +        queue[head].end = &impl->buckets[h2 & impl->mask];
> > +        queue[head].start = 1;
> > +        queue[head].n = 0;
> > +        head++;
> > +    }
> > +
> 
> /* 'queue' is full when it has space for fewer than the CMAP_K entries that 
> might be added on each iteration. */
> 
> > +    while (head - tail > 0 && head - tail < MAX_QUEUE - CMAP_K) {
> > +        const struct cmap_path *p = &queue[tail++ & (MAX_QUEUE - 1)];
> > +        struct cmap_bucket *b = p->end;
> > +        int i;
> > +
> 
> /* Check if any of the nodes in '*b' can be moved to their alternate bucket, 
> thus making space for 'node'. */
> 
> > +        for (i = 0; i < CMAP_K; i++) {
> > +            struct cmap_bucket *b2;
> > +            int j;
> > +
> > +            b2 = &impl->buckets[other_hash(b->hashes[i]) & impl->mask];
> > +            j = cmap_find_empty_slot(b2);
> > +            if (j >= 0) {
> 
> /* Move nodes along the path found to make room for 'node'. */
> 
> > +                struct cmap_bucket *buckets[MAX_PATH + 2];
> > +                int slots[MAX_PATH + 2];
> > +                uint32_t h = p->start ? h2 : h1;
> > +                int k;
> > +
> > +                for (k = 0; k < p->n; k++) {
> > +                    slots[k] = p->slots[k];
> > +                }
> > +                slots[p->n] = i;
> > +                slots[p->n + 1] = j;
> > +
> 
> You lost me here. A few comments on how the path entries and the slots work 
> would be nice.
> 
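
(A rough narration, in case it helps while I work the comments over: each
queue entry describes a path that starts at one of 'node''s two candidate
buckets ('start' says which of the two) and currently ends at bucket 'end';
'slots[k]' is the slot chosen in the k-th bucket along the path, and 'n' is
the number of hops recorded so far.  Once an empty slot 'j' turns up in 'b2',
the code rebuilds the full bucket sequence and shifts each displaced node one
step toward the empty slot, back to front, so every bucket stays consistent
at each intermediate step.  For example, with p->n == 1 the arrays work out
roughly like this; illustration only, not code from the patch.)

    /*
     *   buckets[0]: the candidate bucket of 'node' selected by 'start' (full)
     *   buckets[1]: alternate bucket of the node in buckets[0], slot slots[0]
     *   buckets[2]: b2, which has the empty slot j
     *
     *   slots[] = { p->slots[0], i, j }
     *
     * The moves then happen back to front:
     *
     *   buckets[1] slot i        -> buckets[2] slot j
     *   buckets[0] slot slots[0] -> buckets[1] slot i
     *   'node'                   -> buckets[0] slot slots[0]
     */
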
> > +                buckets[0] = &impl->buckets[h & impl->mask];
> > +                for (k = 0; k <= p->n; k++) {
> > +                    buckets[k + 1] = &impl->buckets[other_hash(buckets[k]->hashes[slots[k]]) & impl->mask];
> > +                }
> > +
> > +                cmap_check_hashes(impl);
> 
> Maybe this check should be made compile-time conditional?
> 
> > +                for (k = p->n + 1; k > 0; k--) {
> > +                    int slot = slots[k - 1];
> > +
> > +                    cmap_set_bucket(buckets[k], slots[k],
> > +                                    buckets[k - 1]->nodes[slot],
> > +                                    other_hash(buckets[k - 1]->hashes[slot]));
> > +                    cmap_check_hashes(impl);
> 
> Ditto.
> 
> > +                }
> > +                cmap_set_bucket(buckets[0], slots[0], node, h);
> > +                cmap_check_hashes(impl);
> > +
> 
> This too?
> 
> > +                return true;
> > +            }
> > +
> 
> /* 'b2' was full; add it to the path to be explored in a later iteration. */
> 
> > +            if (b2 != b && p->n < MAX_PATH) {
> > +                struct cmap_path *p2 = &queue[head++ & (MAX_QUEUE - 1)];
> > +
> > +                *p2 = *p;
> > +                p2->end = b2;
> > +                p2->slots[p2->n++] = i;
> > +            }
> > +        }
> > +    }
> > +
> > +    return false;
> > +}
> > +
> > +/* Inserts 'node', with the given 'hash', into 'cmap'.  The caller must ensure
> > + * that 'cmap' cannot change concurrently (from another thread).  If duplicates
> > + * are undesirable, the caller must have already verified that 'cmap' does not
> > + * contain a duplicate of 'node'. */
> > +void
> > +cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash)
> > +{
> > +    struct cmap_impl *impl = cmap_get_impl(cmap);
> > +
> > +    node->hash = hash;
> > +    node->next = NULL;
> > +
> > +    if (OVS_UNLIKELY(impl->n >= impl->max_n)) {
> > +        impl = cmap_rehash(cmap, (impl->mask << 1) | 1);
> > +    }
> > +
> > +    while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) {
> > +        impl = cmap_rehash(cmap, impl->mask);
> > +    }
> > +    impl->n++;
> > +}
> > +
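
(The single-writer contract above is the one I have in mind for callers,
roughly as below.  The lookup helper, mutex, and 'entry' are hypothetical;
sketch only.)

    /* Sketch: writers serialize among themselves; readers take no lock.
     * 'entry' is the caller's element embedding a struct cmap_node. */
    ovs_mutex_lock(&writer_mutex);
    if (!my_lookup(cmap, key, hash)) {      /* Hypothetical duplicate check. */
        cmap_insert(cmap, &entry->node, hash);
    }
    ovs_mutex_unlock(&writer_mutex);
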
> > +static bool
> > +cmap_remove__(struct cmap_impl *impl, struct cmap_node *node, uint32_t h)
> > +{
> > +    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
> > +    int slot;
> > +
> > +    slot = cmap_find_slot(b, node->hash, h);
> > +    if (slot < 0) {
> > +        return false;
> > +    }
> > +
> > +    if (b->nodes[slot] == node) {
> > +        cmap_set_bucket(b, slot, node->next, h);
> > +    } else {
> > +        struct cmap_node **prev;
> > +
> > +        prev = &b->nodes[slot]->next;
> > +        while (*prev != node) {
> > +            prev = &(*prev)->next;
> > +        }
> > +        *prev = node->next;
> > +        atomic_thread_fence(memory_order_acq_rel);
> 
> I would benefit from a comment on the fence here (why is it needed?).  Would 
> there need to be a corresponding fence on the iteration reading the next 
> pointer?
> 
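
(For context, the reader this has to protect is the one walking the duplicate
chain while the writer above unlinks a node from the middle of it, roughly the
fragment below from a hypothetical find routine; whether that walk needs an
explicit acquire on 'next', or whether the fence above suffices, is exactly
the question.  The comparison helper is hypothetical.)

    /* Sketch: concurrent reader traversing a duplicate chain.  It must never
     * see a half-updated 'next' pointer while the writer unlinks a node. */
    struct cmap_node *node;

    for (node = b->nodes[slot]; node; node = node->next) {
        if (node_matches(node, key)) {      /* Hypothetical key comparison. */
            return node;
        }
    }
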
> > +    }
> > +    return true;
> > +}
> > +
> > 
> ...
> 
> > diff --git a/lib/cmap.h b/lib/cmap.h
> > new file mode 100644
> > index 0000000..e51f8b8
> > --- /dev/null
> > +++ b/lib/cmap.h
> 
> ...
> 
> > +
> > +void cmap_cursor_init(struct cmap_cursor *, const struct cmap *);
> > +struct cmap_node *cmap_cursor_next(struct cmap_cursor *,
> > +                                   const struct cmap_node *);
> > +
> > +/* Another, less preferred, form of iteration. */
> 
> When should this then be used?
> 
> > +struct cmap_position {
> > +    unsigned int bucket;
> > +    unsigned int entry;
> > +    unsigned int offset;
> > +};
> > +
> > +struct cmap_node *cmap_next_position(const struct cmap *,
> > +                                     struct cmap_position *);
> > +
> > +#endif /* cmap.h */
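
(My reading of the two forms, based only on the prototypes here, is roughly
the usage below; the details may well be off, which I suppose argues for a
comment spelling out when each is appropriate.)

    /* Cursor form (preferred): plain iteration over the whole cmap. */
    struct cmap_cursor cursor;
    struct cmap_node *node;

    cmap_cursor_init(&cursor, cmap);
    for (node = cmap_cursor_next(&cursor, NULL); node;
         node = cmap_cursor_next(&cursor, node)) {
        /* ...use 'node'... */
    }

    /* Position form: iteration that can be suspended and resumed later by
     * carrying the position across calls, e.g. for batched dumps. */
    struct cmap_position pos = { 0, 0, 0 };

    while ((node = cmap_next_position(cmap, &pos)) != NULL) {
        /* ...use 'node'... */
    }
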
> > diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
> > index 180b66f..68db71f 100644
> > --- a/lib/ovs-thread.h
> > +++ b/lib/ovs-thread.h
> > @@ -40,7 +40,7 @@ struct OVS_LOCKABLE ovs_mutex {
> > 
> > #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
> > #define OVS_ADAPTIVE_MUTEX_INITIALIZER                  \
> > -    { PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, NULL }
> > +    { PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
> 
> This was in a separate patch already pushed?
> 
> > #else
> > #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
> > #endif
> > diff --git a/lib/util.h b/lib/util.h
> > index 743b9fe..a0670f1 100644
> > --- a/lib/util.h
> > +++ b/lib/util.h
> > @@ -278,6 +278,11 @@ char *xasprintf(const char *format, ...) PRINTF_FORMAT(1, 2) MALLOC_LIKE;
> > char *xvasprintf(const char *format, va_list) PRINTF_FORMAT(1, 0) MALLOC_LIKE;
> > void *x2nrealloc(void *p, size_t *n, size_t s);
> > 
> > +/* This system's cache line size, in bytes.
> > + * Being wrong hurts performance but not correctness. */
> > +#define CACHE_LINE_SIZE 64
> > +BUILD_ASSERT_DECL(IS_POW2(CACHE_LINE_SIZE));
> > +
> 
> We already have this in util.h, earlier in the file. It was added by commit 
> e2051008 (util: Move CACHE_LINE_SIZE here.). It seems I already accidentally 
> added a second define in commit 124f09c9 (lib: Add prefetch support (for 
> GCC)), so this would be a third one :-)
> 
> > void *xmalloc_cacheline(size_t) MALLOC_LIKE;
> > void *xzalloc_cacheline(size_t) MALLOC_LIKE;
> > void free_cacheline(void *);
> ...
> 
>   Jarno
> 
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
