On May 1, 2014, at 5:14 PM, Ben Pfaff <b...@nicira.com> wrote:

…

> The results show:
> 
>    - Insertion is generally 3x to 5x faster in an hmap.
> 
>    - Iteration is generally about 3x faster in a cmap.
> 
>    - Search and mutation is 4x faster with .1% mutations and the
>      advantage grows as the fraction of mutations grows.  This is
>      because a cmap does not require locking for read operations,
>      even in the presence of a writer.
> 
>      With no mutations, however, no locking is required in the hmap
>      case, and the hmap is somewhat faster.  This is because raw hmap
>      search is somewhat simpler and faster than raw cmap search.
> 

The “no mutations” case applies to “runtime constant” data only, so does this 
mean that we should keep using hmap for maps that are initialized when OVS 
starts and remain unchanged after that?

…

> diff --git a/lib/cmap.c b/lib/cmap.c
> new file mode 100644
> index 0000000..02d6b8f
> --- /dev/null
> +++ b/lib/cmap.c
> @@ -0,0 +1,765 @@
> +/*
> + * Copyright (c) 2014 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include "cmap.h"
> +#include "hash.h"
> +#include "ovs-rcu.h"
> +#include "random.h"
> +#include "util.h"
> +
> +/* Optimistic Concurrent Cuckoo Hash
> + * =================================
> + *
> + * A "cuckoo hash" is an open addressing hash table schema, designed such 
> that
> + * a given element can be in one of only a small number of buckets 'd', each 
> of
> + * which each holds up to a small number 'k' elements.  Thus, the expected 
> and

“which holds”

> + * worst-case lookup times are O(1) because they require comparing no more than
> + * a fixed number of elements (k * d).  Inserting a new element can require
> + * moving around existing elements, but it is also O(1) amortized expected
> + * time.
> + *

…

> +/* A cuckoo hash bucket.  Designed to be cache-aligned and exactly one cache
> + * line long. */
> +struct cmap_bucket {
> +    /* Allows readers to track in-progress changes.  Initially zero, each
> +     * writer increments this value just before and just after each change (see
> +     * cmap_set_bucket()).  Thus, a reader can ensure that it gets a consistent
> +     * snapshot by waiting for the counter to become even (see
> +     * read_even_counter()), then checking that its value does not change while
> +     * examining the bucket (see cmap_find()). */
> +    atomic_uint32_t counter;

Do the atomic operations generate CPU instructions, or only compiler barriers?
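
To make sure I understand the protocol, the reader side would be roughly like
this plain C11 sketch (just my reading of the comment above, not the patch's
code; the OVS atomics wrappers would presumably be used instead of
<stdatomic.h>).  As far as I know, on x86 an acquire load compiles to an
ordinary load plus a compiler barrier, which is what prompted the question:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Sketch: spin until the bucket's change counter is even, i.e. no write is
 * in progress, and return the observed value. */
static uint32_t
read_even_counter(atomic_uint *counter)
{
    uint32_t c;

    do {
        c = atomic_load_explicit(counter, memory_order_acquire);
    } while (c & 1);
    return c;
}

/* Sketch: after examining the bucket, a reader re-checks the counter; if it
 * changed, the snapshot may have been torn and the read must be retried. */
static bool
counter_changed(atomic_uint *counter, uint32_t expected)
{
    return atomic_load_explicit(counter, memory_order_acquire) != expected;
}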

> +
> +    /* (hash, node) slots.  They are parallel arrays instead of an array of
> +     * structs to reduce the amount of space lost to padding.
> +     *
> +     * The slots are in no particular order.  A null pointer indicates that a
> +     * pair is unused.  In-use slots are not necessarily in the earliest
> +     * slots. */
> +    uint32_t hashes[CMAP_K];
> +    struct cmap_node *nodes[CMAP_K];
> +
> +    /* Padding to make cmap_bucket exactly one cache line long. */
> +#if CMAP_PADDING > 0
> +    uint8_t pad[CMAP_PADDING];
> +#endif
> +};
> +BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE);
> +
> +/* Default maximum load factor (as a fraction of UINT32_MAX + 1) before
> + * enlarging a cmap.  Reasonable values lie between about 75% and 93%.  Smaller
> + * values waste memory; larger values increase the average insertion time. */
> +#define CMAP_DEFAULT_MAX_LOAD ((uint32_t) (UINT32_MAX * .85))
> +
> +/* The implementation of a concurrent hash map. */
> +struct cmap_impl {
> +    unsigned int n;             /* Number of in-use elements. */
> +    unsigned int max_n;         /* Max elements before enlarging. */
> +    unsigned int max_load;      /* Max load as fraction of UINT32_MAX. */
> +    uint32_t mask;              /* Number of 'buckets', minus one. */
> +    uint32_t basis;             /* Basis for rehashing client's hash values. */
> +
> +    /* Padding to make cmap_impl exactly one cache line long. */
> +    uint8_t pad[CACHE_LINE_SIZE - sizeof(unsigned int) * 5];
> +
> +    struct cmap_bucket buckets[];
> +};
> +BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE);
> +
> +static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask);
> +
> +/* Given a rehashed value 'hash', returns the other hash for that rehashed
> + * value.  This is symmetric: other_hash(other_hash(x)) == x.  (See also "Hash
> + * Functions" at the top of this file.) */
> +static uint32_t
> +other_hash(uint32_t hash)
> +{
> +    return (hash << 16) | (hash >> 16);
> +}
> +
> +/* Returns the rehashed value for 'hash' within 'impl'.  (See also "Hash
> + * Functions" at the top of this file.) */
> +static uint32_t
> +rehash(const struct cmap_impl *impl, uint32_t hash)
> +{
> +    return mhash_finish(mhash_add(impl->basis, hash), 4);
> +}

Just refinishing might work as well. Consider the user-computed hash as 
“unfinished”, and have rehash() finish it with ‘impl->basis’ as “n_bytes”. 
The interpretation of “n_bytes” depends on the application of the hash 
function: since we are always rehashing a fixed-size value, we need not 
interpret “n_bytes” as the real length of the data being hashed.  For 
mhash_finish(), “n_bytes” is just a 32-bit number.  Given this, the following 
might work as well, and should be about twice as fast:

static uint32_t
rehash(const struct cmap_impl *impl, uint32_t hash)
{
    return mhash_finish(hash, impl->basis);
}

…

> +
> +/* Changes the maximum load factor before enlarging 'cmap' to 'max_load', which
> + * is a fraction of UINT32_MAX + 1.  For example, a 'max_load' of UINT32_MAX/2
> + * would cause 'cmap' to be enlarged when it is half full.
> + *
> + * 'cmap' must be empty.
> + *
> + * Reasonable load factors lie between about 75% and 93%.
> + *
> + * This function is intended for testing.  Otherwise, it is best to use the
> + * default load factor (see CMAP_DEFAULT_MAX_LOAD). */
> +void
> +cmap_set_max_load_factor(struct cmap *cmap, uint32_t max_load)
> +{
> +    struct cmap_impl *impl = cmap_get_impl(cmap);
> +
> +    /* Not worth implementing rehashing for changing the max load factor. */
> +    ovs_assert(cmap_is_empty(cmap));
> +
> +    /* Force the max load factor into a reasonable range:
> +     *
> +     *    - At least one element per bucket.
> +     *
> +     *    - At most 93%, based on Fig. 1 in Erlingsson [4].
> +     */
> +    max_load = MAX(max_load, UINT32_MAX / CMAP_K + 1);
> +    max_load = MIN(max_load, (uint32_t) (UINT32_MAX * .93));

Since this is intended for testing, how about asserting instead? You might get 
strange test results if ‘max_load’ is silently adjusted.
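
Something along these lines is what I have in mind (just a sketch, not
compiled):

    /* Fail loudly on out-of-range values instead of silently clamping, so
     * that a test always runs with exactly the load factor it asked for:
     * at least one element per bucket, at most 93% (Fig. 1 in Erlingsson [4]). */
    ovs_assert(max_load >= UINT32_MAX / CMAP_K + 1);
    ovs_assert(max_load <= (uint32_t) (UINT32_MAX * .93));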

> +    impl->max_load = max_load;
> +    impl->max_n = calc_max_n(impl->mask, max_load);
> +}
> +

…

> +static bool
> +cmap_insert_dup(struct cmap_impl *impl, struct cmap_node *new_node, uint32_t h)
> +{
> +    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
> +    int i;
> +
> +    for (i = 0; i < CMAP_K; i++) {
> +        struct cmap_node *node = b->nodes[i];
> +        if (node && b->hashes[i] == h && node->hash == new_node->hash) {
> +            struct cmap_node *p;
> +
> +            p = new_node;
> +            while (p->next) {
> +                p = p->next;
> +            }

In what cases does ‘new_node’ have a chain of nodes hanging on it?

> +            p->next = node;
> +            cmap_set_bucket(b, i, new_node, h);
> +

I guess this is cheaper than scanning to the end of the duplicates and changing 
the NULL pointer to point to the ‘new_node’.

> +            return true;
> +        }
> +    }
> +    return false;
> +}
> +
> +static void
> +cmap_check_hashes(struct cmap_impl *impl)
> +{
> +    struct cmap_bucket *b;
> +
> +    return;
> +    for (b = impl->buckets; b <= &impl->buckets[impl->mask]; b++) {
> +        int i;
> +
> +        for (i = 0; i < CMAP_K; i++) {
> +            struct cmap_node *node = b->nodes[i];
> +            uint32_t h = b->hashes[i];
> +
> +            ovs_assert(!node || (h & impl->mask) == b - impl->buckets);
> +        }
> +    }
> +}
> +

This is only for debugging? Might the calls to this affect the performance 
numbers you reported?
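
If it is, one way to keep the check around without paying for it in normal
builds would be something like this (sketch only; CMAP_DEBUG is a made-up
build flag):

/* Sketch: rename the function above to cmap_check_hashes__() and compile the
 * calls out unless CMAP_DEBUG is defined. */
#ifdef CMAP_DEBUG
#define cmap_check_hashes(IMPL) cmap_check_hashes__(IMPL)
#else
#define cmap_check_hashes(IMPL) ((void) (IMPL))
#endif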

> +static bool
> +cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
> +{
> +    enum { MAX_PATH = 4 };
> +    struct cmap_path {
> +        struct cmap_bucket *end;
> +        int start;

Is this 0 or 1 only?

> +        int n;

And this is in [0, MAX_PATH)?

> +        uint8_t slots[MAX_PATH];
> +    };
> +

Whose slots do the ‘slots’ refer to?

> +    uint32_t h1 = rehash(impl, hash);
> +    uint32_t h2 = other_hash(h1);
> +
> +    enum { MAX_QUEUE = 512 };
> +    BUILD_ASSERT_DECL(IS_POW2(MAX_QUEUE));
> +    struct cmap_path queue[MAX_QUEUE];
> +    unsigned int head = 0, tail = 0;
> +
> +    /* Handle the easy cases. */
> +    if (cmap_insert_dup(impl, node, h1) ||
> +        cmap_insert_dup(impl, node, h2) ||
> +        cmap_insert_bucket(impl, node, h1) ||
> +        cmap_insert_bucket(impl, node, h2)) {
> +        return true;
> +    }
> +

Do you know how much faster this would be if we did not allow duplicates, or if 
we knew there were no duplicates?

> +    queue[head].end = &impl->buckets[h1 & impl->mask];
> +    queue[head].start = 0;
> +    queue[head].n = 0;
> +    head++;
> +
> +    if ((h1 & impl->mask) != (h2 & impl->mask)) {
> +        queue[head].end = &impl->buckets[h2 & impl->mask];
> +        queue[head].start = 1;
> +        queue[head].n = 0;
> +        head++;
> +    }
> +

/* ‘queue’ is full when it has room for fewer than the CMAP_K entries that 
might be added on each iteration. */

> +    while (head - tail > 0 && head - tail < MAX_QUEUE - CMAP_K) {
> +        const struct cmap_path *p = &queue[tail++ & (MAX_QUEUE - 1)];
> +        struct cmap_bucket *b = p->end;
> +        int i;
> +

/* Check if any of the nodes in ‘*b’ can be moved to their alternate bucket, 
thus making space for ‘node’. */

> +        for (i = 0; i < CMAP_K; i++) {
> +            struct cmap_bucket *b2;
> +            int j;
> +
> +            b2 = &impl->buckets[other_hash(b->hashes[i]) & impl->mask];
> +            j = cmap_find_empty_slot(b2);
> +            if (j >= 0) {

/* Move nodes along the path found to make room for ‘node’. */

> +                struct cmap_bucket *buckets[MAX_PATH + 2];
> +                int slots[MAX_PATH + 2];
> +                uint32_t h = p->start ? h2 : h1;
> +                int k;
> +
> +                for (k = 0; k < p->n; k++) {
> +                    slots[k] = p->slots[k];
> +                }
> +                slots[p->n] = i;
> +                slots[p->n + 1] = j;
> +

You lost me here. A few comments on how the path entries and the slots work 
would be nice.

> +                buckets[0] = &impl->buckets[h & impl->mask];
> +                for (k = 0; k <= p->n; k++) {
> +                    buckets[k + 1] =
> +                        &impl->buckets[other_hash(buckets[k]->hashes[slots[k]]) & impl->mask];
> +                }
> +
> +                cmap_check_hashes(impl);

Maybe this check should be made compile-time conditional?

> +                for (k = p->n + 1; k > 0; k--) {
> +                    int slot = slots[k - 1];
> +
> +                    cmap_set_bucket(buckets[k], slots[k],
> +                                    buckets[k - 1]->nodes[slot],
> +                                    other_hash(buckets[k - 1]->hashes[slot]));
> +                    cmap_check_hashes(impl);

Ditto.

> +                }
> +                cmap_set_bucket(buckets[0], slots[0], node, h);
> +                cmap_check_hashes(impl);
> +

This too?

> +                return true;
> +            }
> +

/* ‘b2’ was full; add it to the path to be explored in a later iteration. */

> +            if (b2 != b && p->n < MAX_PATH) {
> +                struct cmap_path *p2 = &queue[head++ & (MAX_QUEUE - 1)];
> +
> +                *p2 = *p;
> +                p2->end = b2;
> +                p2->slots[p2->n++] = i;
> +            }
> +        }
> +    }
> +
> +    return false;
> +}
> +
> +/* Inserts 'node', with the given 'hash', into 'cmap'.  The caller must ensure
> + * that 'cmap' cannot change concurrently (from another thread).  If duplicates
> + * are undesirable, the caller must have already verified that 'cmap' does not
> + * contain a duplicate of 'node'. */
> +void
> +cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash)
> +{
> +    struct cmap_impl *impl = cmap_get_impl(cmap);
> +
> +    node->hash = hash;
> +    node->next = NULL;
> +
> +    if (OVS_UNLIKELY(impl->n >= impl->max_n)) {
> +        impl = cmap_rehash(cmap, (impl->mask << 1) | 1);
> +    }
> +
> +    while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) {
> +        impl = cmap_rehash(cmap, impl->mask);
> +    }
> +    impl->n++;
> +}
> +
> +static bool
> +cmap_remove__(struct cmap_impl *impl, struct cmap_node *node, uint32_t h)
> +{
> +    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
> +    int slot;
> +
> +    slot = cmap_find_slot(b, node->hash, h);
> +    if (slot < 0) {
> +        return false;
> +    }
> +
> +    if (b->nodes[slot] == node) {
> +        cmap_set_bucket(b, slot, node->next, h);
> +    } else {
> +        struct cmap_node **prev;
> +
> +        prev = &b->nodes[slot]->next;
> +        while (*prev != node) {
> +            prev = &(*prev)->next;
> +        }
> +        *prev = node->next;
> +        atomic_thread_fence(memory_order_acq_rel);

I would benefit from a comment on the fence here (why is it needed?).  Would 
there need to be a corresponding fence on the iteration side when reading the 
next pointer?
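
For example, if a pairing barrier is needed on the reader side, I would
imagine it looking roughly like this C11 sketch (types and names are made up
for illustration; this is not the patch's code):

#include <stdatomic.h>

struct node {
    int data;
    struct node *_Atomic next;   /* Hypothetical atomic chain pointer. */
};

/* Sketch: an acquire load of the chain pointer pairs with the writer's
 * release ordering, so the contents of the node it points to are fully
 * visible before the iterator dereferences the pointer it just read. */
static struct node *
iter_next(struct node *n)
{
    return atomic_load_explicit(&n->next, memory_order_acquire);
}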

> +    }
> +    return true;
> +}
> +
> 
…

> diff --git a/lib/cmap.h b/lib/cmap.h
> new file mode 100644
> index 0000000..e51f8b8
> --- /dev/null
> +++ b/lib/cmap.h

...

> +
> +void cmap_cursor_init(struct cmap_cursor *, const struct cmap *);
> +struct cmap_node *cmap_cursor_next(struct cmap_cursor *,
> +                                   const struct cmap_node *);
> +
> +/* Another, less preferred, form of iteration. */

When should this be used, then?

> +struct cmap_position {
> +    unsigned int bucket;
> +    unsigned int entry;
> +    unsigned int offset;
> +};
> +
> +struct cmap_node *cmap_next_position(const struct cmap *,
> +                                     struct cmap_position *);
> +
> +#endif /* cmap.h */
> diff --git a/lib/ovs-thread.h b/lib/ovs-thread.h
> index 180b66f..68db71f 100644
> --- a/lib/ovs-thread.h
> +++ b/lib/ovs-thread.h
> @@ -40,7 +40,7 @@ struct OVS_LOCKABLE ovs_mutex {
> 
> #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
> #define OVS_ADAPTIVE_MUTEX_INITIALIZER                  \
> -    { PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, NULL }
> +    { PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }

This was in a separate patch that was already pushed?

> #else
> #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
> #endif
> diff --git a/lib/util.h b/lib/util.h
> index 743b9fe..a0670f1 100644
> --- a/lib/util.h
> +++ b/lib/util.h
> @@ -278,6 +278,11 @@ char *xasprintf(const char *format, ...) PRINTF_FORMAT(1, 2) MALLOC_LIKE;
> char *xvasprintf(const char *format, va_list) PRINTF_FORMAT(1, 0) MALLOC_LIKE;
> void *x2nrealloc(void *p, size_t *n, size_t s);
> 
> +/* This system's cache line size, in bytes.
> + * Being wrong hurts performance but not correctness. */
> +#define CACHE_LINE_SIZE 64
> +BUILD_ASSERT_DECL(IS_POW2(CACHE_LINE_SIZE));
> +

We already have this in util.h, earlier in the file. It was added by commit 
e2051008 (util: Move CACHE_LINE_SIZE here.). It seems I already accidentally 
added a second define in commit 124f09c9 (lib: Add prefetch support (for GCC)), 
so this would be a third one :-)

> void *xmalloc_cacheline(size_t) MALLOC_LIKE;
> void *xzalloc_cacheline(size_t) MALLOC_LIKE;
> void free_cacheline(void *);
…

  Jarno
