Hi,

On 2021-07-19 15:20:54 +0900, Masahiko Sawada wrote:
> BTW is the implementation of the radix tree approach available
> somewhere? If so I'd like to experiment with that too.
>
> >
> > I have toyed with implementing adaptively sized radix nodes, as
> > proposed in https://db.in.tum.de/~leis/papers/ART.pdf - but haven't
> > gotten it quite working.
>
> That seems a promising approach.
I've since implemented some, but not all, of the ideas of that paper
(adaptive node sizes, but not the tree compression pieces).

E.g. for

select prepare(
    1000000, -- max block
    20, -- # of dead tuples per page
    10, -- dead tuples interval within a page
    1 -- page interval
);

        attach    size      shuffled    ordered
array    69 ms    120 MB    84.87 s      8.66 s
intset  173 ms     65 MB    68.82 s     11.75 s
rtbm    201 ms     67 MB    11.54 s      1.35 s
tbm     232 ms    100 MB     8.33 s      1.26 s
vtbm    162 ms     58 MB    10.01 s      1.22 s
radix    88 ms     42 MB    11.49 s      1.67 s

and for

select prepare(
    1000000, -- max block
    10, -- # of dead tuples per page
    1, -- dead tuples interval within a page
    1 -- page interval
);

        attach    size      shuffled    ordered
array    24 ms     60 MB    3.74 s       1.02 s
intset   97 ms     49 MB    3.14 s       0.75 s
rtbm    138 ms     36 MB    0.41 s       0.14 s
tbm     198 ms    101 MB    0.41 s       0.14 s
vtbm    118 ms     27 MB    0.39 s       0.12 s
radix    33 ms     10 MB    0.28 s       0.10 s

(this is an almost unfairly good case for radix)

I'm running out of time to format the results of the other test cases
before I have to run, unfortunately. radix uses 42 MB in both test
case 3 and test case 4.

The radix tree code isn't good right now - a ridiculous amount of
duplication etc. The naming clearly shows its origins as a buffer
mapping radix tree...

Currently, in a bunch of the cases, 20% of the time is spent in
radix_reaped(). If I move that into radix.c and allow bfm_lookup() to
be inlined, the overhead is reduced (see the sketch below). rtbm, for
example, essentially already does that, because it does the splitting
of the ItemPointer in rtbm.c.

I've attached my current patches against your tree.
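Roughly along these lines - an untested sketch, not part of the
attached patches (bfm_reaped() and the key encoding here are made up
for illustration; bfm_walk() is the always-inline static helper in
radix.c, which is why the probe has to live there):

static inline bool
bfm_reaped(bfm_tree *root, ItemPointer tid)
{
	/* illustrative key encoding only - bdbench's real encoding may differ */
	uint64		key = ((uint64) ItemPointerGetBlockNumber(tid) << 16) |
		ItemPointerGetOffsetNumber(tid);
	bfm_tree_node *node;
	bfm_value_type val;

	/* defined in radix.c, so the always-inline bfm_walk() is flattened here */
	return bfm_walk(root, &node, &val, key);
}

Greetings,

Andres Freund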
>From 5dfbe02000aefd3e085bdea0ec809247e1fb71b3 Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Mon, 19 Jul 2021 16:03:28 -0700 Subject: [PATCH 1/3] Fix build warnings. --- bdbench/bdbench.c | 2 +- bdbench/rtbm.c | 4 ++-- bdbench/vtbm.c | 10 +++++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bdbench/bdbench.c b/bdbench/bdbench.c index 800567d..1df5c53 100644 --- a/bdbench/bdbench.c +++ b/bdbench/bdbench.c @@ -655,7 +655,7 @@ _bench(LVTestType *lvtt) fclose(f); #endif - elog(NOTICE, "\"%s\": dead tuples %lu, index tuples %lu, mathed %d, mem %zu", + elog(NOTICE, "\"%s\": dead tuples %lu, index tuples %lu, matched %d, mem %zu", lvtt->name, lvtt->dtinfo.nitems, IndexTids_cache->dtinfo.nitems, diff --git a/bdbench/rtbm.c b/bdbench/rtbm.c index 025d2a9..eac277a 100644 --- a/bdbench/rtbm.c +++ b/bdbench/rtbm.c @@ -449,9 +449,9 @@ dump_entry(RTbm *rtbm, DtEntry *entry) } } - elog(NOTICE, "%s (offset %d len %d)", + elog(NOTICE, "%s (offset %llu len %d)", str.data, - entry->offset, len); + (long long unsigned) entry->offset, len); } static int diff --git a/bdbench/vtbm.c b/bdbench/vtbm.c index c59d6e1..63320f5 100644 --- a/bdbench/vtbm.c +++ b/bdbench/vtbm.c @@ -72,7 +72,8 @@ vtbm_add_tuples(VTbm *vtbm, const BlockNumber blkno, DtEntry *entry; bool found; char oldstatus; - int wordnum, bitnum; + int wordnum = 0; + int bitnum; entry = dttable_insert(vtbm->dttable, blkno, &found); Assert(!found); @@ -216,8 +217,10 @@ vtbm_dump(VTbm *vtbm) vtbm->bitmap_size, vtbm->npages); for (int i = 0; i < vtbm->npages; i++) { + char *bitmap; + entry = entries[i]; - char *bitmap = &(vtbm->bitmap[entry->offset]); + bitmap = &(vtbm->bitmap[entry->offset]); appendStringInfo(&str, "[%5d] : ", entry->blkno); for (int off = 0; off < entry->len; off++) @@ -239,6 +242,7 @@ vtbm_dump_blk(VTbm *vtbm, BlockNumber blkno) { DtEntry *entry; StringInfoData str; + char *bitmap; initStringInfo(&str); @@ -252,7 +256,7 @@ vtbm_dump_blk(VTbm *vtbm, BlockNumber blkno) return; } - char *bitmap = &(vtbm->bitmap[entry->offset]); + bitmap = &(vtbm->bitmap[entry->offset]); appendStringInfo(&str, "[%5d] : ", entry->blkno); for (int off = 1; off < entry->len; off++) -- 2.32.0.rc2
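To make the fanout-8 scheme in the radix tree patch below concrete:
with BFM_FANOUT 8 a key is consumed 8 bits per tree level, starting at
the highest multiple of 8 bits the key needs - the same shift logic as
in bfm_set_empty()/bfm_walk(). A standalone toy, assuming gcc/clang
for __builtin_clzll() (the patch itself uses pg_leftmost_one_pos64()):

#include <stdint.h>
#include <stdio.h>

#define FANOUT	8
#define MASK	((1 << FANOUT) - 1)

int
main(void)
{
	uint64_t	key = 0xDEADBEEF;
	int			shift = (key == 0) ? 0 :
		(63 - __builtin_clzll(key)) / FANOUT * FANOUT;

	/* prints chunks 0xDE, 0xAD, 0xBE, 0xEF, top level first */
	for (; shift >= 0; shift -= FANOUT)
		printf("shift %2d: chunk 0x%02X\n",
			   shift, (unsigned) ((key >> shift) & MASK));

	return 0;
}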
>From 5ba05ffad4a9605a6fb5a24fe625542aee226ec8 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 19 Jul 2021 16:04:55 -0700
Subject: [PATCH 2/3] Add radix tree.

---
 bdbench/radix.c | 3088 +++++++++++++++++++++++++++++++++++++++++++++++
 bdbench/radix.h |   76 ++
 2 files changed, 3164 insertions(+)
 create mode 100644 bdbench/radix.c
 create mode 100644 bdbench/radix.h

diff --git a/bdbench/radix.c b/bdbench/radix.c
new file mode 100644
index 0000000..c7061f0
--- /dev/null
+++ b/bdbench/radix.c
@@ -0,0 +1,3088 @@
+/*
+ *
+ */
+
+#include "postgres.h"
+
+#include "radix.h"
+
+#include "lib/stringinfo.h"
+#include "port/pg_bitutils.h"
+#include "utils/memutils.h"
+
+
+/*
+ * How many bits are encoded in one tree level.
+ *
+ * Linux uses 6, ART uses 8. In a non-adaptive radix tree the disadvantage of
+ * a higher fanout is increased memory usage - but the adaptive node size
+ * addresses that to a good degree. Using a common multiple of 8 (i.e. bits
+ * in a byte) has the advantage of making it easier to eventually support
+ * variable length data. Therefore go with 8 for now.
+ */
+#define BFM_FANOUT 8
+
+#define BFM_MAX_CLASS (1<<BFM_FANOUT)
+
+#define BFM_MASK ((1 << BFM_FANOUT) - 1)
+
+
+/*
+ * Base type for all node types.
+ */
+struct bfm_tree_node_inner;
+typedef struct bfm_tree_node
+{
+	/*
+	 * Size class of entry (stored as uint8 instead of bfm_tree_node_kind to
+	 * save space).
+	 *
+	 * XXX: For efficiency in random access cases it'd be a good idea to
+	 * encode the kind of a node in the pointer value of upper nodes, in the
+	 * low bits. Being able to do the node type dispatch during traversal
+	 * before the memory for the node has been fetched from memory would
+	 * likely improve performance significantly. But that'd require at least
+	 * 8 byte alignment, which we don't currently guarantee on all platforms.
+	 */
+	uint8 kind;
+
+	/*
+	 * Shift indicates which part of the key space is represented by this
+	 * node. I.e. the key is shifted by `shift` and the lowest BFM_FANOUT bits
+	 * are then represented in chunk.
+	 */
+	uint8 node_shift;
+	uint8 node_chunk;
+
+	/*
+	 * Number of children - currently uint16 to be able to indicate 256
+	 * children at a fanout of 8.
+	 */
+	uint16 count;
+
+	/* FIXME: Right now there are always unused bytes here :( */
+
+	/*
+	 * FIXME: could be removed by using a stack while walking down to deleted
+	 * node.
+	 */
+	struct bfm_tree_node_inner *parent;
+} bfm_tree_node;
+
+/*
+ * Base type for all inner nodes.
+ */
+typedef struct bfm_tree_node_inner
+{
+	bfm_tree_node b;
+} bfm_tree_node_inner;
+
+/*
+ * Base type for all leaf nodes.
+ */
+typedef struct bfm_tree_node_leaf
+{
+	bfm_tree_node b;
+} bfm_tree_node_leaf;
+
+
+/*
+ * Size classes.
+ *
+ * To reduce memory usage compared to a simple radix tree with a fixed fanout
+ * we use adaptive node sizes, with different storage methods for different
+ * numbers of elements.
+ *
+ * FIXME: These are currently not well chosen. To reduce memory fragmentation
+ * smaller classes should optimally fit neatly into the next larger class
+ * (except perhaps at the lowest end). Right now it's
+ * 32->56->160->304->1296->2064/2096 bytes for inner/leaf nodes, repeatedly
+ * just above a power of 2, leading to large amounts of allocator padding with
+ * aset.c. Hence the use of slab.
+ *
+ * FIXME: Duplication.
+ *
+ * XXX: Consider implementing path compression, it reduces worst case memory
+ * usage substantially. I.e. collapse sequences of nodes with just one child
+ * into one node. That would make it feasible to use this datastructure for
+ * wide keys. Gut feeling: When compressing inner nodes a limited number of
+ * tree levels should be skippable to keep nodes of a constant size. But when
+ * collapsing to leaf nodes it is likely worth making them variable width,
+ * it's such a common scenario (a sparse key will always end with such a chain
+ * of nodes).
+ */
+
+/*
+ * Inner node size classes.
+ */
+typedef struct bfm_tree_node_inner_1
+{
+	bfm_tree_node_inner b;
+
+	/* single child, for key chunk */
+	uint8 chunk;
+	bfm_tree_node *slot;
+} bfm_tree_node_inner_1;
+
+typedef struct bfm_tree_node_inner_4
+{
+	bfm_tree_node_inner b;
+
+	/* four children, for key chunks */
+	uint8 chunks[4];
+	bfm_tree_node *slots[4];
+} bfm_tree_node_inner_4;
+
+typedef struct bfm_tree_node_inner_16
+{
+	bfm_tree_node_inner b;
+
+	/* sixteen children, for key chunks */
+	uint8 chunks[16];
+	bfm_tree_node *slots[16];
+} bfm_tree_node_inner_16;
+
+#define BFM_TREE_NODE_32_INVALID 0xFF
+typedef struct bfm_tree_node_inner_32
+{
+	bfm_tree_node_inner b;
+
+	/*
+	 * 32 children. Offsets are indexed by the key chunk and point into
+	 * ->slots. An offset of BFM_TREE_NODE_32_INVALID indicates a
+	 * non-existing entry.
+	 *
+	 * XXX: It'd be nice to shrink the offsets array to use fewer bits - we
+	 * only need to index into an array of 32 entries. But 32 offsets already
+	 * is 5 bits, making a simple & fast encoding nontrivial.
+	 */
+	uint8 chunks[32];
+	bfm_tree_node *slots[32];
+} bfm_tree_node_inner_32;
+
+#define BFM_TREE_NODE_128_INVALID 0xFF
+typedef struct bfm_tree_node_inner_128
+{
+	bfm_tree_node_inner b;
+
+	uint8 offsets[BFM_MAX_CLASS];
+	bfm_tree_node *slots[128];
+} bfm_tree_node_inner_128;
+
+typedef struct bfm_tree_node_inner_max
+{
+	bfm_tree_node_inner b;
+	bfm_tree_node *slots[BFM_MAX_CLASS];
+} bfm_tree_node_inner_max;
+
+
+/*
+ * Leaf node size classes.
+ *
+ * Currently these are separate from inner node size classes for two main
+ * reasons:
+ *
+ * 1) the value type might be different than something fitting into a pointer
+ *    width type
+ * 2) Need to represent non-existing values in a key-type independent way.
+ *
+ * 1) is clearly worth being concerned about, but it's not clear 2) is as
+ * good. It might be better to just indicate non-existing entries the same
+ * way in inner nodes.
+ */ + +typedef struct bfm_tree_node_leaf_1 +{ + bfm_tree_node_leaf b; + uint8 chunk; + bfm_value_type value; +} bfm_tree_node_leaf_1; + +#define BFM_TREE_NODE_LEAF_4_INVALID 0xFFFF +typedef struct bfm_tree_node_leaf_4 +{ + bfm_tree_node_leaf b; + uint8 chunks[4]; + bfm_value_type values[4]; +} bfm_tree_node_leaf_4; + +#define BFM_TREE_NODE_LEAF_16_INVALID 0xFFFF +typedef struct bfm_tree_node_leaf_16 +{ + bfm_tree_node_leaf b; + uint8 chunks[16]; + bfm_value_type values[16]; +} bfm_tree_node_leaf_16; + +typedef struct bfm_tree_node_leaf_32 +{ + bfm_tree_node_leaf b; + uint8 chunks[32]; + bfm_value_type values[32]; +} bfm_tree_node_leaf_32; + +typedef struct bfm_tree_node_leaf_128 +{ + bfm_tree_node_leaf b; + uint8 offsets[BFM_MAX_CLASS]; + bfm_value_type values[128]; +} bfm_tree_node_leaf_128; + +typedef struct bfm_tree_node_leaf_max +{ + bfm_tree_node_leaf b; + uint8 set[BFM_MAX_CLASS / (sizeof(uint8) * BITS_PER_BYTE)]; + bfm_value_type values[BFM_MAX_CLASS]; +} bfm_tree_node_leaf_max; + + +typedef struct bfm_tree_size_class_info +{ + const char *const name; + int elements; + size_t size; +} bfm_tree_size_class_info; + +const bfm_tree_size_class_info inner_class_info[] = +{ + [BFM_KIND_1] = {"1", 1, sizeof(bfm_tree_node_inner_1)}, + [BFM_KIND_4] = {"4", 4, sizeof(bfm_tree_node_inner_4)}, + [BFM_KIND_16] = {"16", 16, sizeof(bfm_tree_node_inner_16)}, + [BFM_KIND_32] = {"32", 32, sizeof(bfm_tree_node_inner_32)}, + [BFM_KIND_128] = {"128", 128, sizeof(bfm_tree_node_inner_128)}, + [BFM_KIND_MAX] = {"max", BFM_MAX_CLASS, sizeof(bfm_tree_node_inner_max)}, +}; + +const bfm_tree_size_class_info leaf_class_info[] = +{ + [BFM_KIND_1] = {"1", 1, sizeof(bfm_tree_node_leaf_1)}, + [BFM_KIND_4] = {"4", 4, sizeof(bfm_tree_node_leaf_4)}, + [BFM_KIND_16] = {"16", 16, sizeof(bfm_tree_node_leaf_16)}, + [BFM_KIND_32] = {"32", 32, sizeof(bfm_tree_node_leaf_32)}, + [BFM_KIND_128] = {"128", 128, sizeof(bfm_tree_node_leaf_128)}, + [BFM_KIND_MAX] = {"max", BFM_MAX_CLASS, sizeof(bfm_tree_node_leaf_max)}, +}; + +static void * +bfm_alloc_node(bfm_tree *root, bool inner, bfm_tree_node_kind kind, size_t size) +{ + bfm_tree_node *node; + +#ifdef BFM_USE_SLAB + if (inner) + node = (bfm_tree_node *) MemoryContextAlloc(root->inner_slabs[kind], size); + else + node = (bfm_tree_node *) MemoryContextAlloc(root->leaf_slabs[kind], size); +#elif defined(BFM_USE_OS) + node = (bfm_tree_node *) malloc(size); +#else + node = (bfm_tree_node *) MemoryContextAlloc(root->context, size); +#endif + + return node; +} + +static bfm_tree_node_inner * +bfm_alloc_inner(bfm_tree *root, bfm_tree_node_kind kind, size_t size) +{ + bfm_tree_node_inner *node; + + Assert(inner_class_info[kind].size == size); +#ifdef BFM_STATS + root->inner_nodes[kind]++; +#endif + + node = bfm_alloc_node(root, true, kind, size); + + memset(&node->b, 0, sizeof(node->b)); + node->b.kind = kind; + + return node; +} + +static bfm_tree_node_inner * +bfm_alloc_leaf(bfm_tree *root, bfm_tree_node_kind kind, size_t size) +{ + bfm_tree_node_inner *node; + + Assert(leaf_class_info[kind].size == size); +#ifdef BFM_STATS + root->leaf_nodes[kind]++; +#endif + + node = bfm_alloc_node(root, false, kind, size); + + memset(&node->b, 0, sizeof(node->b)); + node->b.kind = kind; + + return node; +} + + +static bfm_tree_node_inner_1 * +bfm_alloc_inner_1(bfm_tree *root) +{ + bfm_tree_node_inner_1 *node = + (bfm_tree_node_inner_1 *) bfm_alloc_inner(root, BFM_KIND_1, sizeof(*node)); + + return node; +} + +#define BFM_TREE_NODE_INNER_4_INVALID 0xFF +static bfm_tree_node_inner_4 * 
+bfm_alloc_inner_4(bfm_tree *root) +{ + bfm_tree_node_inner_4 *node = + (bfm_tree_node_inner_4 *) bfm_alloc_inner(root, BFM_KIND_4, sizeof(*node)); + + return node; +} + +#define BFM_TREE_NODE_INNER_16_INVALID 0xFF +static bfm_tree_node_inner_16 * +bfm_alloc_inner_16(bfm_tree *root) +{ + bfm_tree_node_inner_16 *node = + (bfm_tree_node_inner_16 *) bfm_alloc_inner(root, BFM_KIND_16, sizeof(*node)); + + return node; +} + +#define BFM_TREE_NODE_INNER_32_INVALID 0xFF +static bfm_tree_node_inner_32 * +bfm_alloc_inner_32(bfm_tree *root) +{ + bfm_tree_node_inner_32 *node = + (bfm_tree_node_inner_32 *) bfm_alloc_inner(root, BFM_KIND_32, sizeof(*node)); + + return node; +} + +static bfm_tree_node_inner_128 * +bfm_alloc_inner_128(bfm_tree *root) +{ + bfm_tree_node_inner_128 *node = + (bfm_tree_node_inner_128 *) bfm_alloc_inner(root, BFM_KIND_128, sizeof(*node)); + + memset(&node->offsets, BFM_TREE_NODE_128_INVALID, sizeof(node->offsets)); + + return node; +} + +static bfm_tree_node_inner_max * +bfm_alloc_inner_max(bfm_tree *root) +{ + bfm_tree_node_inner_max *node = + (bfm_tree_node_inner_max *) bfm_alloc_inner(root, BFM_KIND_MAX, sizeof(*node)); + + memset(&node->slots, 0, sizeof(node->slots)); + + return node; +} + +static bfm_tree_node_leaf_1 * +bfm_alloc_leaf_1(bfm_tree *root) +{ + bfm_tree_node_leaf_1 *node = + (bfm_tree_node_leaf_1 *) bfm_alloc_leaf(root, BFM_KIND_1, sizeof(*node)); + + return node; +} + +static bfm_tree_node_leaf_4 * +bfm_alloc_leaf_4(bfm_tree *root) +{ + bfm_tree_node_leaf_4 *node = + (bfm_tree_node_leaf_4 *) bfm_alloc_leaf(root, BFM_KIND_4, sizeof(*node)); + + return node; +} + +static bfm_tree_node_leaf_16 * +bfm_alloc_leaf_16(bfm_tree *root) +{ + bfm_tree_node_leaf_16 *node = + (bfm_tree_node_leaf_16 *) bfm_alloc_leaf(root, BFM_KIND_16, sizeof(*node)); + + return node; +} + +static bfm_tree_node_leaf_32 * +bfm_alloc_leaf_32(bfm_tree *root) +{ + bfm_tree_node_leaf_32 *node = + (bfm_tree_node_leaf_32 *) bfm_alloc_leaf(root, BFM_KIND_32, sizeof(*node)); + + return node; +} + +static bfm_tree_node_leaf_128 * +bfm_alloc_leaf_128(bfm_tree *root) +{ + bfm_tree_node_leaf_128 *node = + (bfm_tree_node_leaf_128 *) bfm_alloc_leaf(root, BFM_KIND_128, sizeof(*node)); + + memset(node->offsets, BFM_TREE_NODE_128_INVALID, sizeof(node->offsets)); + + return node; +} + +static bfm_tree_node_leaf_max * +bfm_alloc_leaf_max(bfm_tree *root) +{ + bfm_tree_node_leaf_max *node = + (bfm_tree_node_leaf_max *) bfm_alloc_leaf(root, BFM_KIND_MAX, sizeof(*node)); + + memset(node->set, 0, sizeof(node->set)); + + return node; +} + +static void +bfm_free_internal(bfm_tree *root, void *p) +{ +#if defined(BFM_USE_OS) + free(p); +#else + pfree(p); +#endif +} + +static void +bfm_free_inner(bfm_tree *root, bfm_tree_node_inner *node) +{ + Assert(node->b.node_shift != 0); + +#ifdef BFM_STATS + root->inner_nodes[node->b.kind]--; +#endif + + bfm_free_internal(root, node); +} + +static void +bfm_free_leaf(bfm_tree *root, bfm_tree_node_leaf *node) +{ + Assert(node->b.node_shift == 0); + +#ifdef BFM_STATS + root->leaf_nodes[node->b.kind]--; +#endif + + bfm_free_internal(root, node); +} + +#define BFM_LEAF_MAX_SET_OFFSET(i) (i / (sizeof(uint8) * BITS_PER_BYTE)) +#define BFM_LEAF_MAX_SET_BIT(i) (UINT64_C(1) << (i & ((sizeof(uint8) * BITS_PER_BYTE)-1))) + +static inline bool +bfm_leaf_max_isset(bfm_tree_node_leaf_max *node_max, uint32 i) +{ + return node_max->set[BFM_LEAF_MAX_SET_OFFSET(i)] & BFM_LEAF_MAX_SET_BIT(i); +} + +static inline void +bfm_leaf_max_set(bfm_tree_node_leaf_max *node_max, uint32 i) +{ + 
node_max->set[BFM_LEAF_MAX_SET_OFFSET(i)] |= BFM_LEAF_MAX_SET_BIT(i); +} + +static inline void +bfm_leaf_max_unset(bfm_tree_node_leaf_max *node_max, uint32 i) +{ + node_max->set[BFM_LEAF_MAX_SET_OFFSET(i)] &= ~BFM_LEAF_MAX_SET_BIT(i); +} + +static uint64 +bfm_maxval_shift(uint32 shift) +{ + uint32 maxshift = (sizeof(bfm_key_type) * BITS_PER_BYTE) / BFM_FANOUT * BFM_FANOUT; + + Assert(shift <= maxshift); + + if (shift == maxshift) + return UINT64_MAX; + + return (UINT64_C(1) << (shift + BFM_FANOUT)) - 1; +} + +static inline int +search_chunk_array_4_eq(uint8 *chunks, uint8 match, uint8 count) +{ + int index = -1; + + for (int i = 0; i < count; i++) + { + if (chunks[i] == match) + { + index = i; + break; + } + } + + return index; +} + +static inline int +search_chunk_array_4_le(uint8 *chunks, uint8 match, uint8 count) +{ + int index; + + for (index = 0; index < count; index++) + if (chunks[index] >= match) + break; + + return index; +} + + +#if defined(__SSE2__) +#include <emmintrin.h> // x86 SSE intrinsics +#endif + +static inline int +search_chunk_array_16_eq(uint8 *chunks, uint8 match, uint8 count) +{ +#if !defined(__SSE2__) || defined(USE_ASSERT_CHECKING) + int index = -1; +#endif + +#ifdef __SSE2__ + int index_sse; + __m128i spread_chunk = _mm_set1_epi8(match); + __m128i haystack = _mm_loadu_si128((__m128i_u*) chunks); + __m128i cmp=_mm_cmpeq_epi8(spread_chunk, haystack); + uint32_t bitfield=_mm_movemask_epi8(cmp); + + bitfield &= ((1<<count)-1); + + if (bitfield) + index_sse = __builtin_ctz(bitfield); + else + index_sse = -1; + +#endif + +#if !defined(__SSE2__) || defined(USE_ASSERT_CHECKING) + for (int i = 0; i < count; i++) + { + if (chunks[i] == match) + { + index = i; + break; + } + } + +#if defined(__SSE2__) + Assert(index_sse == index); +#endif + +#endif + +#if defined(__SSE2__) + return index_sse; +#else + return index; +#endif +} + +/* + * This is a bit more complicated than search_chunk_array_16_eq(), because + * until recently no unsigned uint8 comparison instruction existed on x86. So + * we need to play some trickery using _mm_min_epu8() to effectively get + * <=. There never will be any equal elements in the current uses, but that's + * what we get here... 
+ */ +static inline int +search_chunk_array_16_le(uint8 *chunks, uint8 match, uint8 count) +{ +#if !defined(__SSE2__) || defined(USE_ASSERT_CHECKING) + int index; +#endif + +#ifdef __SSE2__ + int index_sse; + __m128i spread_chunk = _mm_set1_epi8(match); + __m128i haystack = _mm_loadu_si128((__m128i_u*) chunks); + __m128i min = _mm_min_epu8(haystack, spread_chunk); + __m128i cmp = _mm_cmpeq_epi8(spread_chunk, min); + uint32_t bitfield=_mm_movemask_epi8(cmp); + + bitfield &= ((1<<count)-1); + + if (bitfield) + index_sse = __builtin_ctz(bitfield); + else + index_sse = count; +#endif + +#if !defined(__SSE2__) || defined(USE_ASSERT_CHECKING) + for (index = 0; index < count; index++) + if (chunks[index] >= match) + break; + +#if defined(__SSE2__) + Assert(index_sse == index); +#endif + +#endif + +#if defined(__SSE2__) + return index_sse; +#else + return index; +#endif +} + +#if defined(__AVX2__) +#include <immintrin.h> // x86 SSE intrinsics +#endif + +static inline int +search_chunk_array_32_eq(uint8 *chunks, uint8 match, uint8 count) +{ +#if !defined(__AVX2__) || defined(USE_ASSERT_CHECKING) + int index = -1; +#endif + +#ifdef __AVX2__ + int index_sse; + __m256i spread_chunk = _mm256_set1_epi8(match); + __m256i haystack = _mm256_loadu_si256((__m256i_u*) chunks); + __m256i cmp= _mm256_cmpeq_epi8(spread_chunk, haystack); + uint32_t bitfield = _mm256_movemask_epi8(cmp); + + bitfield &= ((UINT64_C(1)<<count)-1); + + if (bitfield) + index_sse = __builtin_ctz(bitfield); + else + index_sse = -1; + +#endif + +#if !defined(__AVX2__) || defined(USE_ASSERT_CHECKING) + for (int i = 0; i < count; i++) + { + if (chunks[i] == match) + { + index = i; + break; + } + } + +#if defined(__AVX2__) + Assert(index_sse == index); +#endif + +#endif + +#if defined(__AVX2__) + return index_sse; +#else + return index; +#endif +} + +/* + * This is a bit more complicated than search_chunk_array_16_eq(), because + * until recently no unsigned uint8 comparison instruction existed on x86. So + * we need to play some trickery using _mm_min_epu8() to effectively get + * <=. There never will be any equal elements in the current uses, but that's + * what we get here... 
+ */ +static inline int +search_chunk_array_32_le(uint8 *chunks, uint8 match, uint8 count) +{ +#if !defined(__AVX2__) || defined(USE_ASSERT_CHECKING) + int index; +#endif + +#ifdef __AVX2__ + int index_sse; + __m256i spread_chunk = _mm256_set1_epi8(match); + __m256i haystack = _mm256_loadu_si256((__m256i_u*) chunks); + __m256i min = _mm256_min_epu8(haystack, spread_chunk); + __m256i cmp=_mm256_cmpeq_epi8(spread_chunk, min); + uint32_t bitfield=_mm256_movemask_epi8(cmp); + + bitfield &= ((1<<count)-1); + + if (bitfield) + index_sse = __builtin_ctz(bitfield); + else + index_sse = count; +#endif + +#if !defined(__AVX2__) || defined(USE_ASSERT_CHECKING) + for (index = 0; index < count; index++) + if (chunks[index] >= match) + break; + +#if defined(__AVX2__) + Assert(index_sse == index); +#endif + +#endif + +#if defined(__AVX2__) + return index_sse; +#else + return index; +#endif +} + +static inline void +chunk_slot_array_grow(uint8 *source_chunks, bfm_tree_node **source_slots, + uint8 *target_chunks, bfm_tree_node **target_slots, + bfm_tree_node_inner *oldnode, bfm_tree_node_inner *newnode) +{ + memcpy(target_chunks, source_chunks, sizeof(source_chunks[0]) * oldnode->b.count); + memcpy(target_slots, source_slots, sizeof(source_slots[0]) * oldnode->b.count); + + for (int i = 0; i < oldnode->b.count; i++) + { + Assert(source_slots[i]->parent == oldnode); + source_slots[i]->parent = newnode; + } +} + +/* + * FIXME: Find a way to deduplicate with bfm_find_one_level_inner() + */ +pg_attribute_always_inline static bfm_tree_node * +bfm_find_one_level_inner(bfm_tree_node_inner * pg_restrict node, uint8 chunk) +{ + bfm_tree_node *slot = NULL; + + Assert(node->b.node_shift != 0); /* is inner node */ + + /* tell the compiler it doesn't need a bounds check */ + if ((bfm_tree_node_kind) node->b.kind > BFM_KIND_MAX) + pg_unreachable(); + + switch((bfm_tree_node_kind) node->b.kind) + { + case BFM_KIND_1: + { + bfm_tree_node_inner_1 *node_1 = + (bfm_tree_node_inner_1 *) node; + + Assert(node_1->b.b.count <= 1); + if (node_1->chunk == chunk) + slot = node_1->slot; + break; + } + + case BFM_KIND_4: + { + bfm_tree_node_inner_4 *node_4 = + (bfm_tree_node_inner_4 *) node; + int index; + + Assert(node_4->b.b.count <= 4); + index = search_chunk_array_4_eq(node_4->chunks, chunk, node_4->b.b.count); + + if (index != -1) + slot = node_4->slots[index]; + + break; + } + + case BFM_KIND_16: + { + bfm_tree_node_inner_16 *node_16 = + (bfm_tree_node_inner_16 *) node; + int index; + + Assert(node_16->b.b.count <= 16); + + index = search_chunk_array_16_eq(node_16->chunks, chunk, node_16->b.b.count); + if (index != -1) + slot = node_16->slots[index]; + + break; + } + + case BFM_KIND_32: + { + bfm_tree_node_inner_32 *node_32 = + (bfm_tree_node_inner_32 *) node; + int index; + + Assert(node_32->b.b.count <= 32); + + index = search_chunk_array_32_eq(node_32->chunks, chunk, node_32->b.b.count); + if (index != -1) + slot = node_32->slots[index]; + + break; + } + + case BFM_KIND_128: + { + bfm_tree_node_inner_128 *node_128 = + (bfm_tree_node_inner_128 *) node; + + Assert(node_128->b.b.count <= 128); + + if (node_128->offsets[chunk] != BFM_TREE_NODE_128_INVALID) + { + slot = node_128->slots[node_128->offsets[chunk]]; + } + break; + } + + case BFM_KIND_MAX: + { + bfm_tree_node_inner_max *node_max = + (bfm_tree_node_inner_max *) node; + + Assert(node_max->b.b.count <= BFM_MAX_CLASS); + slot = node_max->slots[chunk]; + + break; + } + } + + return slot; +} + +/* + * FIXME: Find a way to deduplicate with bfm_find_one_level_inner() + */ 
+pg_attribute_always_inline static bool +bfm_find_one_level_leaf(bfm_tree_node_leaf * pg_restrict node, uint8 chunk, bfm_value_type * pg_restrict valp) +{ + bool found = false; + + Assert(node->b.node_shift == 0); /* is leaf node */ + + /* tell the compiler it doesn't need a bounds check */ + if ((bfm_tree_node_kind) node->b.kind > BFM_KIND_MAX) + pg_unreachable(); + + switch((bfm_tree_node_kind) node->b.kind) + { + case BFM_KIND_1: + { + bfm_tree_node_leaf_1 *node_1 = + (bfm_tree_node_leaf_1 *) node; + + Assert(node_1->b.b.count <= 1); + if (node_1->b.b.count == 1 && + node_1->chunk == chunk) + { + *valp = node_1->value; + found = true; + break; + } + break; + } + + case BFM_KIND_4: + { + bfm_tree_node_leaf_4 *node_4 = + (bfm_tree_node_leaf_4 *) node; + int index; + + Assert(node_4->b.b.count <= 4); + index = search_chunk_array_4_eq(node_4->chunks, chunk, node_4->b.b.count); + + if (index != -1) + { + *valp = node_4->values[index]; + found = true; + } + break; + } + + case BFM_KIND_16: + { + bfm_tree_node_leaf_16 *node_16 = + (bfm_tree_node_leaf_16 *) node; + int index; + + Assert(node_16->b.b.count <= 16); + + index = search_chunk_array_16_eq(node_16->chunks, chunk, node_16->b.b.count); + if (index != -1) + { + *valp = node_16->values[index]; + found = true; + break; + } + break; + } + + case BFM_KIND_32: + { + bfm_tree_node_leaf_32 *node_32 = + (bfm_tree_node_leaf_32 *) node; + int index; + + Assert(node_32->b.b.count <= 32); + + index = search_chunk_array_32_eq(node_32->chunks, chunk, node_32->b.b.count); + if (index != -1) + { + *valp = node_32->values[index]; + found = true; + break; + } + break; + } + + case BFM_KIND_128: + { + bfm_tree_node_leaf_128 *node_128 = + (bfm_tree_node_leaf_128 *) node; + + Assert(node_128->b.b.count <= 128); + + if (node_128->offsets[chunk] != BFM_TREE_NODE_128_INVALID) + { + *valp = node_128->values[node_128->offsets[chunk]]; + found = true; + } + break; + } + + case BFM_KIND_MAX: + { + bfm_tree_node_leaf_max *node_max = + (bfm_tree_node_leaf_max *) node; + + Assert(node_max->b.b.count <= BFM_MAX_CLASS); + + if (bfm_leaf_max_isset(node_max, chunk)) + { + *valp = node_max->values[chunk]; + found = true; + } + break; + } + } + + return found; +} + +pg_attribute_always_inline static bool +bfm_walk(bfm_tree *root, bfm_tree_node **nodep, bfm_value_type *valp, uint64_t key) +{ + bfm_tree_node *rnode; + bfm_tree_node *cur; + uint8 chunk; + uint32 shift; + + rnode = root->rnode; + + /* can't be contained in the tree */ + if (!rnode || key > root->maxval) + { + *nodep = NULL; + return false; + } + + shift = rnode->node_shift; + chunk = (key >> shift) & BFM_MASK; + cur = rnode; + + while (shift > 0) + { + bfm_tree_node_inner *cur_inner; + bfm_tree_node *slot; + + Assert(cur->node_shift > 0); /* leaf nodes look different */ + Assert(cur->node_shift == shift); + + cur_inner = (bfm_tree_node_inner *) cur; + + slot = bfm_find_one_level_inner(cur_inner, chunk); + + if (slot == NULL) + { + *nodep = cur; + return false; + } + + Assert(&slot->parent->b == cur); + Assert(slot->node_chunk == chunk); + + cur = slot; + shift -= BFM_FANOUT; + chunk = (key >> shift) & BFM_MASK; + } + + Assert(cur->node_shift == shift && shift == 0); + + *nodep = cur; + + return bfm_find_one_level_leaf((bfm_tree_node_leaf*) cur, chunk, valp); +} + +/* + * Redirect parent pointers to oldnode by newnode, for the key chunk + * chunk. Used when growing or shrinking nodes. 
+ */ +static void +bfm_redirect(bfm_tree *root, bfm_tree_node *oldnode, bfm_tree_node *newnode, uint8 chunk) +{ + bfm_tree_node_inner *parent = oldnode->parent; + + if (parent == NULL) + { + Assert(root->rnode == oldnode); + root->rnode = newnode; + return; + } + + /* if there is a parent, it needs to be an inner node */ + Assert(parent->b.node_shift != 0); + + if ((bfm_tree_node_kind) parent->b.kind > BFM_KIND_MAX) + pg_unreachable(); + + switch((bfm_tree_node_kind) parent->b.kind) + { + case BFM_KIND_1: + { + bfm_tree_node_inner_1 *parent_1 = + (bfm_tree_node_inner_1 *) parent; + + Assert(parent_1->slot == oldnode); + Assert(parent_1->chunk == chunk); + + parent_1->slot = newnode; + break; + } + + case BFM_KIND_4: + { + bfm_tree_node_inner_4 *parent_4 = + (bfm_tree_node_inner_4 *) parent; + int index; + + Assert(parent_4->b.b.count <= 4); + index = search_chunk_array_4_eq(parent_4->chunks, chunk, parent_4->b.b.count); + Assert(index != -1); + + Assert(parent_4->slots[index] == oldnode); + parent_4->slots[index] = newnode; + + break; + } + + case BFM_KIND_16: + { + bfm_tree_node_inner_16 *parent_16 = + (bfm_tree_node_inner_16 *) parent; + int index; + + index = search_chunk_array_16_eq(parent_16->chunks, chunk, parent_16->b.b.count); + Assert(index != -1); + + Assert(parent_16->slots[index] == oldnode); + parent_16->slots[index] = newnode; + break; + } + + case BFM_KIND_32: + { + bfm_tree_node_inner_32 *parent_32 = + (bfm_tree_node_inner_32 *) parent; + int index; + + index = search_chunk_array_32_eq(parent_32->chunks, chunk, parent_32->b.b.count); + Assert(index != -1); + + Assert(parent_32->slots[index] == oldnode); + parent_32->slots[index] = newnode; + break; + } + + case BFM_KIND_128: + { + bfm_tree_node_inner_128 *parent_128 = + (bfm_tree_node_inner_128 *) parent; + uint8 offset; + + offset = parent_128->offsets[chunk]; + Assert(offset != BFM_TREE_NODE_128_INVALID); + Assert(parent_128->slots[offset] == oldnode); + parent_128->slots[offset] = newnode; + break; + } + + case BFM_KIND_MAX: + { + bfm_tree_node_inner_max *parent_max = + (bfm_tree_node_inner_max *) parent; + + Assert(parent_max->slots[chunk] == oldnode); + parent_max->slots[chunk] = newnode; + + break; + } + } +} + +static void +bfm_node_copy_common(bfm_tree *root, bfm_tree_node *oldnode, bfm_tree_node *newnode) +{ + newnode->node_shift = oldnode->node_shift; + newnode->node_chunk = oldnode->node_chunk; + newnode->count = oldnode->count; + newnode->parent = oldnode->parent; +} + +/* + * Insert child into node. + * + * NB: `node` cannot be used after this call anymore, it changes if the node + * needs to be grown to fit the insertion. 
+ * + * FIXME: Find a way to deduplicate with bfm_set_leaf() + */ +static void +bfm_insert_inner(bfm_tree *root, bfm_tree_node_inner *node, bfm_tree_node *child, int child_chunk) +{ + Assert(node->b.node_shift != 0); /* is inner node */ + + child->node_chunk = child_chunk; + + /* tell the compiler it doesn't need a bounds check */ + if ((bfm_tree_node_kind) node->b.kind > BFM_KIND_MAX) + pg_unreachable(); + + switch((bfm_tree_node_kind) node->b.kind) + { + case BFM_KIND_1: + { + bfm_tree_node_inner_1 *node_1 = + (bfm_tree_node_inner_1 *) node; + + Assert(node_1->b.b.count <= 1); + + if (unlikely(node_1->b.b.count == 1)) + { + /* grow node from 1 -> 4 */ + bfm_tree_node_inner_4 *newnode_4; + + newnode_4 = bfm_alloc_inner_4(root); + bfm_node_copy_common(root, &node->b, &newnode_4->b.b); + + Assert(node_1->slot->parent != NULL); + Assert(node_1->slot->parent == node); + newnode_4->chunks[0] = node_1->chunk; + newnode_4->slots[0] = node_1->slot; + node_1->slot->parent = &newnode_4->b; + + bfm_redirect(root, &node->b, &newnode_4->b.b, newnode_4->b.b.node_chunk); + bfm_free_inner(root, node); + node = &newnode_4->b; + } + else + { + child->parent = node; + node_1->chunk = child_chunk; + node_1->slot = child; + break; + } + } + /* fallthrough */ + + case BFM_KIND_4: + { + bfm_tree_node_inner_4 *node_4 = + (bfm_tree_node_inner_4 *) node; + + Assert(node_4->b.b.count <= 4); + if (unlikely(node_4->b.b.count == 4)) + { + /* grow node from 4 -> 16 */ + bfm_tree_node_inner_16 *newnode_16; + + newnode_16 = bfm_alloc_inner_16(root); + bfm_node_copy_common(root, &node->b, &newnode_16->b.b); + + chunk_slot_array_grow(node_4->chunks, node_4->slots, + newnode_16->chunks, newnode_16->slots, + &node_4->b, &newnode_16->b); + + bfm_redirect(root, &node->b, &newnode_16->b.b, newnode_16->b.b.node_chunk); + bfm_free_inner(root, node); + node = &newnode_16->b; + } + else + { + int insertpos; + + for (insertpos = 0; insertpos < node_4->b.b.count; insertpos++) + if (node_4->chunks[insertpos] >= child_chunk) + break; + + child->parent = node; + + memmove(&node_4->slots[insertpos + 1], + &node_4->slots[insertpos], + (node_4->b.b.count - insertpos) * sizeof(node_4->slots[0])); + memmove(&node_4->chunks[insertpos + 1], + &node_4->chunks[insertpos], + (node_4->b.b.count - insertpos) * sizeof(node_4->chunks[0])); + + node_4->chunks[insertpos] = child_chunk; + node_4->slots[insertpos] = child; + break; + } + } + /* fallthrough */ + + case BFM_KIND_16: + { + bfm_tree_node_inner_16 *node_16 = + (bfm_tree_node_inner_16 *) node; + + Assert(node_16->b.b.count <= 16); + if (unlikely(node_16->b.b.count == 16)) + { + /* grow node from 16 -> 32 */ + bfm_tree_node_inner_32 *newnode_32; + + newnode_32 = bfm_alloc_inner_32(root); + bfm_node_copy_common(root, &node->b, &newnode_32->b.b); + + chunk_slot_array_grow(node_16->chunks, node_16->slots, + newnode_32->chunks, newnode_32->slots, + &node_16->b, &newnode_32->b); + + bfm_redirect(root, &node->b, &newnode_32->b.b, newnode_32->b.b.node_chunk); + bfm_free_inner(root, node); + node = &newnode_32->b; + } + else + { + int insertpos; + + insertpos = search_chunk_array_16_le(node_16->chunks, child_chunk, node_16->b.b.count); + + child->parent = node; + + memmove(&node_16->slots[insertpos + 1], + &node_16->slots[insertpos], + (node_16->b.b.count - insertpos) * sizeof(node_16->slots[0])); + memmove(&node_16->chunks[insertpos + 1], + &node_16->chunks[insertpos], + (node_16->b.b.count - insertpos) * sizeof(node_16->chunks[0])); + + node_16->chunks[insertpos] = child_chunk; + 
node_16->slots[insertpos] = child; + break; + } + } + /* fallthrough */ + + case BFM_KIND_32: + { + bfm_tree_node_inner_32 *node_32 = + (bfm_tree_node_inner_32 *) node; + + Assert(node_32->b.b.count <= 32); + if (unlikely(node_32->b.b.count == 32)) + { + /* grow node from 32 -> 128 */ + bfm_tree_node_inner_128 *newnode_128; + + newnode_128 = bfm_alloc_inner_128(root); + bfm_node_copy_common(root, &node->b, &newnode_128->b.b); + + memcpy(newnode_128->slots, node_32->slots, sizeof(node_32->slots)); + + /* change parent pointers of children */ + for (int i = 0; i < 32; i++) + { + Assert(node_32->slots[i]->parent == node); + newnode_128->offsets[node_32->chunks[i]] = i; + node_32->slots[i]->parent = &newnode_128->b; + } + + bfm_redirect(root, &node->b, &newnode_128->b.b, newnode_128->b.b.node_chunk); + bfm_free_inner(root, node); + node = &newnode_128->b; + } + else + { + int insertpos; + + insertpos = search_chunk_array_32_le(node_32->chunks, child_chunk, node_32->b.b.count); + + child->parent = node; + + memmove(&node_32->slots[insertpos + 1], + &node_32->slots[insertpos], + (node_32->b.b.count - insertpos) * sizeof(node_32->slots[0])); + memmove(&node_32->chunks[insertpos + 1], + &node_32->chunks[insertpos], + (node_32->b.b.count - insertpos) * sizeof(node_32->chunks[0])); + + node_32->chunks[insertpos] = child_chunk; + node_32->slots[insertpos] = child; + break; + } + } + /* fallthrough */ + + case BFM_KIND_128: + { + bfm_tree_node_inner_128 *node_128 = + (bfm_tree_node_inner_128 *) node; + uint8 offset; + + Assert(node_128->b.b.count <= 128); + if (unlikely(node_128->b.b.count == 128)) + { + /* grow node from 128 -> max */ + bfm_tree_node_inner_max *newnode_max; + + newnode_max = bfm_alloc_inner_max(root); + bfm_node_copy_common(root, &node->b, &newnode_max->b.b); + + for (int i = 0; i < BFM_MAX_CLASS; i++) + { + uint8 offset = node_128->offsets[i]; + + if (offset == BFM_TREE_NODE_128_INVALID) + continue; + + Assert(node_128->slots[offset] != NULL); + Assert(node_128->slots[offset]->parent == node); + + node_128->slots[offset]->parent = &newnode_max->b; + + newnode_max->slots[i] = node_128->slots[offset]; + } + + bfm_redirect(root, &node->b, &newnode_max->b.b, newnode_max->b.b.node_chunk); + bfm_free_inner(root, node); + node = &newnode_max->b; + } + else + { + child->parent = node; + offset = node_128->b.b.count; + /* FIXME: this may overwrite entry if there had been deletions */ + node_128->offsets[child_chunk] = offset; + node_128->slots[offset] = child; + break; + } + } + /* fallthrough */ + + case BFM_KIND_MAX: + { + bfm_tree_node_inner_max *node_max = + (bfm_tree_node_inner_max *) node; + + Assert(node_max->b.b.count <= (BFM_MAX_CLASS - 1)); + Assert(node_max->slots[child_chunk] == NULL); + + child->parent = node; + node_max->slots[child_chunk] = child; + + break; + } + } + + node->b.count++; +} + +static bool pg_noinline +bfm_grow_leaf_1(bfm_tree *root, bfm_tree_node_leaf_1 *node_1, + int child_chunk, bfm_value_type val) +{ + /* grow node from 1 -> 4 */ + bfm_tree_node_leaf_4 *newnode_4; + + Assert(node_1->b.b.count == 1); + + newnode_4 = bfm_alloc_leaf_4(root); + bfm_node_copy_common(root, &node_1->b.b, &newnode_4->b.b); + + /* copy old & insert new value in the right order */ + if (child_chunk < node_1->chunk) + { + newnode_4->chunks[0] = child_chunk; + newnode_4->values[0] = val; + newnode_4->chunks[1] = node_1->chunk; + newnode_4->values[1] = node_1->value; + } + else + { + newnode_4->chunks[0] = node_1->chunk; + newnode_4->values[0] = node_1->value; + newnode_4->chunks[1] = 
child_chunk; + newnode_4->values[1] = val; + } + + newnode_4->b.b.count++; +#ifdef BFM_STATS + root->entries++; +#endif + + bfm_redirect(root, &node_1->b.b, &newnode_4->b.b, newnode_4->b.b.node_chunk); + bfm_free_leaf(root, &node_1->b); + + return false; +} + +static bool pg_noinline +bfm_grow_leaf_4(bfm_tree *root, bfm_tree_node_leaf_4 *node_4, + int child_chunk, bfm_value_type val) +{ + /* grow node from 4 -> 16 */ + bfm_tree_node_leaf_16 *newnode_16; + int insertpos; + + Assert(node_4->b.b.count == 4); + + newnode_16 = bfm_alloc_leaf_16(root); + bfm_node_copy_common(root, &node_4->b.b, &newnode_16->b.b); + + insertpos = search_chunk_array_4_le(node_4->chunks, child_chunk, node_4->b.b.count); + + /* first copy old elements ordering before */ + memcpy(&newnode_16->chunks[0], + &node_4->chunks[0], + sizeof(node_4->chunks[0]) * insertpos); + memcpy(&newnode_16->values[0], + &node_4->values[0], + sizeof(node_4->values[0]) * insertpos); + + /* then the new element */ + newnode_16->chunks[insertpos] = child_chunk; + newnode_16->values[insertpos] = val; + + /* and lastly the old elements after */ + memcpy(&newnode_16->chunks[insertpos + 1], + &node_4->chunks[insertpos], + (node_4->b.b.count-insertpos) * sizeof(node_4->chunks[0])); + memcpy(&newnode_16->values[insertpos + 1], + &node_4->values[insertpos], + (node_4->b.b.count-insertpos) * sizeof(node_4->values[0])); + + newnode_16->b.b.count++; +#ifdef BFM_STATS + root->entries++; +#endif + + bfm_redirect(root, &node_4->b.b, &newnode_16->b.b, newnode_16->b.b.node_chunk); + bfm_free_leaf(root, &node_4->b); + + return false; +} + +static bool pg_noinline +bfm_grow_leaf_16(bfm_tree *root, bfm_tree_node_leaf_16 *node_16, + int child_chunk, bfm_value_type val) +{ + /* grow node from 16 -> 32 */ + bfm_tree_node_leaf_32 *newnode_32; + int insertpos; + + Assert(node_16->b.b.count == 16); + + newnode_32 = bfm_alloc_leaf_32(root); + bfm_node_copy_common(root, &node_16->b.b, &newnode_32->b.b); + + insertpos = search_chunk_array_16_le(node_16->chunks, child_chunk, node_16->b.b.count); + + /* first copy old elements ordering before */ + memcpy(&newnode_32->chunks[0], + &node_16->chunks[0], + sizeof(node_16->chunks[0]) * insertpos); + memcpy(&newnode_32->values[0], + &node_16->values[0], + sizeof(node_16->values[0]) * insertpos); + + /* then the new element */ + newnode_32->chunks[insertpos] = child_chunk; + newnode_32->values[insertpos] = val; + + /* and lastly the old elements after */ + memcpy(&newnode_32->chunks[insertpos + 1], + &node_16->chunks[insertpos], + (node_16->b.b.count-insertpos) * sizeof(node_16->chunks[0])); + memcpy(&newnode_32->values[insertpos + 1], + &node_16->values[insertpos], + (node_16->b.b.count-insertpos) * sizeof(node_16->values[0])); + + newnode_32->b.b.count++; +#ifdef BFM_STATS + root->entries++; +#endif + + bfm_redirect(root, &node_16->b.b, &newnode_32->b.b, newnode_32->b.b.node_chunk); + bfm_free_leaf(root, &node_16->b); + + return false; +} + +static bool pg_noinline +bfm_grow_leaf_32(bfm_tree *root, bfm_tree_node_leaf_32 *node_32, + int child_chunk, bfm_value_type val) +{ + /* grow node from 32 -> 128 */ + bfm_tree_node_leaf_128 *newnode_128; + uint8 offset; + + newnode_128 = bfm_alloc_leaf_128(root); + bfm_node_copy_common(root, &node_32->b.b, &newnode_128->b.b); + + memcpy(newnode_128->values, node_32->values, sizeof(node_32->values)); + + for (int i = 0; i < 32; i++) + newnode_128->offsets[node_32->chunks[i]] = i; + + offset = newnode_128->b.b.count; + newnode_128->offsets[child_chunk] = offset; + 
newnode_128->values[offset] = val; + + newnode_128->b.b.count++; +#ifdef BFM_STATS + root->entries++; +#endif + + bfm_redirect(root, &node_32->b.b, &newnode_128->b.b, newnode_128->b.b.node_chunk); + bfm_free_leaf(root, &node_32->b); + + return false; +} + +static bool pg_noinline +bfm_grow_leaf_128(bfm_tree *root, bfm_tree_node_leaf_128 *node_128, + int child_chunk, bfm_value_type val) +{ + /* grow node from 128 -> max */ + bfm_tree_node_leaf_max *newnode_max; + int i; + + newnode_max = bfm_alloc_leaf_max(root); + bfm_node_copy_common(root, &node_128->b.b, &newnode_max->b.b); + + /* + * The bitmask manipulation is a surprisingly large portion of the + * overhead in the naive implementation. Unrolling the bit manipulation + * removes a lot of that overhead. + */ + i = 0; + for (int byte = 0; byte < BFM_MAX_CLASS / BITS_PER_BYTE; byte++) + { + uint8 bitmap = 0; + + for (int bit = 0; bit < BITS_PER_BYTE; bit++) + { + uint8 offset = node_128->offsets[i]; + + if (offset != BFM_TREE_NODE_128_INVALID) + { + bitmap |= 1 << bit; + newnode_max->values[i] = node_128->values[offset]; + } + + i++; + } + + newnode_max->set[byte] = bitmap; + } + + bfm_leaf_max_set(newnode_max, child_chunk); + newnode_max->values[child_chunk] = val; + newnode_max->b.b.count++; +#ifdef BFM_STATS + root->entries++; +#endif + + bfm_redirect(root, &node_128->b.b, &newnode_max->b.b, newnode_max->b.b.node_chunk); + bfm_free_leaf(root, &node_128->b); + + return false; +} + +/* + * Set key to val. Return false if entry doesn't yet exist, true if it did. + * + * See comments to bfm_insert_inner(). + */ +static bool pg_noinline +bfm_set_leaf(bfm_tree *root, bfm_key_type key, bfm_value_type val, + bfm_tree_node_leaf *node, int child_chunk) +{ + Assert(node->b.node_shift == 0); /* is leaf node */ + + /* tell the compiler it doesn't need a bounds check */ + if ((bfm_tree_node_kind) node->b.kind > BFM_KIND_MAX) + pg_unreachable(); + + switch((bfm_tree_node_kind) node->b.kind) + { + case BFM_KIND_1: + { + bfm_tree_node_leaf_1 *node_1 = + (bfm_tree_node_leaf_1 *) node; + + Assert(node_1->b.b.count <= 1); + + if (node_1->b.b.count == 1 && + node_1->chunk == child_chunk) + { + node_1->value = val; + return true; + } + else if (likely(node_1->b.b.count < 1)) + { + node_1->chunk = child_chunk; + node_1->value = val; + } + else + return bfm_grow_leaf_1(root, node_1, child_chunk, val); + + break; + } + + case BFM_KIND_4: + { + bfm_tree_node_leaf_4 *node_4 = + (bfm_tree_node_leaf_4 *) node; + int index; + + Assert(node_4->b.b.count <= 4); + + index = search_chunk_array_4_eq(node_4->chunks, child_chunk, node_4->b.b.count); + if (index != -1) + { + node_4->values[index] = val; + return true; + } + + if (likely(node_4->b.b.count < 4)) + { + int insertpos; + + insertpos = search_chunk_array_4_le(node_4->chunks, child_chunk, node_4->b.b.count); + + for (int i = node_4->b.b.count - 1; i >= insertpos; i--) + { + /* workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101481 */ +#ifdef __GNUC__ + __asm__(""); +#endif + node_4->values[i + 1] = node_4->values[i]; + node_4->chunks[i + 1] = node_4->chunks[i]; + } + + node_4->chunks[insertpos] = child_chunk; + node_4->values[insertpos] = val; + } + else + return bfm_grow_leaf_4(root, node_4, child_chunk, val); + + break; + } + + case BFM_KIND_16: + { + bfm_tree_node_leaf_16 *node_16 = + (bfm_tree_node_leaf_16 *) node; + int index; + + Assert(node_16->b.b.count <= 16); + + index = search_chunk_array_16_eq(node_16->chunks, child_chunk, node_16->b.b.count); + if (index != -1) + { + 
node_16->values[index] = val; + return true; + } + + if (likely(node_16->b.b.count < 16)) + { + int insertpos; + + insertpos = search_chunk_array_16_le(node_16->chunks, child_chunk, node_16->b.b.count); + + if (node_16->b.b.count > 16 || insertpos > 15) + pg_unreachable(); + + for (int i = node_16->b.b.count - 1; i >= insertpos; i--) + { + /* workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101481 */ +#ifdef __GNUC__ + __asm__(""); +#endif + node_16->values[i + 1] = node_16->values[i]; + node_16->chunks[i + 1] = node_16->chunks[i]; + } + node_16->chunks[insertpos] = child_chunk; + node_16->values[insertpos] = val; + } + else + return bfm_grow_leaf_16(root, node_16, child_chunk, val); + + break; + } + + case BFM_KIND_32: + { + bfm_tree_node_leaf_32 *node_32 = + (bfm_tree_node_leaf_32 *) node; + int index; + + Assert(node_32->b.b.count <= 32); + + index = search_chunk_array_32_eq(node_32->chunks, child_chunk, node_32->b.b.count); + if (index != -1) + { + node_32->values[index] = val; + return true; + } + + if (likely(node_32->b.b.count < 32)) + { + int insertpos; + + insertpos = search_chunk_array_32_le(node_32->chunks, child_chunk, node_32->b.b.count); + + if (node_32->b.b.count > 32 || insertpos > 31) + pg_unreachable(); + + for (int i = node_32->b.b.count - 1; i >= insertpos; i--) + { + /* workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101481 */ +#ifdef __GNUC__ + __asm__(""); +#endif + node_32->values[i + 1] = node_32->values[i]; + node_32->chunks[i + 1] = node_32->chunks[i]; + } + node_32->chunks[insertpos] = child_chunk; + node_32->values[insertpos] = val; + } + else + return bfm_grow_leaf_32(root, node_32, child_chunk, val); + + break; + } + + case BFM_KIND_128: + { + bfm_tree_node_leaf_128 *node_128 = + (bfm_tree_node_leaf_128 *) node; + uint8 offset; + + Assert(node_128->b.b.count <= 128); + + if (node_128->offsets[child_chunk] != BFM_TREE_NODE_128_INVALID) + { + offset = node_128->offsets[child_chunk]; + node_128->values[offset] = val; + + return true; + } + else if (likely(node_128->b.b.count < 128)) + { + offset = node_128->b.b.count; + node_128->offsets[child_chunk] = offset; + node_128->values[offset] = val; + } + else + return bfm_grow_leaf_128(root, node_128, child_chunk, val); + + break; + } + + case BFM_KIND_MAX: + { + bfm_tree_node_leaf_max *node_max = + (bfm_tree_node_leaf_max *) node; + + Assert(node_max->b.b.count <= (BFM_MAX_CLASS - 1)); + + if (bfm_leaf_max_isset(node_max, child_chunk)) + { + node_max->values[child_chunk] = val; + return true; + } + + bfm_leaf_max_set(node_max, child_chunk); + node_max->values[child_chunk] = val; + + break; + } + } + + node->b.count++; +#ifdef BFM_STATS + root->entries++; +#endif + + return false; +} + +static bool pg_noinline +bfm_set_extend(bfm_tree *root, bfm_key_type key, bfm_value_type val, + bfm_tree_node_inner *cur_inner, + uint32 shift, uint8 chunk) +{ + bfm_tree_node_leaf_1 *new_leaf_1; + + while (shift > BFM_FANOUT) + { + bfm_tree_node_inner_1 *new_inner_1; + + Assert(shift == cur_inner->b.node_shift); + + new_inner_1 = bfm_alloc_inner_1(root); + new_inner_1->b.b.node_shift = shift - BFM_FANOUT; + + bfm_insert_inner(root, cur_inner, &new_inner_1->b.b, chunk); + + shift -= BFM_FANOUT; + chunk = (key >> shift) & BFM_MASK; + cur_inner = &new_inner_1->b; + } + + Assert(shift == BFM_FANOUT && cur_inner->b.node_shift == BFM_FANOUT); + + new_leaf_1 = bfm_alloc_leaf_1(root); + new_leaf_1->b.b.count = 1; + new_leaf_1->b.b.node_shift = 0; + + new_leaf_1->chunk = key & BFM_MASK; + new_leaf_1->value = val; + 
+#ifdef BFM_STATS + root->entries++; +#endif + + bfm_insert_inner(root, cur_inner, &new_leaf_1->b.b, chunk); + + return false; +} + +static bool pg_noinline +bfm_set_empty(bfm_tree *root, bfm_key_type key, bfm_value_type val) +{ + uint32 shift; + + Assert(root->rnode == NULL); + + if (key == 0) + shift = 0; + else + shift = (pg_leftmost_one_pos64(key)/BFM_FANOUT)*BFM_FANOUT; + + if (shift == 0) + { + bfm_tree_node_leaf_1 *nroot = bfm_alloc_leaf_1(root); + + Assert((key & BFM_MASK) == key); + + nroot->b.b.node_shift = 0; + nroot->b.b.node_chunk = 0; + nroot->b.b.parent = NULL; + + root->maxval = bfm_maxval_shift(0); + + root->rnode = &nroot->b.b; + + return bfm_set_leaf(root, key, val, &nroot->b, key); + } + else + { + bfm_tree_node_inner_1 *nroot = bfm_alloc_inner_1(root); + + nroot->b.b.node_shift = shift; + nroot->b.b.node_chunk = 0; + nroot->b.b.parent = NULL; + + root->maxval = bfm_maxval_shift(shift); + root->rnode = &nroot->b.b; + + + return bfm_set_extend(root, key, val, &nroot->b, + shift, (key >> shift) & BFM_MASK); + } +} + +/* + * Tree doesn't have sufficient height. Put new tree node(s) on top, move + * the old node below it, and then insert. + */ +static bool pg_noinline +bfm_set_shallow(bfm_tree *root, bfm_key_type key, bfm_value_type val) +{ + uint32 shift; + bfm_tree_node_inner_1 *nroot = NULL;; + + Assert(root->rnode != NULL); + + if (key == 0) + shift = 0; + else + shift = (pg_leftmost_one_pos64(key)/BFM_FANOUT)*BFM_FANOUT; + + Assert(root->rnode->node_shift < shift); + + while (unlikely(root->rnode->node_shift < shift)) + { + nroot = bfm_alloc_inner_1(root); + + nroot->slot = root->rnode; + nroot->chunk = 0; + nroot->b.b.count = 1; + nroot->b.b.parent = NULL; + nroot->b.b.node_shift = root->rnode->node_shift + BFM_FANOUT; + + root->rnode->parent = &nroot->b; + root->rnode = &nroot->b.b; + + root->maxval = bfm_maxval_shift(nroot->b.b.node_shift); + } + + Assert(nroot != NULL); + + return bfm_set_extend(root, key, val, &nroot->b, + shift, (key >> shift) & BFM_MASK); +} + +static void +bfm_delete_inner(bfm_tree * pg_restrict root, bfm_tree_node_inner * pg_restrict node, bfm_tree_node *pg_restrict child, int child_chunk) +{ + switch((bfm_tree_node_kind) node->b.kind) + { + case BFM_KIND_1: + { + bfm_tree_node_inner_1 *node_1 = + (bfm_tree_node_inner_1 *) node; + + Assert(node_1->slot == child); + Assert(node_1->chunk == child_chunk); + + node_1->chunk = 17; + node_1->slot = NULL; + + break; + } + + case BFM_KIND_4: + { + bfm_tree_node_inner_4 *node_4 = + (bfm_tree_node_inner_4 *) node; + int index; + + index = search_chunk_array_4_eq(node_4->chunks, child_chunk, node_4->b.b.count); + Assert(index != -1); + + Assert(node_4->slots[index] == child); + memmove(&node_4->slots[index], + &node_4->slots[index + 1], + (node_4->b.b.count-index-1) * sizeof(void*)); + memmove(&node_4->chunks[index], + &node_4->chunks[index + 1], + node_4->b.b.count-index-1); + + node_4->chunks[node_4->b.b.count - 1] = BFM_TREE_NODE_INNER_4_INVALID; + node_4->slots[node_4->b.b.count - 1] = NULL; + + break; + } + + case BFM_KIND_16: + { + bfm_tree_node_inner_16 *node_16 = + (bfm_tree_node_inner_16 *) node; + int index; + + index = search_chunk_array_16_eq(node_16->chunks, child_chunk, node_16->b.b.count); + Assert(index != -1); + + Assert(node_16->slots[index] == child); + memmove(&node_16->slots[index], + &node_16->slots[index + 1], + (node_16->b.b.count - index - 1) * sizeof(node_16->slots[0])); + memmove(&node_16->chunks[index], + &node_16->chunks[index + 1], + (node_16->b.b.count - index - 1) * 
sizeof(node_16->chunks[0])); + + node_16->chunks[node_16->b.b.count - 1] = BFM_TREE_NODE_INNER_16_INVALID; + node_16->slots[node_16->b.b.count - 1] = NULL; + + break; + } + + case BFM_KIND_32: + { + bfm_tree_node_inner_32 *node_32 = + (bfm_tree_node_inner_32 *) node; + int index; + + index = search_chunk_array_32_eq(node_32->chunks, child_chunk, node_32->b.b.count); + Assert(index != -1); + + Assert(node_32->slots[index] == child); + memmove(&node_32->slots[index], + &node_32->slots[index + 1], + (node_32->b.b.count - index - 1) * sizeof(node_32->slots[0])); + memmove(&node_32->chunks[index], + &node_32->chunks[index + 1], + (node_32->b.b.count - index - 1) * sizeof(node_32->chunks[0])); + + node_32->chunks[node_32->b.b.count - 1] = BFM_TREE_NODE_INNER_32_INVALID; + node_32->slots[node_32->b.b.count - 1] = NULL; + + break; + } + + case BFM_KIND_128: + { + bfm_tree_node_inner_128 *node_128 = + (bfm_tree_node_inner_128 *) node; + uint8 offset; + + offset = node_128->offsets[child_chunk]; + Assert(offset != BFM_TREE_NODE_128_INVALID); + Assert(node_128->slots[offset] == child); + node_128->offsets[child_chunk] = BFM_TREE_NODE_128_INVALID; + node_128->slots[offset] = NULL; + break; + } + + case BFM_KIND_MAX: + { + bfm_tree_node_inner_max *node_max = + (bfm_tree_node_inner_max *) node; + + Assert(node_max->slots[child_chunk] == child); + node_max->slots[child_chunk] = NULL; + + break; + } + } + + node->b.count--; + + if (node->b.count == 0) + { + if (node->b.parent) + bfm_delete_inner(root, node->b.parent, &node->b, node->b.node_chunk); + else + root->rnode = NULL; + bfm_free_inner(root, node); + } +} + +/* + * NB: After this call node cannot be used anymore, it may have been freed or + * shrunk. + * + * FIXME: this should implement shrinking of nodes + */ +static void pg_noinline +bfm_delete_leaf(bfm_tree * pg_restrict root, bfm_tree_node_leaf *pg_restrict node, int child_chunk) +{ + /* tell the compiler it doesn't need a bounds check */ + if ((bfm_tree_node_kind) node->b.kind > BFM_KIND_MAX) + pg_unreachable(); + + switch((bfm_tree_node_kind) node->b.kind) + { + case BFM_KIND_1: + { + bfm_tree_node_leaf_1 *node_1 = + (bfm_tree_node_leaf_1 *) node; + + Assert(node_1->chunk == child_chunk); + + node_1->chunk = 17; + break; + } + + case BFM_KIND_4: + { + bfm_tree_node_leaf_4 *node_4 = + (bfm_tree_node_leaf_4 *) node; + int index; + + index = search_chunk_array_4_eq(node_4->chunks, child_chunk, node_4->b.b.count); + Assert(index != -1); + + memmove(&node_4->values[index], + &node_4->values[index + 1], + (node_4->b.b.count - index - 1) * sizeof(node_4->values[0])); + memmove(&node_4->chunks[index], + &node_4->chunks[index + 1], + (node_4->b.b.count - index - 1) * sizeof(node_4->chunks[0])); + + node_4->chunks[node_4->b.b.count - 1] = BFM_TREE_NODE_INNER_4_INVALID; + node_4->values[node_4->b.b.count - 1] = 0xFF; + + break; + } + + case BFM_KIND_16: + { + bfm_tree_node_leaf_16 *node_16 = + (bfm_tree_node_leaf_16 *) node; + int index; + + index = search_chunk_array_16_eq(node_16->chunks, child_chunk, node_16->b.b.count); + Assert(index != -1); + + memmove(&node_16->values[index], + &node_16->values[index + 1], + (node_16->b.b.count - index - 1) * sizeof(node_16->values[0])); + memmove(&node_16->chunks[index], + &node_16->chunks[index + 1], + (node_16->b.b.count - index - 1) * sizeof(node_16->chunks[0])); + + node_16->chunks[node_16->b.b.count - 1] = BFM_TREE_NODE_INNER_16_INVALID; + node_16->values[node_16->b.b.count - 1] = 0xFF; + + break; + } + + case BFM_KIND_32: + { + bfm_tree_node_leaf_32 
*node_32 = + (bfm_tree_node_leaf_32 *) node; + int index; + + index = search_chunk_array_32_eq(node_32->chunks, child_chunk, node_32->b.b.count); + Assert(index != -1); + + memmove(&node_32->values[index], + &node_32->values[index + 1], + (node_32->b.b.count - index - 1) * sizeof(node_32->values[0])); + memmove(&node_32->chunks[index], + &node_32->chunks[index + 1], + (node_32->b.b.count - index - 1) * sizeof(node_32->chunks[0])); + + node_32->chunks[node_32->b.b.count - 1] = BFM_TREE_NODE_INNER_32_INVALID; + node_32->values[node_32->b.b.count - 1] = 0xFF; + + break; + } + + case BFM_KIND_128: + { + bfm_tree_node_leaf_128 *node_128 = + (bfm_tree_node_leaf_128 *) node; + + Assert(node_128->offsets[child_chunk] != BFM_TREE_NODE_128_INVALID); + node_128->offsets[child_chunk] = BFM_TREE_NODE_128_INVALID; + break; + } + + case BFM_KIND_MAX: + { + bfm_tree_node_leaf_max *node_max = + (bfm_tree_node_leaf_max *) node; + + Assert(bfm_leaf_max_isset(node_max, child_chunk)); + bfm_leaf_max_unset(node_max, child_chunk); + + break; + } + } + +#ifdef BFM_STATS + root->entries--; +#endif + node->b.count--; + + if (node->b.count == 0) + { + if (node->b.parent) + bfm_delete_inner(root, node->b.parent, &node->b, node->b.node_chunk); + else + root->rnode = NULL; + bfm_free_leaf(root, node); + } +} + +void +bfm_init(bfm_tree *root) +{ + memset(root, 0, sizeof(*root)); + +#if 1 + root->context = AllocSetContextCreate(CurrentMemoryContext, "radix bench internal", + ALLOCSET_DEFAULT_SIZES); +#else + root->context = CurrentMemoryContext; +#endif + +#ifdef BFM_USE_SLAB + for (int i = 0; i < BFM_KIND_COUNT; i++) + { + root->inner_slabs[i] = SlabContextCreate(root->context, + inner_class_info[i].name, + Max(pg_nextpower2_32((MAXALIGN(inner_class_info[i].size) + 16) * 32), 1024), + inner_class_info[i].size); + root->leaf_slabs[i] = SlabContextCreate(root->context, + leaf_class_info[i].name, + Max(pg_nextpower2_32((MAXALIGN(leaf_class_info[i].size) + 16) * 32), 1024), + leaf_class_info[i].size); +#if 0 + elog(LOG, "%s %s size original %zu, mult %zu, round %u", + "leaf", + leaf_class_info[i].name, + leaf_class_info[i].size, + leaf_class_info[i].size * 32, + pg_nextpower2_32(leaf_class_info[i].size * 32)); +#endif + } +#endif + + /* + * XXX: Might be worth to always allocate a root node, to avoid related + * branches? + */ +} + +bool +bfm_lookup(bfm_tree *root, uint64_t key, bfm_value_type *val) +{ + bfm_tree_node *node; + + return bfm_walk(root, &node, val, key); +} + +/* + * Set key to val. Returns false if entry doesn't yet exist, true if it did. 
+
+/*
+ * Set key to val. Returns false if the entry did not yet exist, true if it
+ * did.
+ */
+bool
+bfm_set(bfm_tree *root, bfm_key_type key, bfm_value_type val)
+{
+	bfm_tree_node *cur;
+	bfm_tree_node_leaf *target;
+	uint8		chunk;
+	uint32		shift;
+
+	if (unlikely(!root->rnode))
+		return bfm_set_empty(root, key, val);
+	else if (key > root->maxval)
+		return bfm_set_shallow(root, key, val);
+
+	shift = root->rnode->node_shift;
+	chunk = (key >> shift) & BFM_MASK;
+	cur = root->rnode;
+
+	while (shift > 0)
+	{
+		bfm_tree_node_inner *cur_inner;
+		bfm_tree_node *slot;
+
+		Assert(cur->node_shift > 0);	/* leaf nodes look different */
+		Assert(cur->node_shift == shift);
+
+		cur_inner = (bfm_tree_node_inner *) cur;
+
+		slot = bfm_find_one_level_inner(cur_inner, chunk);
+
+		if (slot == NULL)
+			return bfm_set_extend(root, key, val, cur_inner, shift, chunk);
+
+		Assert(&slot->parent->b == cur);
+		Assert(slot->node_chunk == chunk);
+
+		cur = slot;
+		shift -= BFM_FANOUT;
+		chunk = (key >> shift) & BFM_MASK;
+	}
+
+	Assert(shift == 0 && cur->node_shift == 0);
+
+	target = (bfm_tree_node_leaf *) cur;
+
+	/*
+	 * FIXME: what is the best API to deal with existing values? Overwrite?
+	 * Overwrite and return old value? Just return true?
+	 */
+	return bfm_set_leaf(root, key, val, target, chunk);
+}
+
+bool
+bfm_delete(bfm_tree *root, uint64 key)
+{
+	bfm_tree_node *node;
+	bfm_value_type val;
+
+	if (!bfm_walk(root, &node, &val, key))
+		return false;
+
+	Assert(node != NULL && node->node_shift == 0);
+
+	/* recurses upwards and deletes parent nodes if necessary */
+	bfm_delete_leaf(root, (bfm_tree_node_leaf *) node, key & BFM_MASK);
+
+	return true;
+}
+
+
+StringInfo
+bfm_stats(bfm_tree *root)
+{
+	StringInfo	s;
+#ifdef BFM_STATS
+	size_t		total;
+	size_t		inner_bytes;
+	size_t		leaf_bytes;
+	size_t		allocator_bytes;
+#endif
+
+	s = makeStringInfo();
+
+	/* FIXME: Some of the below could be printed even without BFM_STATS */
+#ifdef BFM_STATS
+	appendStringInfo(s, "%zu entries and depth %d\n",
+					 root->entries,
+					 root->rnode ? root->rnode->node_shift / BFM_FANOUT : 0);
+
+	{
+		appendStringInfo(s, "\tinner nodes:");
+		total = 0;
+		inner_bytes = 0;
+		for (int i = 0; i < BFM_KIND_COUNT; i++)
+		{
+			total += root->inner_nodes[i];
+			inner_bytes += inner_class_info[i].size * root->inner_nodes[i];
+			appendStringInfo(s, " %s: %zu, ",
+							 inner_class_info[i].name,
+							 root->inner_nodes[i]);
+		}
+		appendStringInfo(s, " total: %zu, total_bytes: %zu\n", total,
+						 inner_bytes);
+	}
+
+	{
+		appendStringInfo(s, "\tleaf nodes:");
+		total = 0;
+		leaf_bytes = 0;
+		for (int i = 0; i < BFM_KIND_COUNT; i++)
+		{
+			total += root->leaf_nodes[i];
+			leaf_bytes += leaf_class_info[i].size * root->leaf_nodes[i];
+			appendStringInfo(s, " %s: %zu, ",
+							 leaf_class_info[i].name,
+							 root->leaf_nodes[i]);
+		}
+		appendStringInfo(s, " total: %zu, total_bytes: %zu\n", total,
+						 leaf_bytes);
+	}
+
+	allocator_bytes = MemoryContextMemAllocated(root->context, true);
+
+	appendStringInfo(s, "\t%.2f MiB excluding allocator overhead, %.2f MiB including\n",
+					 (inner_bytes + leaf_bytes) / (double) (1024 * 1024),
+					 allocator_bytes / (double) (1024 * 1024));
+	appendStringInfo(s, "\t%.2f bytes/entry excluding allocator overhead\n",
+					 root->entries > 0 ?
+					 (inner_bytes + leaf_bytes) / (double) root->entries : 0);
+	appendStringInfo(s, "\t%.2f bytes/entry including allocator overhead\n",
+					 root->entries > 0 ?
+					 allocator_bytes / (double) root->entries : 0);
+#endif
+
+	if (0)
+		bfm_print(root);
+
+	return s;
+}
+
+static void
+bfm_print_node(StringInfo s, int indent, bfm_value_type key, bfm_tree_node *node);
+
+static void
+bfm_print_node_child(StringInfo s, int indent, bfm_value_type key, bfm_tree_node *node,
+					 int i, uint8 chunk, bfm_tree_node *child)
+{
+	appendStringInfoSpaces(s, indent + 2);
+	appendStringInfo(s, "%u: child chunk: 0x%.2X, child: %p\n",
+					 i, chunk, child);
+	key |= ((uint64) chunk) << node->node_shift;
+
+	bfm_print_node(s, indent + 4, key, child);
+}
+
+static void
+bfm_print_value(StringInfo s, int indent, bfm_value_type key, bfm_tree_node *node,
+				int i, uint8 chunk, bfm_value_type value)
+{
+	key |= chunk;
+
+	appendStringInfoSpaces(s, indent + 2);
+	appendStringInfo(s, "%u: chunk: 0x%.2X, key: 0x%llX/%llu, value: 0x%llX/%llu\n",
+					 i,
+					 chunk,
+					 (unsigned long long) key,
+					 (unsigned long long) key,
+					 (unsigned long long) value,
+					 (unsigned long long) value);
+}
+
+static void
+bfm_print_node(StringInfo s, int indent, bfm_value_type key, bfm_tree_node *node)
+{
+	appendStringInfoSpaces(s, indent);
+	appendStringInfo(s, "%s: kind %d, children: %u, shift: %u, node chunk: 0x%.2X, partial key: 0x%llX\n",
+					 node->node_shift != 0 ? "inner" : "leaf",
+					 node->kind,
+					 node->count,
+					 node->node_shift,
+					 node->node_chunk,
+					 (long long unsigned) key);
+
+	if (node->node_shift != 0)
+	{
+		bfm_tree_node_inner *inner = (bfm_tree_node_inner *) node;
+
+		switch ((bfm_tree_node_kind) inner->b.kind)
+		{
+			case BFM_KIND_1:
+			{
+				bfm_tree_node_inner_1 *node_1 =
+					(bfm_tree_node_inner_1 *) node;
+
+				if (node_1->b.b.count > 0)
+					bfm_print_node_child(s, indent, key, node,
+										 0, node_1->chunk, node_1->slot);
+
+				break;
+			}
+
+			case BFM_KIND_4:
+			{
+				bfm_tree_node_inner_4 *node_4 =
+					(bfm_tree_node_inner_4 *) node;
+
+				for (int i = 0; i < node_4->b.b.count; i++)
+				{
+					bfm_print_node_child(s, indent, key, node,
+										 i, node_4->chunks[i], node_4->slots[i]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_16:
+			{
+				bfm_tree_node_inner_16 *node_16 =
+					(bfm_tree_node_inner_16 *) node;
+
+				for (int i = 0; i < node_16->b.b.count; i++)
+				{
+					bfm_print_node_child(s, indent, key, node,
+										 i, node_16->chunks[i], node_16->slots[i]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_32:
+			{
+				bfm_tree_node_inner_32 *node_32 =
+					(bfm_tree_node_inner_32 *) node;
+
+				for (int i = 0; i < node_32->b.b.count; i++)
+				{
+					bfm_print_node_child(s, indent, key, node,
+										 i, node_32->chunks[i], node_32->slots[i]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_128:
+			{
+				bfm_tree_node_inner_128 *node_128 =
+					(bfm_tree_node_inner_128 *) node;
+
+				for (int i = 0; i < BFM_MAX_CLASS; i++)
+				{
+					uint8		offset = node_128->offsets[i];
+
+					if (offset == BFM_TREE_NODE_128_INVALID)
+						continue;
+
+					bfm_print_node_child(s, indent, key, node,
+										 offset, i, node_128->slots[offset]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_MAX:
+			{
+				bfm_tree_node_inner_max *node_max =
+					(bfm_tree_node_inner_max *) node;
+
+				for (int i = 0; i < BFM_MAX_CLASS; i++)
+				{
+					if (node_max->slots[i] == NULL)
+						continue;
+
+					bfm_print_node_child(s, indent, key, node,
+										 i, i, node_max->slots[i]);
+				}
+
+				break;
+			}
+		}
+	}
+	else
+	{
+		bfm_tree_node_leaf *leaf = (bfm_tree_node_leaf *) node;
+
+		switch ((bfm_tree_node_kind) leaf->b.kind)
+		{
+			case BFM_KIND_1:
+			{
+				bfm_tree_node_leaf_1 *node_1 =
+					(bfm_tree_node_leaf_1 *) node;
+
+				if (node_1->b.b.count > 0)
+					bfm_print_value(s, indent, key, node,
+									0, node_1->chunk, node_1->value);
+
+				break;
+			}
+
+			case BFM_KIND_4:
+			{
+				bfm_tree_node_leaf_4 *node_4 =
+					(bfm_tree_node_leaf_4 *) node;
+
+				for (int i = 0; i < node_4->b.b.count; i++)
+				{
+					bfm_print_value(s, indent, key, node,
+									i, node_4->chunks[i], node_4->values[i]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_16:
+			{
+				bfm_tree_node_leaf_16 *node_16 =
+					(bfm_tree_node_leaf_16 *) node;
+
+				for (int i = 0; i < node_16->b.b.count; i++)
+				{
+					bfm_print_value(s, indent, key, node,
+									i, node_16->chunks[i], node_16->values[i]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_32:
+			{
+				bfm_tree_node_leaf_32 *node_32 =
+					(bfm_tree_node_leaf_32 *) node;
+
+				for (int i = 0; i < node_32->b.b.count; i++)
+				{
+					bfm_print_value(s, indent, key, node,
+									i, node_32->chunks[i], node_32->values[i]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_128:
+			{
+				bfm_tree_node_leaf_128 *node_128 =
+					(bfm_tree_node_leaf_128 *) node;
+
+				for (int i = 0; i < BFM_MAX_CLASS; i++)
+				{
+					uint8		offset = node_128->offsets[i];
+
+					if (offset == BFM_TREE_NODE_128_INVALID)
+						continue;
+
+					bfm_print_value(s, indent, key, node,
+									offset, i, node_128->values[offset]);
+				}
+
+				break;
+			}
+
+			case BFM_KIND_MAX:
+			{
+				bfm_tree_node_leaf_max *node_max =
+					(bfm_tree_node_leaf_max *) node;
+
+				for (int i = 0; i < BFM_MAX_CLASS; i++)
+				{
+					if (!bfm_leaf_max_isset(node_max, i))
+						continue;
+
+					bfm_print_value(s, indent, key, node,
+									i, i, node_max->values[i]);
+				}
+
+				break;
+			}
+		}
+	}
+}
+
+void
+bfm_print(bfm_tree *root)
+{
+	StringInfoData s;
+
+	initStringInfo(&s);
+
+	if (root->rnode)
+		bfm_print_node(&s, 0 /* indent */, 0 /* key */, root->rnode);
+
+	elog(LOG, "radix debug print:\n%s", s.data);
+	pfree(s.data);
+}
+
+
+#define EXPECT_TRUE(expr) \
+	do { \
+		if (!(expr)) \
+			elog(ERROR, \
+				 "%s was unexpectedly false in file \"%s\" line %u", \
+				 #expr, __FILE__, __LINE__); \
+	} while (0)
+
+#define EXPECT_FALSE(expr) \
+	do { \
+		if (expr) \
+			elog(ERROR, \
+				 "%s was unexpectedly true in file \"%s\" line %u", \
+				 #expr, __FILE__, __LINE__); \
+	} while (0)
+
+#define EXPECT_EQ_U32(result_expr, expected_expr) \
+	do { \
+		uint32 result = (result_expr); \
+		uint32 expected = (expected_expr); \
+		if (result != expected) \
+			elog(ERROR, \
+				 "%s yielded %u, expected %s in file \"%s\" line %u", \
+				 #result_expr, result, #expected_expr, __FILE__, __LINE__); \
+	} while (0)
+
+static void
+bfm_test_insert_leaf_grow(bfm_tree *root)
+{
+	bfm_value_type val;
+
+	/* 0->1 */
+	EXPECT_FALSE(bfm_set(root, 0, 0+3));
+	EXPECT_TRUE(bfm_lookup(root, 0, &val));
+	EXPECT_EQ_U32(val, 0+3);
+
+	/* node 1->4 */
+	for (int i = 1; i < 4; i++)
+	{
+		EXPECT_FALSE(bfm_set(root, i, i+3));
+	}
+	for (int i = 0; i < 4; i++)
+	{
+		EXPECT_TRUE(bfm_lookup(root, i, &val));
+		EXPECT_EQ_U32(val, i+3);
+	}
+
+	/* node 4->16, reverse order, for giggles */
+	for (int i = 15; i >= 4; i--)
+	{
+		EXPECT_FALSE(bfm_set(root, i, i+3));
+	}
+	for (int i = 0; i < 16; i++)
+	{
+		EXPECT_TRUE(bfm_lookup(root, i, &val));
+		EXPECT_EQ_U32(val, i+3);
+	}
+
+	/* node 16->32 */
+	for (int i = 16; i < 32; i++)
+	{
+		EXPECT_FALSE(bfm_set(root, i, i+3));
+	}
+	for (int i = 0; i < 32; i++)
+	{
+		EXPECT_TRUE(bfm_lookup(root, i, &val));
+		EXPECT_EQ_U32(val, i+3);
+	}
+
+	/* node 32->128 */
+	for (int i = 32; i < 128; i++)
+	{
+		EXPECT_FALSE(bfm_set(root, i, i+3));
+	}
+	for (int i = 0; i < 128; i++)
+	{
+		EXPECT_TRUE(bfm_lookup(root, i, &val));
+		EXPECT_EQ_U32(val, i+3);
+	}
+
+	/* node 128->max */
+	for (int i = 128; i < BFM_MAX_CLASS; i++)
+	{
+		EXPECT_FALSE(bfm_set(root, i, i+3));
+	}
+	for (int i = 0; i < BFM_MAX_CLASS; i++)
+	{
+		EXPECT_TRUE(bfm_lookup(root, i, &val));
+		EXPECT_EQ_U32(val, i+3);
+	}
+
+}
+
+static void
+bfm_test_insert_inner_grow(void)
+{
+	bfm_tree	root;
+	bfm_value_type val;
+	bfm_value_type cur;
+
+	bfm_init(&root);
+
+	cur = 1025;
+
+	while (!root.rnode ||
+		   root.rnode->node_shift == 0 ||
+		   root.rnode->count < 4)
+	{
+		EXPECT_FALSE(bfm_set(&root, cur, -cur));
+		cur += BFM_MAX_CLASS;
+	}
+
+	for (int i = 1025; i < cur; i += BFM_MAX_CLASS)
+	{
+		EXPECT_TRUE(bfm_lookup(&root, i, &val));
+		EXPECT_EQ_U32(val, -i);
+	}
+
+	while (root.rnode->count < 32)
+	{
+		EXPECT_FALSE(bfm_set(&root, cur, -cur));
+		cur += BFM_MAX_CLASS;
+	}
+
+	for (int i = 1025; i < cur; i += BFM_MAX_CLASS)
+	{
+		EXPECT_TRUE(bfm_lookup(&root, i, &val));
+		EXPECT_EQ_U32(val, -i);
+	}
+
+	while (root.rnode->count < 128)
+	{
+		EXPECT_FALSE(bfm_set(&root, cur, -cur));
+		cur += BFM_MAX_CLASS;
+	}
+
+	for (int i = 1025; i < cur; i += BFM_MAX_CLASS)
+	{
+		EXPECT_TRUE(bfm_lookup(&root, i, &val));
+		EXPECT_EQ_U32(val, -i);
+	}
+
+	while (root.rnode->count < BFM_MAX_CLASS)
+	{
+		EXPECT_FALSE(bfm_set(&root, cur, -cur));
+		cur += BFM_MAX_CLASS;
+	}
+
+	for (int i = 1025; i < cur; i += BFM_MAX_CLASS)
+	{
+		EXPECT_TRUE(bfm_lookup(&root, i, &val));
+		EXPECT_EQ_U32(val, -i);
+	}
+
+	while (root.rnode->count == BFM_MAX_CLASS)
+	{
+		EXPECT_FALSE(bfm_set(&root, cur, -cur));
+		cur += BFM_MAX_CLASS;
+	}
+
+	for (int i = 1025; i < cur; i += BFM_MAX_CLASS)
+	{
+		EXPECT_TRUE(bfm_lookup(&root, i, &val));
+		EXPECT_EQ_U32(val, -i);
+	}
+
+}
+
+static void
+bfm_test_delete_lots(void)
+{
+	bfm_tree	root;
+	bfm_value_type val;
+	bfm_key_type insertval;
+
+	bfm_init(&root);
+
+	insertval = 0;
+	while (!root.rnode ||
+		   root.rnode->node_shift != (BFM_FANOUT * 2))
+	{
+		EXPECT_FALSE(bfm_set(&root, insertval, -insertval));
+		insertval++;
+	}
+
+	for (bfm_key_type i = 0; i < insertval; i++)
+	{
+		EXPECT_TRUE(bfm_lookup(&root, i, &val));
+		EXPECT_EQ_U32(val, -i);
+		EXPECT_TRUE(bfm_delete(&root, i));
+		EXPECT_FALSE(bfm_lookup(&root, i, &val));
+	}
+
+	EXPECT_TRUE(root.rnode == NULL);
+}
+
+#include "portability/instr_time.h"
+
+static void
+bfm_test_insert_bulk(int count)
+{
+	bfm_tree	root;
+	bfm_value_type val;
+	instr_time	start, end, diff;
+	int			misses;
+	int			mult = 1;
+
+	bfm_init(&root);
+
+	INSTR_TIME_SET_CURRENT(start);
+
+	for (int i = 0; i < count; i++)
+		bfm_set(&root, i*mult, -i);
+
+	INSTR_TIME_SET_CURRENT(end);
+	INSTR_TIME_SET_ZERO(diff);
+	INSTR_TIME_ACCUM_DIFF(diff, end, start);
+
+	elog(NOTICE, "%d ordered insertions in %f seconds, %d/sec",
+		 count,
+		 INSTR_TIME_GET_DOUBLE(diff),
+		 (int) (count / INSTR_TIME_GET_DOUBLE(diff)));
+
+	INSTR_TIME_SET_CURRENT(start);
+
+	misses = 0;
+	for (int i = 0; i < count; i++)
+	{
+		if (unlikely(!bfm_lookup(&root, i*mult, &val)))
+			misses++;
+	}
+	if (misses > 0)
+		elog(ERROR, "not present for lookup: %d entries", misses);
+
+	INSTR_TIME_SET_CURRENT(end);
+	INSTR_TIME_SET_ZERO(diff);
+	INSTR_TIME_ACCUM_DIFF(diff, end, start);
+
+	elog(NOTICE, "%d ordered lookups in %f seconds, %d/sec",
+		 count,
+		 INSTR_TIME_GET_DOUBLE(diff),
+		 (int) (count / INSTR_TIME_GET_DOUBLE(diff)));
+
+	elog(LOG, "stats after lookup are: %s",
+		 bfm_stats(&root)->data);
+
+	INSTR_TIME_SET_CURRENT(start);
+
+	misses = 0;
+	for (int i = 0; i < count; i++)
+	{
+		if (unlikely(!bfm_delete(&root, i*mult)))
+			misses++;
+	}
+	if (misses > 0)
+		elog(ERROR, "not present for deletion: %d entries", misses);
+
+	INSTR_TIME_SET_CURRENT(end);
+	INSTR_TIME_SET_ZERO(diff);
+	INSTR_TIME_ACCUM_DIFF(diff, end, start);
+
+	elog(NOTICE, "%d ordered deletions in %f seconds, %d/sec",
+		 count,
+		 INSTR_TIME_GET_DOUBLE(diff),
+		 (int) (count / INSTR_TIME_GET_DOUBLE(diff)));
+
+	elog(LOG, "stats after deletion are: %s",
+		 bfm_stats(&root)->data);
+}
+
+void
+bfm_tests(void)
+{
+	bfm_tree	root;
+	bfm_value_type val;
+
+	/* initialize a tree starting with a large value */
+	bfm_init(&root);
+	EXPECT_FALSE(bfm_set(&root, 1024, 1));
+	EXPECT_TRUE(bfm_lookup(&root, 1024, &val));
+	EXPECT_EQ_U32(val, 1);
+	/* there should only be the key we inserted */
+#ifdef BFM_STATS
+	EXPECT_EQ_U32(root.leaf_nodes[0], 1);
+#endif
+
+	/* check that we can subsequently insert a small value */
+	EXPECT_FALSE(bfm_set(&root, 1, 2));
+	EXPECT_TRUE(bfm_lookup(&root, 1, &val));
+	EXPECT_EQ_U32(val, 2);
+	EXPECT_TRUE(bfm_lookup(&root, 1024, &val));
+	EXPECT_EQ_U32(val, 1);
+
+	/* check that a 0 key and 0 value are correctly recognized */
+	bfm_init(&root);
+	EXPECT_FALSE(bfm_lookup(&root, 0, &val));
+	EXPECT_FALSE(bfm_set(&root, 0, 17));
+	EXPECT_TRUE(bfm_lookup(&root, 0, &val));
+	EXPECT_EQ_U32(val, 17);
+
+	EXPECT_FALSE(bfm_lookup(&root, 2, &val));
+	EXPECT_FALSE(bfm_set(&root, 2, 0));
+	EXPECT_TRUE(bfm_lookup(&root, 2, &val));
+	EXPECT_EQ_U32(val, 0);
+
+	/* check that repeated insertion of the same key updates the value */
+	bfm_init(&root);
+	EXPECT_FALSE(bfm_set(&root, 9, 12));
+	EXPECT_TRUE(bfm_lookup(&root, 9, &val));
+	EXPECT_EQ_U32(val, 12);
+	EXPECT_TRUE(bfm_set(&root, 9, 13));
+	EXPECT_TRUE(bfm_lookup(&root, 9, &val));
+	EXPECT_EQ_U32(val, 13);
+
+
+	/* initialize a tree starting with a leaf value */
+	bfm_init(&root);
+	EXPECT_FALSE(bfm_set(&root, 3, 1));
+	EXPECT_TRUE(bfm_lookup(&root, 3, &val));
+	EXPECT_EQ_U32(val, 1);
+	/* there should only be the key we inserted */
+#ifdef BFM_STATS
+	EXPECT_EQ_U32(root.leaf_nodes[0], 1);
+#endif
+	/* and no inner ones */
+#ifdef BFM_STATS
+	EXPECT_EQ_U32(root.inner_nodes[0], 0);
+#endif
+
+	EXPECT_FALSE(bfm_set(&root, 1717, 17));
+	EXPECT_TRUE(bfm_lookup(&root, 1717, &val));
+	EXPECT_EQ_U32(val, 17);
+
+	/* check that a root leaf node grows correctly */
+	bfm_init(&root);
+	bfm_test_insert_leaf_grow(&root);
+
+	/* check that a non-root leaf node grows correctly */
+	bfm_init(&root);
+	EXPECT_FALSE(bfm_set(&root, 1024, 1024));
+	bfm_test_insert_leaf_grow(&root);
+
+	/* check that an inner node grows correctly */
+	bfm_test_insert_inner_grow();
+
+
+	bfm_init(&root);
+	EXPECT_FALSE(bfm_set(&root, 1, 1));
+	EXPECT_TRUE(bfm_lookup(&root, 1, &val));
+
+	/* deletion from leaf node at root */
+	EXPECT_TRUE(bfm_delete(&root, 1));
+	EXPECT_FALSE(bfm_lookup(&root, 1, &val));
+
+	/* repeated deletion fails */
+	EXPECT_FALSE(bfm_delete(&root, 1));
+	EXPECT_TRUE(root.rnode == NULL);
+
+	/* one deletion doesn't disturb other values in leaf */
+	EXPECT_FALSE(bfm_set(&root, 1, 1));
+	EXPECT_FALSE(bfm_set(&root, 2, 2));
+	EXPECT_TRUE(bfm_delete(&root, 1));
+	EXPECT_FALSE(bfm_lookup(&root, 1, &val));
+	EXPECT_TRUE(bfm_lookup(&root, 2, &val));
+	EXPECT_EQ_U32(val, 2);
+
+	EXPECT_TRUE(bfm_delete(&root, 2));
+	EXPECT_FALSE(bfm_lookup(&root, 2, &val));
+	EXPECT_TRUE(root.rnode == NULL);
+
+	/* deletion from a non-root leaf node succeeds */
+	EXPECT_FALSE(bfm_set(&root, 0xFFFF02, 0xFFFF02));
+	EXPECT_FALSE(bfm_set(&root, 1, 1));
+	EXPECT_FALSE(bfm_set(&root, 2, 2));
+
+	EXPECT_TRUE(bfm_delete(&root, 1));
+	EXPECT_TRUE(bfm_lookup(&root, 0xFFFF02, &val));
+	EXPECT_FALSE(bfm_lookup(&root, 1, &val));
+	EXPECT_TRUE(bfm_lookup(&root, 2, &val));
+
+	EXPECT_TRUE(bfm_delete(&root, 2));
+	EXPECT_TRUE(bfm_lookup(&root, 0xFFFF02, &val));
+	EXPECT_FALSE(bfm_lookup(&root, 1, &val));
+
+	EXPECT_TRUE(bfm_delete(&root, 0xFFFF02));
+	EXPECT_FALSE(bfm_delete(&root, 0xFFFF02));
+	EXPECT_FALSE(bfm_lookup(&root, 0xFFFF02, &val));
+	EXPECT_TRUE(root.rnode == NULL);
+
+	/* check that repeatedly inserting and deleting the same value works */
+	bfm_init(&root);
+	EXPECT_FALSE(bfm_set(&root, 0x10000, -0x10000));
+	EXPECT_FALSE(bfm_set(&root, 0, 0));
+	EXPECT_TRUE(bfm_lookup(&root, 0, &val));
+	EXPECT_TRUE(bfm_delete(&root, 0));
+	EXPECT_FALSE(bfm_lookup(&root, 0, &val));
+	EXPECT_FALSE(bfm_set(&root, 0, 0));
+	EXPECT_TRUE(bfm_set(&root, 0, 0));
+	EXPECT_TRUE(bfm_lookup(&root, 0, &val));
+
+	bfm_test_delete_lots();
+
+	if (0)
+	{
+		int			cnt = 300;
+
+		bfm_init(&root);
+		MemoryContextStats(root.context);
+		for (int i = 0; i < cnt; i++)
+			EXPECT_FALSE(bfm_set(&root, i, i));
+		MemoryContextStats(root.context);
+		for (int i = 0; i < cnt; i++)
+			EXPECT_TRUE(bfm_delete(&root, i));
+		MemoryContextStats(root.context);
+	}
+
+	if (1)
+	{
+		//bfm_test_insert_bulk( 100 * 1000);
+		//bfm_test_insert_bulk( 1000 * 1000);
+#ifdef USE_ASSERT_CHECKING
+		bfm_test_insert_bulk( 1 * 1000 * 1000);
+#endif
+		//bfm_test_insert_bulk( 10 * 1000 * 1000);
+#ifndef USE_ASSERT_CHECKING
+		bfm_test_insert_bulk( 100 * 1000 * 1000);
+#endif
+		//bfm_test_insert_bulk(1000 * 1000 * 1000);
+	}
+
+	//bfm_print(&root);
+}
diff --git a/bdbench/radix.h b/bdbench/radix.h
new file mode 100644
index 0000000..c908aa5
--- /dev/null
+++ b/bdbench/radix.h
@@ -0,0 +1,76 @@
+/*-------------------------------------------------------------------------
+ *
+ * radix.h
+ *	  radix tree, yay.
+ *
+ *
+ * Portions Copyright (c) 2014-2021, PostgreSQL Global Development Group
+ *
+ * src/include/storage/radix.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef RADIX_H
+#define RADIX_H
+
+typedef uint64 bfm_key_type;
+typedef uint64 bfm_value_type;
+//typedef uint32 bfm_value_type;
+//typedef char bfm_value_type;
+
+/* How many different size classes are there */
+#define BFM_KIND_COUNT 6
+
+typedef enum bfm_tree_node_kind
+{
+	BFM_KIND_1,
+	BFM_KIND_4,
+	BFM_KIND_16,
+	BFM_KIND_32,
+	BFM_KIND_128,
+	BFM_KIND_MAX
+} bfm_tree_node_kind;
+
+struct MemoryContextData;
+struct bfm_tree_node;
+
+/* NB: makes things a bit slower */
+#define BFM_STATS
+
+#define BFM_USE_SLAB
+//#define BFM_USE_OS
+
+/*
+ * A radix tree with nodes that are sized based on occupancy.
+ */
+typedef struct bfm_tree
+{
+	struct bfm_tree_node *rnode;
+	uint64		maxval;
+
+	struct MemoryContextData *context;
+#ifdef BFM_USE_SLAB
+	struct MemoryContextData *inner_slabs[BFM_KIND_COUNT];
+	struct MemoryContextData *leaf_slabs[BFM_KIND_COUNT];
+#endif
+
+#ifdef BFM_STATS
+	/* stats */
+	size_t		entries;
+	size_t		inner_nodes[BFM_KIND_COUNT];
+	size_t		leaf_nodes[BFM_KIND_COUNT];
+#endif
+} bfm_tree;
+
+extern void bfm_init(bfm_tree *root);
+extern bool bfm_lookup(bfm_tree *root, bfm_key_type key, bfm_value_type *val);
+extern bool bfm_set(bfm_tree *root, bfm_key_type key, bfm_value_type val);
+extern bool bfm_delete(bfm_tree *root, bfm_key_type key);
+
+extern struct StringInfoData *bfm_stats(bfm_tree *root);
+extern void bfm_print(bfm_tree *root);
+
+extern void bfm_tests(void);
+
+#endif
-- 
2.32.0.rc2
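[Editor's note: for anyone who wants to poke at this without reading the whole
patch, the public surface in radix.h boils down to init/set/lookup/delete plus
the debug helpers. A minimal usage sketch, not part of the patch; the function
name is invented, and it assumes a backend context where CurrentMemoryContext
is valid, since bfm_init() creates its memory contexts underneath it:

#include "postgres.h"

#include "lib/stringinfo.h"
#include "radix.h"

/* Hypothetical illustration of the bfm_* API declared in radix.h. */
static void
radix_usage_sketch(void)
{
	bfm_tree	tree;
	bfm_value_type val;

	/* sets up the tree and its allocator contexts */
	bfm_init(&tree);

	/* bfm_set() reports whether the key already existed */
	if (!bfm_set(&tree, 42, 4242))
		elog(LOG, "key 42 was new");

	if (bfm_lookup(&tree, 42, &val))
		elog(LOG, "key 42 -> " UINT64_FORMAT, (uint64) val);

	/* with BFM_STATS defined, prints entry count and per-size-class nodes */
	elog(LOG, "%s", bfm_stats(&tree)->data);

	/* deleting the last entry frees nodes all the way up to the root */
	if (bfm_delete(&tree, 42))
		Assert(tree.rnode == NULL);
}
]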
>From 131074dcbe72ff8af00cb879c7c92747dc100e69 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Mon, 19 Jul 2021 16:05:44 -0700
Subject: [PATCH 3/3] Add radix tree benchmark integration.

---
 bdbench/Makefile         |   2 +-
 bdbench/bdbench--1.0.sql |   4 +
 bdbench/bdbench.c        | 181 ++++++++++++++++++++++++++++++++++++++-
 bdbench/bench.sql        |   6 +-
 4 files changed, 189 insertions(+), 4 deletions(-)

diff --git a/bdbench/Makefile b/bdbench/Makefile
index 6d52940..723132a 100644
--- a/bdbench/Makefile
+++ b/bdbench/Makefile
@@ -2,7 +2,7 @@
 MODULE_big = bdbench
 DATA = bdbench--1.0.sql
-OBJS = bdbench.o vtbm.o rtbm.o
+OBJS = bdbench.o vtbm.o rtbm.o radix.o
 
 EXTENSION = bdbench
 REGRESS= bdbench
 
diff --git a/bdbench/bdbench--1.0.sql b/bdbench/bdbench--1.0.sql
index 933cf71..bd59293 100644
--- a/bdbench/bdbench--1.0.sql
+++ b/bdbench/bdbench--1.0.sql
@@ -109,3 +109,7 @@
 RETURNS text
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT VOLATILE;
+CREATE FUNCTION radix_run_tests()
+RETURNS void
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT VOLATILE;
diff --git a/bdbench/bdbench.c b/bdbench/bdbench.c
index 1df5c53..85d8eaa 100644
--- a/bdbench/bdbench.c
+++ b/bdbench/bdbench.c
@@ -19,6 +19,7 @@
 
 #include "vtbm.h"
 #include "rtbm.h"
+#include "radix.h"
 
 //#define DEBUG_DUMP_MATCHED 1
 
@@ -89,6 +90,7 @@
 PG_FUNCTION_INFO_V1(attach_dead_tuples);
 PG_FUNCTION_INFO_V1(bench);
 PG_FUNCTION_INFO_V1(test_generate_tid);
 PG_FUNCTION_INFO_V1(rtbm_test);
+PG_FUNCTION_INFO_V1(radix_run_tests);
 PG_FUNCTION_INFO_V1(prepare);
 /*
@@ -137,6 +139,16 @@ static void rtbm_attach(LVTestType *lvtt, uint64 nitems, BlockNumber minblk,
 static bool rtbm_reaped(LVTestType *lvtt, ItemPointer itemptr);
 static Size rtbm_mem_usage(LVTestType *lvtt);
 
+/* radix */
+static void radix_init(LVTestType *lvtt, uint64 nitems);
+static void radix_fini(LVTestType *lvtt);
+static void radix_attach(LVTestType *lvtt, uint64 nitems, BlockNumber minblk,
+						 BlockNumber maxblk, OffsetNumber maxoff);
+static bool radix_reaped(LVTestType *lvtt, ItemPointer itemptr);
+static Size radix_mem_usage(LVTestType *lvtt);
+static void radix_load(void *tbm, ItemPointerData *itemptrs, int nitems);
+
+
 /* Misc functions */
 static void generate_index_tuples(uint64 nitems, BlockNumber minblk,
 								  BlockNumber maxblk, OffsetNumber maxoff);
@@ -156,12 +168,13 @@ static void load_rtbm(RTbm *vtbm, ItemPointerData *itemptrs, int nitems);
 		.dtinfo = {0}, \
 		.name = #n, \
 		.init_fn = n##_init, \
+		.fini_fn = n##_fini, \
 		.attach_fn = n##_attach, \
 		.reaped_fn = n##_reaped, \
 		.mem_usage_fn = n##_mem_usage, \
 	}
 
-#define TEST_SUBJECT_TYPES 5
+#define TEST_SUBJECT_TYPES 6
 static LVTestType LVTestSubjects[TEST_SUBJECT_TYPES] =
 {
 	DECLARE_SUBJECT(array),
@@ -169,6 +182,7 @@ static LVTestType LVTestSubjects[TEST_SUBJECT_TYPES] =
 	DECLARE_SUBJECT(intset),
 	DECLARE_SUBJECT(vtbm),
 	DECLARE_SUBJECT(rtbm),
+	DECLARE_SUBJECT(radix)
 };
 
 static bool
@@ -192,6 +206,31 @@ update_info(DeadTupleInfo *info, uint64 nitems, BlockNumber minblk,
 	info->maxoff = maxoff;
 }
 
+
+/* from geqo's init_tour(), geqo_randint() */
+static int
+shuffle_randrange(unsigned short xseed[3], int lower, int upper)
+{
+	return (int) floor(pg_erand48(xseed) * ((upper - lower) + 0.999999)) + lower;
+}
+
+/* Naive Fisher-Yates implementation */
+static void
+shuffle_itemptrs(uint64 nitems, ItemPointer itemptrs)
+{
+	/* reproducibility */
+	unsigned short xseed[3] = {0};
+
+	for (int i = 0; i < nitems - 1; i++)
+	{
+		int			j = shuffle_randrange(xseed, i, nitems - 1);
+		ItemPointerData t = itemptrs[j];
+
+		itemptrs[j] = itemptrs[i];
+		itemptrs[i] = t;
+	}
+}
+
 static void
 generate_index_tuples(uint64 nitems, BlockNumber minblk,
 					  BlockNumber maxblk, OffsetNumber maxoff)
@@ -586,6 +625,138 @@ load_rtbm(RTbm *rtbm, ItemPointerData *itemptrs, int nitems)
 		rtbm_add_tuples(rtbm, curblkno, offs, noffs);
 }
 
+/* ---------- radix ---------- */
+static void
+radix_init(LVTestType *lvtt, uint64 nitems)
+{
+	MemoryContext old_ctx;
+
+	lvtt->mcxt = AllocSetContextCreate(TopMemoryContext,
+									   "radix bench",
+									   ALLOCSET_DEFAULT_SIZES);
+	old_ctx = MemoryContextSwitchTo(lvtt->mcxt);
+	lvtt->private = palloc(sizeof(bfm_tree));
+	bfm_init(lvtt->private);
+	MemoryContextSwitchTo(old_ctx);
+}
+static void
+radix_fini(LVTestType *lvtt)
+{
+#if 0
+	if (lvtt->private)
+		bfm_free((RTbm *) lvtt->private);
+#endif
+}
+
+/* log(sizeof(bfm_value_type) * BITS_PER_BYTE, 2) = log(64, 2) = 6 */
+#define ENCODE_BITS 6
+
+static uint64
+radix_to_key_off(ItemPointer tid, uint32 *off)
+{
+	uint64		upper;
+
+	uint32		shift = pg_ceil_log2_32(MaxHeapTuplesPerPage);
+	int64		tid_i;
+
+	Assert(ItemPointerGetOffsetNumber(tid) < MaxHeapTuplesPerPage);
+
+	tid_i = ItemPointerGetOffsetNumber(tid);
+	tid_i |= (int64) ItemPointerGetBlockNumber(tid) << shift;
+
+	*off = tid_i & ((1 << ENCODE_BITS) - 1);
+	upper = tid_i >> ENCODE_BITS;
+	Assert(*off < (sizeof(bfm_value_type) * BITS_PER_BYTE));
+
+	Assert(*off < 64);
+
+	return upper;
+}
+
+static void
+radix_attach(LVTestType *lvtt, uint64 nitems, BlockNumber minblk,
+			 BlockNumber maxblk, OffsetNumber maxoff)
+{
+	MemoryContext oldcontext = MemoryContextSwitchTo(lvtt->mcxt);
+
+	radix_load(lvtt->private,
+			   DeadTuples_orig->itemptrs,
+			   DeadTuples_orig->dtinfo.nitems);
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+
+static bool
+radix_reaped(LVTestType *lvtt, ItemPointer itemptr)
+{
+	uint64		key;
+	uint32		off;
+	bfm_value_type val;
+
+	key = radix_to_key_off(itemptr, &off);
+
+	if (!bfm_lookup((bfm_tree *) lvtt->private, key, &val))
+		return false;
+
+	return val & ((bfm_value_type) 1 << off);
+}
+
+static Size
+radix_mem_usage(LVTestType *lvtt)
+{
+	bfm_tree   *root = lvtt->private;
+	size_t		mem = MemoryContextMemAllocated(lvtt->mcxt, true);
+	StringInfo	s;
+
+	s = bfm_stats(root);
+
+	ereport(NOTICE,
+			errmsg("radix tree of %.2f MB, %s",
+				   (double) mem / (1024 * 1024),
+				   s->data),
+			errhidestmt(true),
+			errhidecontext(true));
+
+	pfree(s->data);
+	pfree(s);
+
+	return mem;
+}
+
+static void
+radix_load(void *tbm, ItemPointerData *itemptrs, int nitems)
+{
+	bfm_tree   *root = (bfm_tree *) tbm;
+	uint64		last_key = PG_UINT64_MAX;
+	uint64		val = 0;
+
+	for (int i = 0; i < nitems; i++)
+	{
+		ItemPointer tid = &(itemptrs[i]);
+		uint64		key;
+		uint32		off;
+
+		key = radix_to_key_off(tid, &off);
+
+		if (last_key != PG_UINT64_MAX &&
+			last_key != key)
+		{
+			bfm_set(root, last_key, val);
+			val = 0;
+		}
+
+		last_key = key;
+		val |= (uint64) 1 << off;
+	}
+
+	if (last_key != PG_UINT64_MAX)
+	{
+		bfm_set(root, last_key, val);
+	}
+}
+
+
 static void
 attach(LVTestType *lvtt, uint64 nitems, BlockNumber minblk,
 	   BlockNumber maxblk, OffsetNumber maxoff)
@@ -952,3 +1123,11 @@ rtbm_test(PG_FUNCTION_ARGS)
 
 	PG_RETURN_NULL();
 }
+
+Datum
+radix_run_tests(PG_FUNCTION_ARGS)
+{
+	bfm_tests();
+
+	PG_RETURN_VOID();
+}
diff --git a/bdbench/bench.sql b/bdbench/bench.sql
index c5ef2d3..94cfde0 100644
--- a/bdbench/bench.sql
+++ b/bdbench/bench.sql
@@ -11,10 +11,11 @@ select prepare(
 
 -- Load dead tuples to all data structures.
 select 'array', attach_dead_tuples('array');
-select 'tbm', attach_dead_tuples('tbm');
 select 'intset', attach_dead_tuples('intset');
-select 'vtbm', attach_dead_tuples('vtbm');
 select 'rtbm', attach_dead_tuples('rtbm');
+select 'tbm', attach_dead_tuples('tbm');
+select 'vtbm', attach_dead_tuples('vtbm');
+select 'radix', attach_dead_tuples('radix');
 
 -- Do benchmark of lazy_tid_reaped.
 select 'array bench', bench('array');
@@ -22,6 +23,7 @@ select 'intset bench', bench('intset');
 select 'rtbm bench', bench('rtbm');
 select 'tbm bench', bench('tbm');
 select 'vtbm bench', bench('vtbm');
+select 'radix bench', bench('radix');
 
 -- Check the memory usage.
 select * from pg_backend_memory_contexts where name ~ 'bench' or name = 'TopMemoryContext' order by name;
-- 
2.32.0.rc2
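[Editor's note: a worked example of the TID encoding used by radix_to_key_off()
above, since it's easy to misread. With the default 8kB block size,
MaxHeapTuplesPerPage is 291, so pg_ceil_log2_32() yields a shift of 9; the low
ENCODE_BITS = 6 bits of the combined value select a bit in the 64-bit leaf
value and the remaining bits become the tree key, so one radix entry covers 64
consecutive offset slots. The standalone re-implementation below hard-codes
those constants and adds a main() harness; it is an illustration, not part of
the patch:

#include <stdint.h>
#include <stdio.h>

#define ENCODE_BITS 6			/* log2(bits in a 64-bit bfm_value_type) */
#define TID_SHIFT   9			/* pg_ceil_log2_32(MaxHeapTuplesPerPage = 291) */

/* Same split as radix_to_key_off(): returns the radix key, sets *off to
 * the bit position within the 64-bit leaf value. */
static uint64_t
to_key_off(uint32_t blkno, uint16_t offnum, uint32_t *off)
{
	uint64_t	tid_i = ((uint64_t) blkno << TID_SHIFT) | offnum;

	*off = tid_i & ((1 << ENCODE_BITS) - 1);
	return tid_i >> ENCODE_BITS;
}

int
main(void)
{
	uint32_t	off;

	/* block 1000, offset 5: tid_i = 1000 * 512 + 5 = 512005 */
	uint64_t	key = to_key_off(1000, 5, &off);

	/* prints "key=8000 off=5"; offsets 0..63 of block 1000 share key 8000 */
	printf("key=%llu off=%u\n", (unsigned long long) key, off);
	return 0;
}
]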