This is not in a reviewable state, but I wanted to get a sketch of a PoC out there and highlight some issues that will need to be tackled. The index scan successfully populates the radix tree, and its contents are verified to match the hash table for the queries present in the regression tests. (Now that I think of it, this patch set might not be buildable as is, since my local tree has two copies of the radix tree template so that TID store doesn't break while I work on this.)
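The cross-check is a straightforward walk in both directions, condensed here from verify_tbm() in patch 5 (a sketch only; it assumes the tbmrt_* radix tree instantiation and the compare_pages() helper added by that patch):

/*
 * Every entry in the hash table must exist in the radix tree with
 * identical contents, and vice versa.
 */
static void
verify_tbm(const TIDBitmap *a)
{
	pagetable_iterator hi;
	PagetableEntry *hpage;
	tbmrt_iter *ti;
	PagetableEntry *rpage;
	uint64		key;

	/* hash -> tree */
	pagetable_start_iterate(a->pagetable, &hi);
	while ((hpage = pagetable_iterate(a->pagetable, &hi)) != NULL)
	{
		rpage = tbmrt_find(a->pagetable2, hpage->blockno);
		if (rpage == NULL)
			elog(NOTICE, "block %u not found in tree", hpage->blockno);
		else
			compare_pages(hpage, rpage);	/* blockno, ischunk, recheck, words[] */
	}

	/* tree -> hash */
	ti = tbmrt_begin_iterate(a->pagetable2);
	while ((rpage = tbmrt_iterate_next(ti, &key)) != NULL)
	{
		if (pagetable_lookup(a->pagetable, rpage->blockno) == NULL)
			elog(NOTICE, "block %u not found in hash", rpage->blockno);
	}
	tbmrt_end_iterate(ti);
}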
First issue: different memory management needs from vacuum. I split RT_SET into two functions, the lower of which is similar to simplehash's "insert" method.

Second issue: TBM intersection. Radix tree iteration assumes that no one will delete keys out from under it. That includes the caller itself! This is not sufficient for intersecting bitmaps. I kluged around it by saving blocks-to-be-deleted in a stack array (see the sketch below), but a proper solution could be to (vigorous handwaving)

1) save per-leaf-node keys in the iterator
2) delete the keys when iteration leaves that node, possibly with a bulk-delete mechanism
3) restart iteration from the top, at the next part of the key space

The above is enough to get rid of the qsort call, but next we'll want to use variable-length bitmaps like vacuum does. This is both to reduce memory use and to allow the length of the line pointer array to be sized independently of MaxHeapTuplesPerPage. That will require getting rid of the iterator array and doing shared iteration over the tree itself. Ideally, this will be done in a way compatible with vacuum, but resolving the differences into new abstractions will be the final step.

--
John Naylor
Amazon Web Services
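The kluge mentioned above, condensed from tbm_intersect() in patch 5 (a sketch only; the fixed-size deletable[] array is exactly the part that proper delete-during-iteration support would remove):

	/*
	 * The radix tree cannot tolerate deletions while it is being iterated,
	 * so remember the keys of now-empty entries and delete them afterwards.
	 */
	tbmrt_iter *it;
	PagetableEntry *apage;
	uint64		key;
	BlockNumber deletable[100000];	/* FIXME: fixed-size stack array */
	int			ndeletable = 0;

	it = tbmrt_begin_iterate(a->pagetable2);
	while ((apage = tbmrt_iterate_next(it, &key)) != NULL)
	{
		/* tbm_intersect_page2() returns true if the page is now empty */
		if (tbm_intersect_page2(a, apage, b))
			deletable[ndeletable++] = apage->blockno;
	}
	tbmrt_end_iterate(it);

	/* only now is it safe to delete */
	for (int i = 0; i < ndeletable; i++)
	{
		if (!tbmrt_delete(a->pagetable2, deletable[i]))
			elog(ERROR, "radix tree corrupted");
	}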
From 678ec6bb7b98204b7d6d3ef9c37773408e90e28c Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 14 Feb 2025 12:49:54 +0700 Subject: [PATCH v1 1/5] Get rid of TBMStatus XXX Speculative, this is for simplification. I'm assuming this was more important when the hash table was dynahash. The one-page optimization is ancient, so it's not even clear how useful it is now. It also doesn't have much coverage in regression tests. If creating a TidStore for a single page has too much overhead, we can create its memory contexts lazily, but that's left for future work. --- src/backend/nodes/tidbitmap.c | 130 ++-------------------------------- 1 file changed, 5 insertions(+), 125 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 41031aa8f2f..112f16c2424 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -106,24 +106,6 @@ typedef struct PTEntryArray PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER]; } PTEntryArray; -/* - * We want to avoid the overhead of creating the hashtable, which is - * comparatively large, when not necessary. Particularly when we are using a - * bitmap scan on the inside of a nestloop join: a bitmap may well live only - * long enough to accumulate one entry in such cases. We therefore avoid - * creating an actual hashtable until we need two pagetable entries. When - * just one pagetable entry is needed, we store it in a fixed field of - * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later - * shrinks down to zero or one page again. So, status can be TBM_HASH even - * when nentries is zero or one.) - */ -typedef enum -{ - TBM_EMPTY, /* no hashtable, nentries == 0 */ - TBM_ONE_PAGE, /* entry1 contains the single entry */ - TBM_HASH, /* pagetable is valid, entry1 is not */ -} TBMStatus; - /* * Current iterating state of the TBM. */ @@ -141,7 +123,6 @@ struct TIDBitmap { NodeTag type; /* to make it a valid Node */ MemoryContext mcxt; /* memory context containing me */ - TBMStatus status; /* see codes above */ struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */ int nentries; /* number of entries in pagetable */ int maxentries; /* limit on same to meet maxbytes */ @@ -149,7 +130,6 @@ struct TIDBitmap int nchunks; /* number of lossy entries in pagetable */ TBMIteratingState iterating; /* tbm_begin_iterate called? */ uint32 lossify_start; /* offset to start lossifying hashtable at */ - PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ /* these are valid when iterating is true: */ PagetableEntry **spages; /* sorted exact-page list, or NULL */ PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ @@ -260,7 +240,8 @@ tbm_create(Size maxbytes, dsa_area *dsa) tbm = makeNode(TIDBitmap); tbm->mcxt = CurrentMemoryContext; - tbm->status = TBM_EMPTY; + Assert(tbm->pagetable == NULL); + tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm); tbm->maxentries = tbm_calculate_entries(maxbytes); tbm->lossify_start = 0; @@ -273,37 +254,6 @@ tbm_create(Size maxbytes, dsa_area *dsa) return tbm; } -/* - * Actually create the hashtable. Since this is a moderately expensive - * proposition, we don't do it until we have to. 
- */ -static void -tbm_create_pagetable(TIDBitmap *tbm) -{ - Assert(tbm->status != TBM_HASH); - Assert(tbm->pagetable == NULL); - - tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm); - - /* If entry1 is valid, push it into the hashtable */ - if (tbm->status == TBM_ONE_PAGE) - { - PagetableEntry *page; - bool found; - char oldstatus; - - page = pagetable_insert(tbm->pagetable, - tbm->entry1.blockno, - &found); - Assert(!found); - oldstatus = page->status; - memcpy(page, &tbm->entry1, sizeof(PagetableEntry)); - page->status = oldstatus; - } - - tbm->status = TBM_HASH; -} - /* * tbm_free - free a TIDBitmap */ @@ -451,14 +401,10 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b) if (b->nentries == 0) return; /* Scan through chunks and pages in b, merge into a */ - if (b->status == TBM_ONE_PAGE) - tbm_union_page(a, &b->entry1); - else { pagetable_iterator i; PagetableEntry *bpage; - Assert(b->status == TBM_HASH); pagetable_start_iterate(b->pagetable, &i); while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL) tbm_union_page(a, bpage); @@ -533,24 +479,10 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b) if (a->nentries == 0) return; /* Scan through chunks and pages in a, try to match to b */ - if (a->status == TBM_ONE_PAGE) - { - if (tbm_intersect_page(a, &a->entry1, b)) - { - /* Page is now empty, remove it from a */ - Assert(!a->entry1.ischunk); - a->npages--; - a->nentries--; - Assert(a->nentries == 0); - a->status = TBM_EMPTY; - } - } - else { pagetable_iterator i; PagetableEntry *apage; - Assert(a->status == TBM_HASH); pagetable_start_iterate(a->pagetable, &i); while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL) { @@ -701,7 +633,7 @@ tbm_begin_private_iterate(TIDBitmap *tbm) * attached to the bitmap not the iterator, so they can be used by more * than one iterator. */ - if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING) + if (tbm->iterating == TBM_NOT_ITERATING) { pagetable_iterator i; PagetableEntry *page; @@ -802,13 +734,10 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm) } /* - * If TBM status is TBM_HASH then iterate over the pagetable and - * convert it to page and chunk arrays. But if it's in the - * TBM_ONE_PAGE mode then directly allocate the space for one entry - * from the DSA. + * iterate over the pagetable and + * convert it to page and chunk arrays. */ npages = nchunks = 0; - if (tbm->status == TBM_HASH) { ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); @@ -825,19 +754,6 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm) Assert(npages == tbm->npages); Assert(nchunks == tbm->nchunks); } - else if (tbm->status == TBM_ONE_PAGE) - { - /* - * In one page mode allocate the space for one pagetable entry, - * initialize it, and directly store its index (i.e. 0) in the - * page array. 
- */ - tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) + - sizeof(PagetableEntry)); - ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable); - memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry)); - ptpages->index[0] = 0; - } if (ptbase != NULL) pg_atomic_init_u32(&ptbase->refcount, 0); @@ -1027,10 +943,6 @@ tbm_private_iterate(TBMPrivateIterator *iterator, TBMIterateResult *tbmres) { PagetableEntry *page; - /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */ - if (tbm->status == TBM_ONE_PAGE) - page = &tbm->entry1; - else page = tbm->spages[iterator->spageptr]; tbmres->internal_page = page; @@ -1177,15 +1089,6 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) if (tbm->nentries == 0) /* in case pagetable doesn't exist */ return NULL; - if (tbm->status == TBM_ONE_PAGE) - { - page = &tbm->entry1; - if (page->blockno != pageno) - return NULL; - Assert(!page->ischunk); - return page; - } - page = pagetable_lookup(tbm->pagetable, pageno); if (page == NULL) return NULL; @@ -1208,24 +1111,7 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) PagetableEntry *page; bool found; - if (tbm->status == TBM_EMPTY) { - /* Use the fixed slot */ - page = &tbm->entry1; - found = false; - tbm->status = TBM_ONE_PAGE; - } - else - { - if (tbm->status == TBM_ONE_PAGE) - { - page = &tbm->entry1; - if (page->blockno == pageno) - return page; - /* Time to switch from one page to a hashtable */ - tbm_create_pagetable(tbm); - } - /* Look up or create an entry */ page = pagetable_insert(tbm->pagetable, pageno, &found); } @@ -1259,7 +1145,6 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) /* we can skip the lookup if there are no lossy chunks */ if (tbm->nchunks == 0) return false; - Assert(tbm->status == TBM_HASH); bitno = pageno % PAGES_PER_CHUNK; chunk_pageno = pageno - bitno; @@ -1293,10 +1178,6 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) int wordnum; int bitnum; - /* We force the bitmap into hashtable mode whenever it's lossy */ - if (tbm->status != TBM_HASH) - tbm_create_pagetable(tbm); - bitno = pageno % PAGES_PER_CHUNK; chunk_pageno = pageno - bitno; @@ -1371,7 +1252,6 @@ tbm_lossify(TIDBitmap *tbm) * just end up doing this again very soon. We shoot for maxentries/2. */ Assert(tbm->iterating == TBM_NOT_ITERATING); - Assert(tbm->status == TBM_HASH); pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start); while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL) -- 2.50.0
From 9eb87637bbd06bbcd1e103e556919f3b55f22a6f Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Sat, 28 Jun 2025 17:46:16 +0700 Subject: [PATCH v1 5/5] Populate radix tree at the same time as hash table and verify match --- src/backend/nodes/tidbitmap.c | 511 +++++++++++++++++++++++++++++++++- src/include/lib/radixtree.h | 2 +- 2 files changed, 506 insertions(+), 7 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 4a5c0e6ad1b..d5926bc1968 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -116,6 +116,14 @@ typedef enum TBM_ITERATING_SHARED, /* converted to shared page and chunk array */ } TBMIteratingState; +#define RT_PREFIX tbmrt +#define RT_SCOPE +#define RT_DECLARE +#define RT_DEFINE +#define RT_VALUE_TYPE PagetableEntry +#define RT_USE_DELETE +#include "lib/radixtree.h" + /* * Here is the representation for a whole TIDBitMap: */ @@ -124,6 +132,8 @@ struct TIDBitmap NodeTag type; /* to make it a valid Node */ MemoryContext mcxt; /* memory context containing me */ struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */ + MemoryContext rt_cxt; /* radix tree memory context */ + tbmrt_radix_tree *pagetable2; int nentries; /* number of entries in pagetable */ int maxentries; /* limit on same to meet maxbytes */ int npages; /* number of exact entries in pagetable */ @@ -133,6 +143,8 @@ struct TIDBitmap /* these are valid when iterating is true: */ PagetableEntry **spages; /* sorted exact-page list, or NULL */ PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ + PagetableEntry **spages2; + PagetableEntry **schunks2; dsa_pointer dsapagetable; /* dsa_pointer to the element array */ dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */ dsa_pointer ptpages; /* dsa_pointer to the page array */ @@ -196,13 +208,21 @@ struct TBMSharedIterator /* Local function prototypes */ static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage); +static void tbm_union_page2(TIDBitmap *a, const PagetableEntry *bpage); static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b); +static bool tbm_intersect_page2(TIDBitmap *a, PagetableEntry *apage, + const TIDBitmap *b); static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno); +static const PagetableEntry *tbm_find_pageentry2(const TIDBitmap *tbm, + BlockNumber pageno); static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno); +static PagetableEntry *tbm_get_pageentry2(TIDBitmap *tbm, BlockNumber pageno); static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno); +static bool tbm_page_is_lossy2(const TIDBitmap *tbm, BlockNumber pageno); static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); +static void tbm_mark_page_lossy2(TIDBitmap *tbm, BlockNumber pageno); static void tbm_lossify(TIDBitmap *tbm); static int tbm_comparator(const void *left, const void *right); static int tbm_shared_comparator(const void *left, const void *right, @@ -240,8 +260,12 @@ tbm_create(Size maxbytes, dsa_area *dsa) tbm = makeNode(TIDBitmap); tbm->mcxt = CurrentMemoryContext; + tbm->rt_cxt = AllocSetContextCreate(tbm->mcxt, + "tidbitmap radix tree", + ALLOCSET_DEFAULT_SIZES); Assert(tbm->pagetable == NULL); tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm); + tbm->pagetable2 = tbmrt_create(tbm->rt_cxt); tbm->maxentries = tbm_calculate_entries(maxbytes); tbm->lossify_start = 0; @@ -262,10 +286,16 @@ tbm_free(TIDBitmap 
*tbm) { if (tbm->pagetable) pagetable_destroy(tbm->pagetable); + if (tbm->pagetable2) + tbmrt_free(tbm->pagetable2); if (tbm->spages) pfree(tbm->spages); if (tbm->schunks) pfree(tbm->schunks); + if (tbm->spages2) + pfree(tbm->spages2); + if (tbm->schunks2) + pfree(tbm->schunks2); pfree(tbm); } @@ -306,6 +336,62 @@ tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp) dsa_free(dsa, dp); } +static void +compare_pages(const PagetableEntry *a, const PagetableEntry *b) +{ + if (a->blockno != b->blockno) + elog(ERROR, "MISMATCH BLK a: %u b: %u", a->blockno, b->blockno); + if (a->ischunk != b->ischunk) + elog(ERROR, "MISMATCH ischunk a: %c b: %c", a->ischunk ? 'C' : '_', b->ischunk ? 'C' : '_'); + if (a->recheck != b->recheck) + elog(ERROR, "MISMATCH recheck a: %c x b: %c", a->recheck ? 'R' : '_', b->recheck ? 'R' : '_'); + + for (int wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) + if (a->words[wordnum] != b->words[wordnum]) + elog(ERROR, " MISMATCH BLOCK %u WORD %d a: %lu b: %lu", a->blockno, wordnum, a->words[wordnum], b->words[wordnum]); +} + +static void +verify_tbm(const TIDBitmap *a) +{ + // for every page in hash, check it exists in tree + { + pagetable_iterator i; + PagetableEntry *hpage; + + pagetable_start_iterate(a->pagetable, &i); + while ((hpage = pagetable_iterate(a->pagetable, &i)) != NULL) + { + PagetableEntry *rpage; + + rpage = tbmrt_find(a->pagetable2, hpage->blockno); + if (rpage == NULL) + elog(NOTICE, " MISMATCH block %u not found in tree", hpage->blockno); + else + compare_pages(hpage, rpage); + } + } + + // for every page in tree, check it exists in hash + { + tbmrt_iter *i; + PagetableEntry *rpage; + uint64 key; + + i = tbmrt_begin_iterate(a->pagetable2); + while ((rpage = tbmrt_iterate_next(i, &key)) != NULL) + { + PagetableEntry *hpage; + + hpage = pagetable_lookup(a->pagetable, rpage->blockno); + if (hpage == NULL) + elog(NOTICE, " MISMATCH block %u not found in hash", rpage->blockno); + else + ; //don't need to compare again + } + } +} + /* * tbm_add_tuples - add some tuple IDs to a TIDBitmap * @@ -318,6 +404,7 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, { BlockNumber currblk = InvalidBlockNumber; PagetableEntry *page = NULL; /* only valid when currblk is valid */ + PagetableEntry *page2 = NULL; int i; Assert(tbm->iterating == TBM_NOT_ITERATING); @@ -340,9 +427,18 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, if (blk != currblk) { if (tbm_page_is_lossy(tbm, blk)) + { page = NULL; /* remember page is lossy */ + + if (!tbm_page_is_lossy2(tbm, blk)) + elog(ERROR, "PAGE %u NOT LOSSY", blk); + } else + { page = tbm_get_pageentry(tbm, blk); + page2 = tbm_get_pageentry2(tbm, blk); + compare_pages(page, page2); + } currblk = blk; } @@ -351,6 +447,8 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, if (page->ischunk) { + if (! 
page2->ischunk) + elog(ERROR, "TREE block %u is not lossy", page2->blockno); /* The page is a lossy chunk header, set bit for itself */ wordnum = bitnum = 0; } @@ -362,6 +460,11 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, } page->words[wordnum] |= ((bitmapword) 1 << bitnum); page->recheck |= recheck; + page2->words[wordnum] |= ((bitmapword) 1 << bitnum); + page2->recheck |= recheck; + + compare_pages(page, page2); + verify_tbm(tbm); if (tbm->nentries > tbm->maxentries) { @@ -383,6 +486,7 @@ tbm_add_page(TIDBitmap *tbm, BlockNumber pageno) { /* Enter the page in the bitmap, or mark it lossy if already present */ tbm_mark_page_lossy(tbm, pageno); + tbm_mark_page_lossy2(tbm, pageno); /* If we went over the memory limit, lossify some more pages */ if (tbm->nentries > tbm->maxentries) tbm_lossify(tbm); @@ -396,6 +500,7 @@ tbm_add_page(TIDBitmap *tbm, BlockNumber pageno) void tbm_union(TIDBitmap *a, const TIDBitmap *b) { +elog(NOTICE, "UNTESTED: UNION"); Assert(!a->iterating); /* Nothing to do if b is empty */ if (b->nentries == 0) @@ -409,6 +514,20 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b) while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL) tbm_union_page(a, bpage); } + + // union radix tree pages + { + tbmrt_iter *i; + PagetableEntry *bpage; + uint64 key; + + i = tbmrt_begin_iterate(b->pagetable2); + while ((bpage = tbmrt_iterate_next(i, &key)) != NULL) + tbm_union_page2(a, bpage); + tbmrt_end_iterate(i); + } + + verify_tbm(a); } /* Process one page of b during a union op */ @@ -466,6 +585,61 @@ tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) tbm_lossify(a); } +/* Process one page of b during a union op */ +static void +tbm_union_page2(TIDBitmap *a, const PagetableEntry *bpage) +{ + PagetableEntry *apage; + int wordnum; + + if (bpage->ischunk) + { + /* Scan b's chunk, mark each indicated page lossy in a */ + for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) + { + bitmapword w = bpage->words[wordnum]; + + if (w != 0) + { + BlockNumber pg; + + pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); + while (w != 0) + { + if (w & 1) + tbm_mark_page_lossy2(a, pg); + pg++; + w >>= 1; + } + } + } + } + else if (tbm_page_is_lossy2(a, bpage->blockno)) + { + /* page is already lossy in a, nothing to do */ + return; + } + else + { + apage = tbm_get_pageentry2(a, bpage->blockno); + if (apage->ischunk) + { + /* The page is a lossy chunk header, set bit for itself */ + apage->words[0] |= ((bitmapword) 1 << 0); + } + else + { + /* Both pages are exact, merge at the bit level */ + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + apage->words[wordnum] |= bpage->words[wordnum]; + apage->recheck |= bpage->recheck; + } + } + + if (a->nentries > a->maxentries) + tbm_lossify(a); +} + /* * tbm_intersect - set intersection * @@ -474,20 +648,42 @@ tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) void tbm_intersect(TIDBitmap *a, const TIDBitmap *b) { + int ndeleted = 0; + int ndeleted2 = 0; + Assert(!a->iterating); /* Nothing to do if a is empty */ if (a->nentries == 0) return; - /* Scan through chunks and pages in a, try to match to b */ + { - pagetable_iterator i; - PagetableEntry *apage; + tbmrt_iter *i; + PagetableEntry *apage2; + uint64 key; - pagetable_start_iterate(a->pagetable, &i); - while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL) + // FIXME: for regression tests, save list of deletable block numbers + // until radix tree can delete during iteration + BlockNumber deletable[100000] = {InvalidBlockNumber}; + + i = 
tbmrt_begin_iterate(a->pagetable2); + while ((apage2 = tbmrt_iterate_next(i, &key)) != NULL) { - if (tbm_intersect_page(a, apage, b)) + bool deleted; + bool deleted2; + PagetableEntry *apage; + + Assert(key == apage2->blockno); + + apage = pagetable_lookup(a->pagetable, apage2->blockno); + compare_pages(apage, apage2); + deleted = tbm_intersect_page(a, apage, b); + deleted2 = tbm_intersect_page2(a, apage2, b); + Assert(deleted == deleted2); + compare_pages(apage, apage2); + + if (deleted) { + ndeleted++; /* Page or chunk is now empty, remove it from a */ if (apage->ischunk) a->nchunks--; @@ -497,8 +693,34 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b) if (!pagetable_delete(a->pagetable, apage->blockno)) elog(ERROR, "hash table corrupted"); } + + if (deleted2) + { + deletable[ndeleted2] = apage2->blockno; + ndeleted2++; + } + else + { + // we'd better find it in the hash table too + if (tbm_find_pageentry(a, apage2->blockno) == NULL) + elog(ERROR, "TREE NOT DELETED BUT NOT IN HASH"); + } + } + tbmrt_end_iterate(i); + + for (int i =0; i<ndeleted2; ++i) + { + Assert(deletable[i] != InvalidBlockNumber); + if (!tbmrt_delete(a->pagetable2, deletable[i])) + elog(ERROR, "radix tree corrupted"); + + // we'd better not find it in the hash table + if (tbm_find_pageentry(a, deletable[i]) != NULL) + elog(ERROR, " TREE DELETED BUT STILL IN HASH"); } } + + verify_tbm(a); } /* @@ -517,6 +739,8 @@ tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) /* Scan each bit in chunk, try to clear */ bool candelete = true; +elog(NOTICE, "UNTESTED: PAGE CHUNK a %u", apage->blockno); + for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) { bitmapword w = apage->words[wordnum]; @@ -553,6 +777,7 @@ tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) } else if (tbm_page_is_lossy(b, apage->blockno)) { +elog(NOTICE, "UNTESTED: HASH FOUND LOSSY b %u", apage->blockno); /* * Some of the tuples in 'a' might not satisfy the quals for 'b', but * because the page 'b' is lossy, we don't know which ones. Therefore @@ -584,6 +809,109 @@ tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) } } +/* + * Process one page of a during an intersection op + * + * Returns true if apage is now empty and should be deleted from a + */ +static bool +tbm_intersect_page2(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b) +{ + const PagetableEntry *bpage; + int wordnum; + + if (apage->ischunk) + { + /* Scan each bit in chunk, try to clear */ + bool candelete = true; + +elog(NOTICE, "UNTESTED: PAGE CHUNK a %u", apage->blockno); + + for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++) + { + bitmapword w = apage->words[wordnum]; + + if (w != 0) + { + bitmapword neww = w; + BlockNumber pg; + int bitnum; + + pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); + bitnum = 0; + while (w != 0) + { + if (w & 1) + { + if (!tbm_page_is_lossy2(b, pg) && + tbm_find_pageentry2(b, pg) == NULL) + { + /* Page is not in b at all, lose lossy bit */ + neww &= ~((bitmapword) 1 << bitnum); + } + } + pg++; + bitnum++; + w >>= 1; + } + apage->words[wordnum] = neww; + if (neww != 0) + candelete = false; + } + } + return candelete; + } + else if (tbm_page_is_lossy2(b, apage->blockno)) + { +elog(NOTICE, "UNTESTED: TREE FOUND LOSSY b %u", apage->blockno); + /* + * Some of the tuples in 'a' might not satisfy the quals for 'b', but + * because the page 'b' is lossy, we don't know which ones. 
Therefore + * we mark 'a' as requiring rechecks, to indicate that at most those + * tuples set in 'a' are matches. + */ + apage->recheck = true; + return false; + } + else + { + bool candelete = true; + + bpage = tbm_find_pageentry2(b, apage->blockno); + if (bpage != NULL) + { + /* Both pages are exact, merge at the bit level */ + Assert(!bpage->ischunk); + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + { + apage->words[wordnum] &= bpage->words[wordnum]; + if (apage->words[wordnum] != 0) + candelete = false; + } + apage->recheck |= bpage->recheck; + + // does it match what we just did in the hash table? + if (pagetable_lookup(a->pagetable, apage->blockno) != NULL) + { + compare_pages(apage, pagetable_lookup(a->pagetable, apage->blockno)); + } + else + { + // if the hash entry is not there it should have been deleted + // check that we did the same + Assert(candelete); + } + } + else + { + // we'd better not find it in the hash table + Assert(pagetable_lookup(b->pagetable, apage->blockno) == NULL); + } + /* If there is no matching b page, we can just delete the a page */ + return candelete; + } +} + /* * tbm_is_empty - is a TIDBitmap completely empty? */ @@ -637,17 +965,40 @@ tbm_begin_private_iterate(TIDBitmap *tbm) { pagetable_iterator i; PagetableEntry *page; + PagetableEntry *page2; + tbmrt_iter *iter = NULL; + uint64 key; int npages; int nchunks; + int npages2; + int nchunks2; if (!tbm->spages && tbm->npages > 0) + { tbm->spages = (PagetableEntry **) MemoryContextAlloc(tbm->mcxt, tbm->npages * sizeof(PagetableEntry *)); + } + if (!tbm->spages2 && tbm->pagetable2->ctl->num_keys > 0) + { + // WIP: just allocate number of keys + tbm->spages2 = (PagetableEntry **) + MemoryContextAlloc(tbm->mcxt, + tbm->pagetable2->ctl->num_keys * sizeof(PagetableEntry *)); + } if (!tbm->schunks && tbm->nchunks > 0) + { tbm->schunks = (PagetableEntry **) MemoryContextAlloc(tbm->mcxt, tbm->nchunks * sizeof(PagetableEntry *)); + } + if (!tbm->schunks2 && tbm->pagetable2->ctl->num_keys > 0) + { + // WIP: just allocate number of keys + tbm->schunks2 = (PagetableEntry **) + MemoryContextAlloc(tbm->mcxt, + tbm->pagetable2->ctl->num_keys * sizeof(PagetableEntry *)); + } npages = nchunks = 0; pagetable_start_iterate(tbm->pagetable, &i); @@ -666,6 +1017,35 @@ tbm_begin_private_iterate(TIDBitmap *tbm) if (nchunks > 1) qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator); + + // same for radix tree + npages2 = nchunks2 = 0; + iter = tbmrt_begin_iterate(tbm->pagetable2); + while(true) + { + page2 = tbmrt_iterate_next(iter, &key); + if (page2 == NULL) + break; + + if (page2->ischunk) + tbm->schunks2[nchunks2++] = page2; + else + tbm->spages2[npages2++] = page2; + } + tbmrt_end_iterate(iter); + + if (npages == npages2) + { + for (int i=0; i<npages; i++) + { + compare_pages(tbm->spages[i], tbm->spages2[i]); + } + } + else + { + elog(NOTICE, "LENGTH npages: %d npages2: %d", npages, npages2); + elog(NOTICE, "LENGTH nchunks: %d nchunks2: %d\n", nchunks, nchunks2); + } } tbm->iterating = TBM_ITERATING_PRIVATE; @@ -1097,6 +1477,22 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) return page; } +static const PagetableEntry * +tbm_find_pageentry2(const TIDBitmap *tbm, BlockNumber pageno) +{ + const PagetableEntry *page; + +// if (tbm->nentries == 0) /* in case pagetable doesn't exist */ +// return NULL; + + page = tbmrt_find(tbm->pagetable2, pageno); + if (page == NULL) + return NULL; + if (page->ischunk) + return NULL; /* don't want a lossy chunk header */ + return page; +} + /* * 
tbm_get_pageentry - find or create a PagetableEntry for the pageno * @@ -1132,6 +1528,33 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) return page; } +static PagetableEntry * +tbm_get_pageentry2(TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + bool found; + + /* Look up or create an entry */ + page = tbmrt_find(tbm->pagetable2, pageno); + + /* Initialize it if not present before */ + if (page == NULL) + { + PagetableEntry ** slot; + + page = tbmrt_alloc_leaf(tbm->pagetable2, sizeof(PagetableEntry)); + slot = tbmrt_get_slot(tbm->pagetable2, pageno, &found); + *slot = page; + Assert(!found); + + MemSet(page, 0, sizeof(PagetableEntry)); + page->blockno = pageno; + /* don't count it again */ + } + + return page; +} + /* * tbm_page_is_lossy - is the page marked as lossily stored? */ @@ -1162,6 +1585,33 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) return false; } +static bool +tbm_page_is_lossy2(const TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + BlockNumber chunk_pageno; + int bitno; + + /* we can skip the lookup if there are no lossy chunks */ +// if (tbm->nchunks == 0) +// return false; + + bitno = pageno % PAGES_PER_CHUNK; + chunk_pageno = pageno - bitno; + + page = tbmrt_find(tbm->pagetable2, chunk_pageno); + + if (page != NULL && page->ischunk) + { + int wordnum = WORDNUM(bitno); + int bitnum = BITNUM(bitno); + + if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) + return true; + } + return false; +} + /* * tbm_mark_page_lossy - mark the page number as lossily stored * @@ -1193,6 +1643,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) tbm->nentries--; tbm->npages--; /* assume it must have been non-lossy */ } + tbmrt_delete(tbm->pagetable2, pageno); } /* Look up or create entry for chunk-header page */ @@ -1233,6 +1684,53 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) page->words[wordnum] |= ((bitmapword) 1 << bitnum); } +static void +tbm_mark_page_lossy2(TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page2; + bool found; + BlockNumber chunk_pageno; + int bitno; + int wordnum; + int bitnum; + + bitno = pageno % PAGES_PER_CHUNK; + chunk_pageno = pageno - bitno; + + page2 = tbmrt_find(tbm->pagetable2, pageno); + + /* Initialize it if not present before */ + if (page2 == NULL) + { + PagetableEntry ** slot; + + page2 = tbmrt_alloc_leaf(tbm->pagetable2, sizeof(PagetableEntry)); + slot = tbmrt_get_slot(tbm->pagetable2, pageno, &found); + *slot = page2; + Assert(!found); + + MemSet(page2, 0, sizeof(PagetableEntry)); + page2->blockno = chunk_pageno; + page2->ischunk = true; + /* don't count it again */ + } + else if (!page2->ischunk) + { + /* chunk header page was formerly non-lossy, make it lossy */ + MemSet(page2, 0, sizeof(PagetableEntry)); + page2->blockno = chunk_pageno; + page2->ischunk = true; + /* we assume it had some tuple bit(s) set, so mark it lossy */ + page2->words[0] = ((bitmapword) 1 << 0); + /* don't adjust counts */ + } + + /* Now set the original target page's bit */ + wordnum = WORDNUM(bitno); + bitnum = BITNUM(bitno); + page2->words[wordnum] |= ((bitmapword) 1 << bitnum); +} + /* * tbm_lossify - lose some information to get back under the memory limit */ @@ -1270,6 +1768,7 @@ tbm_lossify(TIDBitmap *tbm) /* This does the dirty work ... 
*/ tbm_mark_page_lossy(tbm, page->blockno); + //tbm_mark_page_lossy2(tbm, page->blockno); if (tbm->nentries <= tbm->maxentries / 2) { diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index 86647eaae93..61ebe0f3304 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -405,7 +405,7 @@ typedef struct RT_NODE #define RT_INVALID_PTR_ALLOC InvalidDsaPointer #define RT_PTR_ALLOC_IS_VALID(ptr) DsaPointerIsValid(ptr) #else -#define RT_PTR_ALLOC RT_NODE * +#define RT_PTR_ALLOC void * #define RT_INVALID_PTR_ALLOC NULL #define RT_PTR_ALLOC_IS_VALID(ptr) PointerIsValid(ptr) #endif -- 2.50.0
From 2e055fb0ae82cf5a288cb84e8f0211b7faed6395 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 14 Feb 2025 14:30:14 +0700 Subject: [PATCH v1 2/5] TEMP disable tbm_lossify() --- src/backend/nodes/tidbitmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 112f16c2424..4a5c0e6ad1b 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -1239,6 +1239,8 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) static void tbm_lossify(TIDBitmap *tbm) { + +#if 0 pagetable_iterator i; PagetableEntry *page; @@ -1299,6 +1301,7 @@ tbm_lossify(TIDBitmap *tbm) */ if (tbm->nentries > tbm->maxentries / 2) tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2; +#endif } /* -- 2.50.0
From 8108fc3712ebf01404e52fd69cea2c559d028984 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 21 Feb 2025 14:14:07 +0700 Subject: [PATCH v1 3/5] Turn RT_SET into wrapper that calls to RT_GET_SLOT and RT_SET_SLOT RT_GET_SLOT: helper function for finding or creating a value slot in the tree RT_SET_SLOT: update the value in place --- src/include/lib/radixtree.h | 61 ++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index a75b77270c4..b0902988ee1 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -183,6 +183,8 @@ #define RT_LOCK_SHARE RT_MAKE_NAME(lock_share) #define RT_UNLOCK RT_MAKE_NAME(unlock) #endif +#define RT_GET_SLOT RT_MAKE_NAME (get_slot) +#define RT_SET_SLOT RT_MAKE_NAME (set_slot) #define RT_SET RT_MAKE_NAME(set) #define RT_BEGIN_ITERATE RT_MAKE_NAME(begin_iterate) #define RT_ITERATE_NEXT RT_MAKE_NAME(iterate_next) @@ -288,6 +290,8 @@ RT_SCOPE RT_RADIX_TREE *RT_CREATE(MemoryContext ctx); RT_SCOPE void RT_FREE(RT_RADIX_TREE * tree); RT_SCOPE RT_VALUE_TYPE *RT_FIND(RT_RADIX_TREE * tree, uint64 key); +RT_SCOPE void* RT_GET_SLOT(RT_RADIX_TREE * tree, uint64 key, bool* found); +RT_SCOPE bool RT_SET_SLOT(RT_RADIX_TREE * tree, void* slot_v, RT_VALUE_TYPE * value_p, bool found); RT_SCOPE bool RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p); #ifdef RT_USE_DELETE @@ -1692,18 +1696,10 @@ RT_GET_SLOT_RECURSIVE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 k } } -/* - * Set key to value that value_p points to. If the entry already exists, we - * update its value and return true. Returns false if entry doesn't yet exist. - * - * Taking a lock in exclusive mode is the caller's responsibility. - */ -RT_SCOPE bool -RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p) +RT_SCOPE void * +RT_GET_SLOT(RT_RADIX_TREE * tree, uint64 key, bool * found) { - bool found; RT_PTR_ALLOC *slot; - size_t value_sz = RT_GET_VALUE_SIZE(value_p); #ifdef RT_SHMEM Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC); @@ -1733,7 +1729,8 @@ RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p) n4->chunks[0] = RT_GET_KEY_CHUNK(key, start_shift); slot = RT_EXTEND_DOWN(tree, &n4->children[0], key, start_shift); - found = false; + *found = false; + tree->ctl->start_shift = start_shift; tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(start_shift); goto have_slot; @@ -1743,11 +1740,30 @@ RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p) } slot = RT_GET_SLOT_RECURSIVE(tree, &tree->ctl->root, - key, tree->ctl->start_shift, &found); + key, tree->ctl->start_shift, found); have_slot: Assert(slot != NULL); + /* Update the statistics */ + if (!*found) + tree->ctl->num_keys++; + + return slot; +} + +/* + * Set key to value that value_p points to. If the entry already exists, we + * update its value and return true. Returns false if entry doesn't yet exist. + * + * Taking a lock in exclusive mode is the caller's responsibility. + */ +RT_SCOPE bool +RT_SET_SLOT(RT_RADIX_TREE * tree, void * slot_v, RT_VALUE_TYPE * value_p, bool found) +{ + RT_PTR_ALLOC * slot = (RT_PTR_ALLOC *) (slot_v); + size_t value_sz = RT_GET_VALUE_SIZE(value_p); + if (RT_VALUE_IS_EMBEDDABLE(value_p)) { /* free the existing leaf */ @@ -1797,10 +1813,23 @@ have_slot: memcpy(leaf.local, value_p, value_sz); } - /* Update the statistics */ - if (!found) - tree->ctl->num_keys++; + return found; +} + +/* + * Set key to value that value_p points to. 
If the entry already exists, we + * update its value and return true. Returns false if entry doesn't yet exist. + * + * Taking a lock in exclusive mode is the caller's responsibility. + */ +RT_SCOPE bool +RT_SET(RT_RADIX_TREE * tree, uint64 key, RT_VALUE_TYPE * value_p) +{ + bool found; + void * slot; + slot = RT_GET_SLOT(tree, key, &found); + RT_SET_SLOT(tree, slot, value_p, found); return found; } @@ -2967,6 +2996,8 @@ RT_DUMP_NODE(RT_NODE * node) #undef RT_UNLOCK #undef RT_GET_HANDLE #undef RT_FIND +#undef RT_GET_SLOT +#undef RT_SET_SLOT #undef RT_SET #undef RT_BEGIN_ITERATE #undef RT_ITERATE_NEXT -- 2.50.0
From cff35e2c4b57d2db81686cbb17f86b65430372fc Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Thu, 1 May 2025 00:38:43 +0700 Subject: [PATCH v1 4/5] Get rid of RT_CHILD_PTR * Change RT_PTR_SET_LOCAL to RT_PTR_GET_LOCAL * Split out the node initialization functionality of RT_ALLOC_NODE to RT_INIT_NODE_* functions This was orignally part of a series for tagging child pointers with the node kind. The purpose for bitmap heap scan is to allow callers to allotate their own memory. Likely not all of this is strictly necessary. --- src/include/lib/radixtree.h | 446 ++++++++++++++++++------------------ 1 file changed, 225 insertions(+), 221 deletions(-) diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index b0902988ee1..86647eaae93 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -202,6 +202,7 @@ #define RT_GET_SLOT_RECURSIVE RT_MAKE_NAME(get_slot_recursive) #define RT_DELETE_RECURSIVE RT_MAKE_NAME(delete_recursive) #define RT_ALLOC_NODE RT_MAKE_NAME(alloc_node) +#define RT_INIT_NODE RT_MAKE_NAME(init_node) #define RT_ALLOC_LEAF RT_MAKE_NAME(alloc_leaf) #define RT_FREE_NODE RT_MAKE_NAME(free_node) #define RT_FREE_LEAF RT_MAKE_NAME(free_leaf) @@ -209,7 +210,7 @@ #define RT_EXTEND_UP RT_MAKE_NAME(extend_up) #define RT_EXTEND_DOWN RT_MAKE_NAME(extend_down) #define RT_COPY_COMMON RT_MAKE_NAME(copy_common) -#define RT_PTR_SET_LOCAL RT_MAKE_NAME(ptr_set_local) +#define RT_PTR_GET_LOCAL RT_MAKE_NAME(ptr_get_local) #define RT_NODE_16_SEARCH_EQ RT_MAKE_NAME(node_16_search_eq) #define RT_NODE_4_GET_INSERTPOS RT_MAKE_NAME(node_4_get_insertpos) #define RT_NODE_16_GET_INSERTPOS RT_MAKE_NAME(node_16_get_insertpos) @@ -251,7 +252,6 @@ #define RT_HANDLE RT_MAKE_NAME(handle) #endif #define RT_NODE RT_MAKE_NAME(node) -#define RT_CHILD_PTR RT_MAKE_NAME(child_ptr) #define RT_NODE_ITER RT_MAKE_NAME(node_iter) #define RT_NODE_4 RT_MAKE_NAME(node_4) #define RT_NODE_16 RT_MAKE_NAME(node_16) @@ -410,21 +410,6 @@ typedef struct RT_NODE #define RT_PTR_ALLOC_IS_VALID(ptr) PointerIsValid(ptr) #endif -/* - * A convenience type used when we need to work with a DSA pointer as well - * as its local pointer. For local memory, both members are the same, so - * we use a union. - */ -#ifdef RT_SHMEM -typedef struct RT_CHILD_PTR -#else -typedef union RT_CHILD_PTR -#endif -{ - RT_PTR_ALLOC alloc; - RT_NODE *local; -} RT_CHILD_PTR; - /* * Helper macros and functions for value storage. @@ -733,7 +718,8 @@ struct RT_RADIX_TREE /* state for iterating over a single node */ typedef struct RT_NODE_ITER { - RT_CHILD_PTR node; + RT_PTR_ALLOC nodealloc; + RT_NODE *node; /* * The next index of the chunk array in RT_NODE_KIND_4 and RT_NODE_KIND_16 @@ -764,11 +750,13 @@ struct RT_ITER /* verification (available only in assert-enabled builds) */ static void RT_VERIFY_NODE(RT_NODE * node); -static inline void -RT_PTR_SET_LOCAL(RT_RADIX_TREE * tree, RT_CHILD_PTR * node) +static inline RT_NODE * +RT_PTR_GET_LOCAL(RT_RADIX_TREE * tree, RT_PTR_ALLOC nodealloc) { #ifdef RT_SHMEM - node->local = dsa_get_address(tree->dsa, node->alloc); + return dsa_get_address(tree->dsa, nodealloc); +#else + return nodealloc; #endif } @@ -828,30 +816,35 @@ RT_SHIFT_GET_MAX_VAL(int shift) return (UINT64CONST(1) << (shift + RT_SPAN)) - 1; } -/* - * Allocate a new node with the given node kind and size class. 
- */ -static inline RT_CHILD_PTR -RT_ALLOC_NODE(RT_RADIX_TREE * tree, const uint8 kind, const RT_SIZE_CLASS size_class) +static inline RT_PTR_ALLOC +RT_ALLOC_NODE(RT_RADIX_TREE * tree, const RT_SIZE_CLASS size_class) { - RT_CHILD_PTR allocnode; - RT_NODE *node; size_t allocsize; + RT_PTR_ALLOC nodealloc; allocsize = RT_SIZE_CLASS_INFO[size_class].allocsize; #ifdef RT_SHMEM - allocnode.alloc = dsa_allocate(tree->dsa, allocsize); + nodealloc = dsa_allocate(tree->dsa, allocsize); #else - allocnode.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->node_slabs[size_class], - allocsize); + nodealloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->node_slabs[size_class], + allocsize); #endif - RT_PTR_SET_LOCAL(tree, &allocnode); - node = allocnode.local; +#ifdef RT_DEBUG + /* update the statistics */ + tree->ctl->num_nodes[size_class]++; +#endif - /* initialize contents */ + return nodealloc; +} +/* + * Initialize the given node with the necessary metadata. + */ +static inline void +RT_INIT_NODE(RT_NODE * node, const uint8 kind, const RT_SIZE_CLASS size_class) +{ switch (kind) { case RT_NODE_KIND_4: @@ -882,35 +875,27 @@ RT_ALLOC_NODE(RT_RADIX_TREE * tree, const uint8 kind, const RT_SIZE_CLASS size_c * because that node doesn't need to introspect this value. */ node->fanout = RT_SIZE_CLASS_INFO[size_class].fanout; - -#ifdef RT_DEBUG - /* update the statistics */ - tree->ctl->num_nodes[size_class]++; -#endif - - return allocnode; } /* * Allocate a new leaf. */ -static RT_CHILD_PTR +static RT_PTR_ALLOC RT_ALLOC_LEAF(RT_RADIX_TREE * tree, size_t allocsize) { - RT_CHILD_PTR leaf; + RT_PTR_ALLOC leafalloc; #ifdef RT_SHMEM - leaf.alloc = dsa_allocate(tree->dsa, allocsize); - RT_PTR_SET_LOCAL(tree, &leaf); + leafalloc = dsa_allocate(tree->dsa, allocsize); #else - leaf.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->leaf_context, allocsize); + leafalloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->leaf_context, allocsize); #endif #ifdef RT_DEBUG tree->ctl->num_leaves++; #endif - return leaf; + return leafalloc; } /* @@ -918,23 +903,24 @@ RT_ALLOC_LEAF(RT_RADIX_TREE * tree, size_t allocsize) * This is a separate function in case other fields are added. 
*/ static inline void -RT_COPY_COMMON(RT_CHILD_PTR newnode, RT_CHILD_PTR oldnode) +RT_COPY_COMMON(RT_NODE * newnode, RT_NODE * oldnode) { - (newnode.local)->count = (oldnode.local)->count; + newnode->count = oldnode->count; } /* Free the given node */ static void -RT_FREE_NODE(RT_RADIX_TREE * tree, RT_CHILD_PTR node) +RT_FREE_NODE(RT_RADIX_TREE * tree, RT_PTR_ALLOC nodealloc) { #ifdef RT_DEBUG int i; + RT_NODE *node = RT_PTR_GET_LOCAL(tree, nodealloc); /* update the statistics */ for (i = 0; i < RT_NUM_SIZE_CLASSES; i++) { - if ((node.local)->fanout == RT_SIZE_CLASS_INFO[i].fanout) + if (node->fanout == RT_SIZE_CLASS_INFO[i].fanout) break; } @@ -950,9 +936,9 @@ RT_FREE_NODE(RT_RADIX_TREE * tree, RT_CHILD_PTR node) #endif #ifdef RT_SHMEM - dsa_free(tree->dsa, node.alloc); + dsa_free(tree->dsa, nodealloc); #else - pfree(node.alloc); + pfree(nodealloc); #endif } @@ -1094,7 +1080,8 @@ RT_NODE_SEARCH(RT_NODE * node, uint8 chunk) RT_SCOPE RT_VALUE_TYPE * RT_FIND(RT_RADIX_TREE * tree, uint64 key) { - RT_CHILD_PTR node; + RT_PTR_ALLOC nodealloc; + RT_NODE *node; RT_PTR_ALLOC *slot = NULL; int shift; @@ -1106,18 +1093,18 @@ RT_FIND(RT_RADIX_TREE * tree, uint64 key) return NULL; Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root)); - node.alloc = tree->ctl->root; + nodealloc = tree->ctl->root; shift = tree->ctl->start_shift; /* Descend the tree */ while (shift >= 0) { - RT_PTR_SET_LOCAL(tree, &node); - slot = RT_NODE_SEARCH(node.local, RT_GET_KEY_CHUNK(key, shift)); + node = RT_PTR_GET_LOCAL(tree, nodealloc); + slot = RT_NODE_SEARCH(node, RT_GET_KEY_CHUNK(key, shift)); if (slot == NULL) return NULL; - node.alloc = *slot; + nodealloc = *slot; shift -= RT_SPAN; } @@ -1125,8 +1112,8 @@ RT_FIND(RT_RADIX_TREE * tree, uint64 key) return (RT_VALUE_TYPE *) slot; else { - RT_PTR_SET_LOCAL(tree, &node); - return (RT_VALUE_TYPE *) node.local; + node = RT_PTR_GET_LOCAL(tree, nodealloc); + return (RT_VALUE_TYPE *) node; } } @@ -1270,9 +1257,8 @@ RT_COPY_ARRAYS_FOR_INSERT(uint8 *dst_chunks, RT_PTR_ALLOC * dst_children, } static inline RT_PTR_ALLOC * -RT_ADD_CHILD_256(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) +RT_ADD_CHILD_256(RT_RADIX_TREE * tree, RT_NODE_256 * n256, uint8 chunk) { - RT_NODE_256 *n256 = (RT_NODE_256 *) node.local; int idx = RT_BM_IDX(chunk); int bitnum = RT_BM_BIT(chunk); @@ -1286,20 +1272,22 @@ RT_ADD_CHILD_256(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) } static pg_noinline RT_PTR_ALLOC * -RT_GROW_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, +RT_GROW_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_48 * n48, uint8 chunk) { - RT_NODE_48 *n48 = (RT_NODE_48 *) node.local; - RT_CHILD_PTR newnode; + RT_PTR_ALLOC newalloc; + RT_NODE *newnode; RT_NODE_256 *new256; int i = 0; /* initialize new node */ - newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_256, RT_CLASS_256); - new256 = (RT_NODE_256 *) newnode.local; + newalloc = RT_ALLOC_NODE(tree, RT_CLASS_256); + newnode = RT_PTR_GET_LOCAL(tree, newalloc); + RT_INIT_NODE(newnode, RT_NODE_KIND_256, RT_CLASS_256); + new256 = (RT_NODE_256 *) newnode; /* copy over the entries */ - RT_COPY_COMMON(newnode, node); + RT_COPY_COMMON((RT_NODE *) new256, (RT_NODE *) n48); for (int word_num = 0; word_num < RT_BM_IDX(RT_NODE_MAX_SLOTS); word_num++) { bitmapword bitmap = 0; @@ -1326,16 +1314,15 @@ RT_GROW_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR n } /* free old node and update reference in parent */ - *parent_slot = newnode.alloc; - RT_FREE_NODE(tree, node); + RT_FREE_NODE(tree, 
*parent_slot); + *parent_slot = newalloc; - return RT_ADD_CHILD_256(tree, newnode, chunk); + return RT_ADD_CHILD_256(tree, new256, chunk); } static inline RT_PTR_ALLOC * -RT_ADD_CHILD_48(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) +RT_ADD_CHILD_48(RT_RADIX_TREE * tree, RT_NODE_48 * n48, uint8 chunk) { - RT_NODE_48 *n48 = (RT_NODE_48 *) node.local; int insertpos; int idx = 0; bitmapword w, @@ -1371,25 +1358,27 @@ RT_ADD_CHILD_48(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) } static pg_noinline RT_PTR_ALLOC * -RT_GROW_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, +RT_GROW_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_16 * n16, uint8 chunk) { - RT_NODE_16 *n16 = (RT_NODE_16 *) node.local; int insertpos; if (n16->base.fanout < RT_FANOUT_16_HI) { - RT_CHILD_PTR newnode; + RT_PTR_ALLOC newalloc; + RT_NODE *newnode; RT_NODE_16 *new16; Assert(n16->base.fanout == RT_FANOUT_16_LO); /* initialize new node */ - newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_HI); - new16 = (RT_NODE_16 *) newnode.local; + newalloc = RT_ALLOC_NODE(tree, RT_CLASS_16_HI); + newnode = RT_PTR_GET_LOCAL(tree, newalloc); + RT_INIT_NODE(newnode, RT_NODE_KIND_16, RT_CLASS_16_HI); + new16 = (RT_NODE_16 *) newnode; /* copy over existing entries */ - RT_COPY_COMMON(newnode, node); + RT_COPY_COMMON((RT_NODE *) new16, (RT_NODE *) n16); Assert(n16->base.count == RT_FANOUT_16_LO); insertpos = RT_NODE_16_GET_INSERTPOS(n16, chunk); RT_COPY_ARRAYS_FOR_INSERT(new16->chunks, new16->children, @@ -1403,14 +1392,15 @@ RT_GROW_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR n RT_VERIFY_NODE((RT_NODE *) new16); /* free old node and update references */ - RT_FREE_NODE(tree, node); - *parent_slot = newnode.alloc; + RT_FREE_NODE(tree, *parent_slot); + *parent_slot = newalloc; return &new16->children[insertpos]; } else { - RT_CHILD_PTR newnode; + RT_PTR_ALLOC newalloc; + RT_NODE *newnode; RT_NODE_48 *new48; int idx, bit; @@ -1418,11 +1408,13 @@ RT_GROW_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR n Assert(n16->base.fanout == RT_FANOUT_16_HI); /* initialize new node */ - newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48); - new48 = (RT_NODE_48 *) newnode.local; + newalloc = RT_ALLOC_NODE(tree, RT_CLASS_48); + newnode = RT_PTR_GET_LOCAL(tree, newalloc); + RT_INIT_NODE(newnode, RT_NODE_KIND_48, RT_CLASS_48); + new48 = (RT_NODE_48 *) newnode; /* copy over the entries */ - RT_COPY_COMMON(newnode, node); + RT_COPY_COMMON((RT_NODE *) new48, (RT_NODE *) n16); for (int i = 0; i < RT_FANOUT_16_HI; i++) new48->slot_idxs[n16->chunks[i]] = i; memcpy(&new48->children[0], &n16->children[0], RT_FANOUT_16_HI * sizeof(new48->children[0])); @@ -1450,17 +1442,16 @@ RT_GROW_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR n RT_VERIFY_NODE((RT_NODE *) new48); /* free old node and update reference in parent */ - *parent_slot = newnode.alloc; - RT_FREE_NODE(tree, node); + RT_FREE_NODE(tree, *parent_slot); + *parent_slot = newalloc; return &new48->children[insertpos]; } } static inline RT_PTR_ALLOC * -RT_ADD_CHILD_16(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) +RT_ADD_CHILD_16(RT_RADIX_TREE * tree, RT_NODE_16 * n16, uint8 chunk) { - RT_NODE_16 *n16 = (RT_NODE_16 *) node.local; int insertpos = RT_NODE_16_GET_INSERTPOS(n16, chunk); /* shift chunks and children */ @@ -1477,20 +1468,22 @@ RT_ADD_CHILD_16(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) } static pg_noinline RT_PTR_ALLOC * 
-RT_GROW_NODE_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, +RT_GROW_NODE_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_4 * n4, uint8 chunk) { - RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local); - RT_CHILD_PTR newnode; + RT_PTR_ALLOC newalloc; + RT_NODE *newnode; RT_NODE_16 *new16; int insertpos; /* initialize new node */ - newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_LO); - new16 = (RT_NODE_16 *) newnode.local; + newalloc = RT_ALLOC_NODE(tree, RT_CLASS_16_LO); + newnode = RT_PTR_GET_LOCAL(tree, newalloc); + RT_INIT_NODE(newnode, RT_NODE_KIND_16, RT_CLASS_16_LO); + new16 = (RT_NODE_16 *) newnode; /* copy over existing entries */ - RT_COPY_COMMON(newnode, node); + RT_COPY_COMMON((RT_NODE *) new16, (RT_NODE *) n4); Assert(n4->base.count == RT_FANOUT_4); insertpos = RT_NODE_4_GET_INSERTPOS(n4, chunk, RT_FANOUT_4); RT_COPY_ARRAYS_FOR_INSERT(new16->chunks, new16->children, @@ -1504,16 +1497,15 @@ RT_GROW_NODE_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR no RT_VERIFY_NODE((RT_NODE *) new16); /* free old node and update reference in parent */ - *parent_slot = newnode.alloc; - RT_FREE_NODE(tree, node); + RT_FREE_NODE(tree, *parent_slot); + *parent_slot = newalloc; return &new16->children[insertpos]; } static inline RT_PTR_ALLOC * -RT_ADD_CHILD_4(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) +RT_ADD_CHILD_4(RT_RADIX_TREE * tree, RT_NODE_4 * n4, uint8 chunk) { - RT_NODE_4 *n4 = (RT_NODE_4 *) (node.local); int count = n4->base.count; int insertpos = RT_NODE_4_GET_INSERTPOS(n4, chunk, count); @@ -1539,36 +1531,40 @@ RT_ADD_CHILD_4(RT_RADIX_TREE * tree, RT_CHILD_PTR node, uint8 chunk) * child pointer with the pointer to the new larger node. */ static inline RT_PTR_ALLOC * -RT_NODE_INSERT(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, +RT_NODE_INSERT(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE * n, uint8 chunk) { - RT_NODE *n = node.local; - switch (n->kind) { case RT_NODE_KIND_4: { + RT_NODE_4 *n4 = (RT_NODE_4 *) n; + if (unlikely(RT_NODE_MUST_GROW(n))) - return RT_GROW_NODE_4(tree, parent_slot, node, chunk); + return RT_GROW_NODE_4(tree, parent_slot, n4, chunk); - return RT_ADD_CHILD_4(tree, node, chunk); + return RT_ADD_CHILD_4(tree, n4, chunk); } case RT_NODE_KIND_16: { + RT_NODE_16 *n16 = (RT_NODE_16 *) n; + if (unlikely(RT_NODE_MUST_GROW(n))) - return RT_GROW_NODE_16(tree, parent_slot, node, chunk); + return RT_GROW_NODE_16(tree, parent_slot, n16, chunk); - return RT_ADD_CHILD_16(tree, node, chunk); + return RT_ADD_CHILD_16(tree, n16, chunk); } case RT_NODE_KIND_48: { + RT_NODE_48 *n48 = (RT_NODE_48 *) n; + if (unlikely(RT_NODE_MUST_GROW(n))) - return RT_GROW_NODE_48(tree, parent_slot, node, chunk); + return RT_GROW_NODE_48(tree, parent_slot, n48, chunk); - return RT_ADD_CHILD_48(tree, node, chunk); + return RT_ADD_CHILD_48(tree, (RT_NODE_48 *) n, chunk); } case RT_NODE_KIND_256: - return RT_ADD_CHILD_256(tree, node, chunk); + return RT_ADD_CHILD_256(tree, (RT_NODE_256 *) n, chunk); default: pg_unreachable(); } @@ -1589,17 +1585,20 @@ RT_EXTEND_UP(RT_RADIX_TREE * tree, uint64 key) /* Grow tree upwards until start shift can accommodate the key */ while (shift < target_shift) { - RT_CHILD_PTR node; + RT_PTR_ALLOC nodealloc; + RT_NODE *node; RT_NODE_4 *n4; - node = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4); - n4 = (RT_NODE_4 *) node.local; + nodealloc = RT_ALLOC_NODE(tree, RT_CLASS_4); + node = RT_PTR_GET_LOCAL(tree, nodealloc); + RT_INIT_NODE(node, RT_NODE_KIND_4, RT_CLASS_4); + n4 = 
(RT_NODE_4 *) node; n4->base.count = 1; n4->chunks[0] = 0; n4->children[0] = tree->ctl->root; /* Update the root */ - tree->ctl->root = node.alloc; + tree->ctl->root = nodealloc; shift += RT_SPAN; } @@ -1616,29 +1615,34 @@ RT_EXTEND_UP(RT_RADIX_TREE * tree, uint64 key) static pg_noinline RT_PTR_ALLOC * RT_EXTEND_DOWN(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift) { - RT_CHILD_PTR node, - child; + RT_PTR_ALLOC childalloc; + RT_NODE *child; + RT_NODE *node; RT_NODE_4 *n4; /* * The child pointer of the first node in the chain goes in the * caller-provided slot. */ - child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4); - *parent_slot = child.alloc; + childalloc = RT_ALLOC_NODE(tree, RT_CLASS_4); + *parent_slot = childalloc; + child = RT_PTR_GET_LOCAL(tree, childalloc); + RT_INIT_NODE(child, RT_NODE_KIND_4, RT_CLASS_4); node = child; shift -= RT_SPAN; while (shift > 0) { - child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4); + childalloc = RT_ALLOC_NODE(tree, RT_CLASS_4); + child = RT_PTR_GET_LOCAL(tree, childalloc); + RT_INIT_NODE(child, RT_NODE_KIND_4, RT_CLASS_4); /* We open-code the insertion ourselves, for speed. */ - n4 = (RT_NODE_4 *) node.local; + n4 = (RT_NODE_4 *) node; n4->base.count = 1; n4->chunks[0] = RT_GET_KEY_CHUNK(key, shift); - n4->children[0] = child.alloc; + n4->children[0] = childalloc; node = child; shift -= RT_SPAN; @@ -1646,7 +1650,7 @@ RT_EXTEND_DOWN(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int Assert(shift == 0); /* Reserve slot for the value. */ - n4 = (RT_NODE_4 *) node.local; + n4 = (RT_NODE_4 *) node; n4->chunks[0] = RT_GET_KEY_CHUNK(key, 0); n4->base.count = 1; @@ -1664,12 +1668,11 @@ static RT_PTR_ALLOC * RT_GET_SLOT_RECURSIVE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift, bool *found) { RT_PTR_ALLOC *slot; - RT_CHILD_PTR node; + RT_NODE *node; uint8 chunk = RT_GET_KEY_CHUNK(key, shift); - node.alloc = *parent_slot; - RT_PTR_SET_LOCAL(tree, &node); - slot = RT_NODE_SEARCH(node.local, chunk); + node = RT_PTR_GET_LOCAL(tree, *parent_slot); + slot = RT_NODE_SEARCH(node, chunk); if (slot == NULL) { @@ -1712,7 +1715,7 @@ RT_GET_SLOT(RT_RADIX_TREE * tree, uint64 key, bool * found) { if (tree->ctl->num_keys == 0) { - RT_CHILD_PTR node; + RT_PTR_ALLOC nodealloc; RT_NODE_4 *n4; int start_shift = RT_KEY_GET_SHIFT(key); @@ -1722,9 +1725,8 @@ RT_GET_SLOT(RT_RADIX_TREE * tree, uint64 key, bool * found) * code inserting into the root node and extend downward from * there. 
*/ - node.alloc = tree->ctl->root; - RT_PTR_SET_LOCAL(tree, &node); - n4 = (RT_NODE_4 *) node.local; + nodealloc = tree->ctl->root; + n4 = (RT_NODE_4 *) RT_PTR_GET_LOCAL(tree, nodealloc); n4->base.count = 1; n4->chunks[0] = RT_GET_KEY_CHUNK(key, start_shift); @@ -1784,33 +1786,36 @@ RT_SET_SLOT(RT_RADIX_TREE * tree, void * slot_v, RT_VALUE_TYPE * value_p, bool f } else { - RT_CHILD_PTR leaf; + RT_PTR_ALLOC leafalloc; + RT_VALUE_TYPE *leaf; if (found && !RT_CHILDPTR_IS_VALUE(*slot)) { Assert(RT_PTR_ALLOC_IS_VALID(*slot)); - leaf.alloc = *slot; - RT_PTR_SET_LOCAL(tree, &leaf); + leafalloc = *slot; + leaf = (RT_VALUE_TYPE *) RT_PTR_GET_LOCAL(tree, leafalloc); - if (RT_GET_VALUE_SIZE((RT_VALUE_TYPE *) leaf.local) != value_sz) + if (RT_GET_VALUE_SIZE(leaf) != value_sz) { /* * different sizes, so first free the existing leaf before * allocating a new one */ RT_FREE_LEAF(tree, *slot); - leaf = RT_ALLOC_LEAF(tree, value_sz); - *slot = leaf.alloc; + leafalloc = RT_ALLOC_LEAF(tree, value_sz); + *slot = leafalloc; + leaf = (RT_VALUE_TYPE *) RT_PTR_GET_LOCAL(tree, leafalloc); } } else { /* allocate new leaf and store it in the child array */ - leaf = RT_ALLOC_LEAF(tree, value_sz); - *slot = leaf.alloc; + leafalloc = RT_ALLOC_LEAF(tree, value_sz); + *slot = leafalloc; + leaf = (RT_VALUE_TYPE *) RT_PTR_GET_LOCAL(tree, leafalloc); } - memcpy(leaf.local, value_p, value_sz); + memcpy(leaf, value_p, value_sz); } return found; @@ -1849,7 +1854,8 @@ RT_CREATE(MemoryContext ctx) #endif { RT_RADIX_TREE *tree; - RT_CHILD_PTR rootnode; + RT_PTR_ALLOC rootalloc; + RT_NODE *rootnode; #ifdef RT_SHMEM dsa_pointer dp; #endif @@ -1882,8 +1888,10 @@ RT_CREATE(MemoryContext ctx) #endif /* RT_SHMEM */ /* add root node now so that RT_SET can assume it exists */ - rootnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4); - tree->ctl->root = rootnode.alloc; + rootalloc = RT_ALLOC_NODE(tree, RT_CLASS_4); + rootnode = RT_PTR_GET_LOCAL(tree, rootalloc); + RT_INIT_NODE(rootnode, RT_NODE_KIND_4, RT_CLASS_4); + tree->ctl->root = rootalloc; tree->ctl->start_shift = 0; tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(0); @@ -1950,18 +1958,15 @@ RT_UNLOCK(RT_RADIX_TREE * tree) static void RT_FREE_RECURSE(RT_RADIX_TREE * tree, RT_PTR_ALLOC ptr, int shift) { - RT_CHILD_PTR node; + RT_NODE *node = RT_PTR_GET_LOCAL(tree, ptr); check_stack_depth(); - node.alloc = ptr; - RT_PTR_SET_LOCAL(tree, &node); - - switch (node.local->kind) + switch (node->kind) { case RT_NODE_KIND_4: { - RT_NODE_4 *n4 = (RT_NODE_4 *) node.local; + RT_NODE_4 *n4 = (RT_NODE_4 *) node; for (int i = 0; i < n4->base.count; i++) { @@ -1977,7 +1982,7 @@ RT_FREE_RECURSE(RT_RADIX_TREE * tree, RT_PTR_ALLOC ptr, int shift) } case RT_NODE_KIND_16: { - RT_NODE_16 *n16 = (RT_NODE_16 *) node.local; + RT_NODE_16 *n16 = (RT_NODE_16 *) node; for (int i = 0; i < n16->base.count; i++) { @@ -1993,7 +1998,7 @@ RT_FREE_RECURSE(RT_RADIX_TREE * tree, RT_PTR_ALLOC ptr, int shift) } case RT_NODE_KIND_48: { - RT_NODE_48 *n48 = (RT_NODE_48 *) node.local; + RT_NODE_48 *n48 = (RT_NODE_48 *) node; for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) { @@ -2014,7 +2019,7 @@ RT_FREE_RECURSE(RT_RADIX_TREE * tree, RT_PTR_ALLOC ptr, int shift) } case RT_NODE_KIND_256: { - RT_NODE_256 *n256 = (RT_NODE_256 *) node.local; + RT_NODE_256 *n256 = (RT_NODE_256 *) node; for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) { @@ -2084,20 +2089,20 @@ RT_SCOPE RT_ITER * RT_BEGIN_ITERATE(RT_RADIX_TREE * tree) { RT_ITER *iter; - RT_CHILD_PTR root; + RT_PTR_ALLOC root; iter = (RT_ITER *) palloc0(sizeof(RT_ITER)); iter->tree = tree; 
 
 	Assert(RT_PTR_ALLOC_IS_VALID(tree->ctl->root));
-	root.alloc = iter->tree->ctl->root;
-	RT_PTR_SET_LOCAL(tree, &root);
+	root = tree->ctl->root;
 
 	iter->top_level = iter->tree->ctl->start_shift / RT_SPAN;
 
 	/* Set the root to start */
 	iter->cur_level = iter->top_level;
-	iter->node_iters[iter->cur_level].node = root;
+	iter->node_iters[iter->cur_level].nodealloc = root;
+	iter->node_iters[iter->cur_level].node = RT_PTR_GET_LOCAL(tree, root);
 	iter->node_iters[iter->cur_level].idx = 0;
 
 	return iter;
@@ -2112,7 +2117,7 @@ RT_NODE_ITERATE_NEXT(RT_ITER * iter, int level)
 {
 	uint8		key_chunk = 0;
 	RT_NODE_ITER *node_iter;
-	RT_CHILD_PTR node;
+	RT_NODE    *node;
 	RT_PTR_ALLOC *slot = NULL;
 
 #ifdef RT_SHMEM
@@ -2122,13 +2127,13 @@ RT_NODE_ITERATE_NEXT(RT_ITER * iter, int level)
 	node_iter = &(iter->node_iters[level]);
 	node = node_iter->node;
 
-	Assert(node.local != NULL);
+	Assert(node != NULL);
 
-	switch ((node.local)->kind)
+	switch (node->kind)
 	{
 		case RT_NODE_KIND_4:
 			{
-				RT_NODE_4  *n4 = (RT_NODE_4 *) (node.local);
+				RT_NODE_4  *n4 = (RT_NODE_4 *) node;
 
 				if (node_iter->idx >= n4->base.count)
 					return NULL;
@@ -2140,7 +2145,7 @@ RT_NODE_ITERATE_NEXT(RT_ITER * iter, int level)
 		case RT_NODE_KIND_16:
 			{
-				RT_NODE_16 *n16 = (RT_NODE_16 *) (node.local);
+				RT_NODE_16 *n16 = (RT_NODE_16 *) node;
 
 				if (node_iter->idx >= n16->base.count)
 					return NULL;
@@ -2152,7 +2157,7 @@ RT_NODE_ITERATE_NEXT(RT_ITER * iter, int level)
 		case RT_NODE_KIND_48:
 			{
-				RT_NODE_48 *n48 = (RT_NODE_48 *) (node.local);
+				RT_NODE_48 *n48 = (RT_NODE_48 *) node;
 				int			chunk;
 
 				for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
@@ -2172,7 +2177,7 @@ RT_NODE_ITERATE_NEXT(RT_ITER * iter, int level)
 		case RT_NODE_KIND_256:
 			{
-				RT_NODE_256 *n256 = (RT_NODE_256 *) (node.local);
+				RT_NODE_256 *n256 = (RT_NODE_256 *) node;
 				int			chunk;
 
 				for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
@@ -2210,33 +2215,25 @@ RT_ITERATE_NEXT(RT_ITER * iter, uint64 *key_p)
 
 	while (iter->cur_level <= iter->top_level)
 	{
-		RT_CHILD_PTR node;
-
 		slot = RT_NODE_ITERATE_NEXT(iter, iter->cur_level);
 
 		if (iter->cur_level == 0 && slot != NULL)
 		{
 			/* Found a value at the leaf node */
 			*key_p = iter->key;
 
-			node.alloc = *slot;
 			if (RT_CHILDPTR_IS_VALUE(*slot))
 				return (RT_VALUE_TYPE *) slot;
 			else
-			{
-				RT_PTR_SET_LOCAL(iter->tree, &node);
-				return (RT_VALUE_TYPE *) node.local;
-			}
+				return (RT_VALUE_TYPE *) RT_PTR_GET_LOCAL(iter->tree, *slot);
 		}
 
 		if (slot != NULL)
 		{
 			/* Found the child slot, move down the tree */
-			node.alloc = *slot;
-			RT_PTR_SET_LOCAL(iter->tree, &node);
-
 			iter->cur_level--;
-			iter->node_iters[iter->cur_level].node = node;
+			iter->node_iters[iter->cur_level].nodealloc = *slot;
+			iter->node_iters[iter->cur_level].node = RT_PTR_GET_LOCAL(iter->tree, *slot);
 			iter->node_iters[iter->cur_level].idx = 0;
 		}
 		else
@@ -2322,19 +2319,21 @@ RT_COPY_ARRAYS_AND_DELETE(uint8 *dst_chunks, RT_PTR_ALLOC * dst_children,
  * in the caller.
  */
 static void pg_noinline
-RT_SHRINK_NODE_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
+RT_SHRINK_NODE_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_256 * n256, uint8 chunk)
 {
-	RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
-	RT_CHILD_PTR newnode;
+	RT_PTR_ALLOC newalloc;
+	RT_NODE    *newnode;
 	RT_NODE_48 *new48;
 	int			slot_idx = 0;
 
 	/* initialize new node */
-	newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48);
-	new48 = (RT_NODE_48 *) newnode.local;
+	newalloc = RT_ALLOC_NODE(tree, RT_CLASS_48);
+	newnode = RT_PTR_GET_LOCAL(tree, newalloc);
+	RT_INIT_NODE(newnode, RT_NODE_KIND_48, RT_CLASS_48);
+	new48 = (RT_NODE_48 *) newnode;
 
 	/* copy over the entries */
-	RT_COPY_COMMON(newnode, node);
+	RT_COPY_COMMON((RT_NODE *) new48, (RT_NODE *) n256);
 	for (int i = 0; i < RT_NODE_MAX_SLOTS; i++)
 	{
 		if (RT_NODE_256_IS_CHUNK_USED(n256, i))
@@ -2354,15 +2353,14 @@ RT_SHRINK_NODE_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PT
 	new48->isset[0] = (bitmapword) (((uint64) 1 << n256->base.count) - 1);
 
 	/* free old node and update reference in parent */
-	*parent_slot = newnode.alloc;
-	RT_FREE_NODE(tree, node);
+	RT_FREE_NODE(tree, *parent_slot);
+	*parent_slot = newalloc;
 }
 
 static inline void
-RT_REMOVE_CHILD_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
+RT_REMOVE_CHILD_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_256 * n256, uint8 chunk)
 {
 	int			shrink_threshold;
-	RT_NODE_256 *n256 = (RT_NODE_256 *) node.local;
 	int			idx = RT_BM_IDX(chunk);
 	int			bitnum = RT_BM_BIT(chunk);
 
@@ -2382,7 +2380,7 @@ RT_REMOVE_CHILD_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_P
 	shrink_threshold = Min(RT_FANOUT_48 / 4 * 3, shrink_threshold);
 
 	if (n256->base.count <= shrink_threshold)
-		RT_SHRINK_NODE_256(tree, parent_slot, node, chunk);
+		RT_SHRINK_NODE_256(tree, parent_slot, n256, chunk);
 }
 
 /*
@@ -2390,10 +2388,10 @@ RT_REMOVE_CHILD_256(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_P
  * in the caller.
  */
 static void pg_noinline
-RT_SHRINK_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
+RT_SHRINK_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_48 * n48, uint8 chunk)
 {
-	RT_NODE_48 *n48 = (RT_NODE_48 *) (node.local);
-	RT_CHILD_PTR newnode;
+	RT_PTR_ALLOC newalloc;
+	RT_NODE    *newnode;
 	RT_NODE_16 *new16;
 	int			destidx = 0;
 
@@ -2401,11 +2399,13 @@ RT_SHRINK_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR
 	 * Initialize new node. For now we skip the larger node16 size class for
 	 * simplicity.
 	 */
-	newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_LO);
-	new16 = (RT_NODE_16 *) newnode.local;
+	newalloc = RT_ALLOC_NODE(tree, RT_CLASS_16_LO);
+	newnode = RT_PTR_GET_LOCAL(tree, newalloc);
+	RT_INIT_NODE(newnode, RT_NODE_KIND_16, RT_CLASS_16_LO);
+	new16 = (RT_NODE_16 *) newnode;
 
 	/* copy over all existing entries */
-	RT_COPY_COMMON(newnode, node);
+	RT_COPY_COMMON((RT_NODE *) new16, (RT_NODE *) n48);
 	for (int chunk = 0; chunk < RT_NODE_MAX_SLOTS; chunk++)
 	{
 		if (n48->slot_idxs[chunk] != RT_INVALID_SLOT_IDX)
@@ -2421,14 +2421,13 @@ RT_SHRINK_NODE_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR
 	RT_VERIFY_NODE((RT_NODE *) new16);
 
 	/* free old node and update reference in parent */
-	*parent_slot = newnode.alloc;
-	RT_FREE_NODE(tree, node);
+	RT_FREE_NODE(tree, *parent_slot);
+	*parent_slot = newalloc;
 }
 
 static inline void
-RT_REMOVE_CHILD_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk)
+RT_REMOVE_CHILD_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_48 * n48, uint8 chunk)
 {
-	RT_NODE_48 *n48 = (RT_NODE_48 *) node.local;
 	int			deletepos = n48->slot_idxs[chunk];
 
 	/* For now we skip the larger node16 size class for simplicity */
@@ -2450,7 +2449,7 @@ RT_REMOVE_CHILD_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PT
 	 * node48 anyway.
 	 */
 	if (n48->base.count <= shrink_threshold)
-		RT_SHRINK_NODE_48(tree, parent_slot, node, chunk);
+		RT_SHRINK_NODE_48(tree, parent_slot, n48, chunk);
 }
 
 /*
@@ -2459,18 +2458,20 @@ RT_REMOVE_CHILD_48(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PT
  * node.
 */
 static void pg_noinline
-RT_SHRINK_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 deletepos)
+RT_SHRINK_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_16 * n16, uint8 deletepos)
 {
-	RT_NODE_16 *n16 = (RT_NODE_16 *) (node.local);
-	RT_CHILD_PTR newnode;
+	RT_PTR_ALLOC newalloc;
+	RT_NODE    *newnode;
 	RT_NODE_4  *new4;
 
 	/* initialize new node */
-	newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4);
-	new4 = (RT_NODE_4 *) newnode.local;
+	newalloc = RT_ALLOC_NODE(tree, RT_CLASS_4);
+	newnode = RT_PTR_GET_LOCAL(tree, newalloc);
+	RT_INIT_NODE(newnode, RT_NODE_KIND_4, RT_CLASS_4);
+	new4 = (RT_NODE_4 *) newnode;
 
 	/* copy over existing entries, except for the one at "deletepos" */
-	RT_COPY_COMMON(newnode, node);
+	RT_COPY_COMMON((RT_NODE *) new4, (RT_NODE *) n16);
 	RT_COPY_ARRAYS_AND_DELETE(new4->chunks, new4->children,
 							  n16->chunks, n16->children,
 							  n16->base.count, deletepos);
@@ -2479,14 +2480,13 @@ RT_SHRINK_NODE_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR
 	RT_VERIFY_NODE((RT_NODE *) new4);
 
 	/* free old node and update reference in parent */
-	*parent_slot = newnode.alloc;
-	RT_FREE_NODE(tree, node);
+	RT_FREE_NODE(tree, *parent_slot);
+	*parent_slot = newalloc;
 }
 
 static inline void
-RT_REMOVE_CHILD_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
+RT_REMOVE_CHILD_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_16 * n16, uint8 chunk, RT_PTR_ALLOC * slot)
 {
-	RT_NODE_16 *n16 = (RT_NODE_16 *) node.local;
 	int			deletepos = slot - n16->children;
 
 	/*
@@ -2496,7 +2496,7 @@ RT_REMOVE_CHILD_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PT
 	 */
 	if (n16->base.count <= 4)
 	{
-		RT_SHRINK_NODE_16(tree, parent_slot, node, deletepos);
+		RT_SHRINK_NODE_16(tree, parent_slot, n16, deletepos);
 		return;
 	}
 
@@ -2509,10 +2509,8 @@ RT_REMOVE_CHILD_16(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PT
 }
 
 static inline void
-RT_REMOVE_CHILD_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
+RT_REMOVE_CHILD_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE_4 * n4, uint8 chunk, RT_PTR_ALLOC * slot)
 {
-	RT_NODE_4  *n4 = (RT_NODE_4 *) node.local;
-
 	if (n4->base.count == 1)
 	{
 		Assert(n4->chunks[0] == chunk);
@@ -2535,7 +2533,7 @@ RT_REMOVE_CHILD_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR
 		 * RT_DELETE_RECURSIVE has already freed the value and lower-level
 		 * children.
 		 */
-		RT_FREE_NODE(tree, node);
+		RT_FREE_NODE(tree, *parent_slot);
 
 		/*
 		 * Also null out the parent's slot -- this tells the next higher
@@ -2562,28 +2560,36 @@ RT_REMOVE_CHILD_4(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR
 * Delete the child pointer corresponding to "key" in the given node.
 */
 static inline void
-RT_NODE_DELETE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_CHILD_PTR node, uint8 chunk, RT_PTR_ALLOC * slot)
+RT_NODE_DELETE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, RT_NODE * node, uint8 chunk, RT_PTR_ALLOC * slot)
 {
-	switch ((node.local)->kind)
+	switch (node->kind)
 	{
 		case RT_NODE_KIND_4:
 			{
-				RT_REMOVE_CHILD_4(tree, parent_slot, node, chunk, slot);
+				RT_NODE_4  *n4 = (RT_NODE_4 *) node;
+
+				RT_REMOVE_CHILD_4(tree, parent_slot, n4, chunk, slot);
 				return;
 			}
 		case RT_NODE_KIND_16:
 			{
-				RT_REMOVE_CHILD_16(tree, parent_slot, node, chunk, slot);
+				RT_NODE_16 *n16 = (RT_NODE_16 *) node;
+
+				RT_REMOVE_CHILD_16(tree, parent_slot, n16, chunk, slot);
 				return;
 			}
 		case RT_NODE_KIND_48:
 			{
-				RT_REMOVE_CHILD_48(tree, parent_slot, node, chunk);
+				RT_NODE_48 *n48 = (RT_NODE_48 *) node;
+
+				RT_REMOVE_CHILD_48(tree, parent_slot, n48, chunk);
 				return;
 			}
 		case RT_NODE_KIND_256:
 			{
-				RT_REMOVE_CHILD_256(tree, parent_slot, node, chunk);
+				RT_NODE_256 *n256 = (RT_NODE_256 *) node;
+
+				RT_REMOVE_CHILD_256(tree, parent_slot, n256, chunk);
 				return;
 			}
 		default:
@@ -2596,12 +2602,10 @@ static bool
 RT_DELETE_RECURSIVE(RT_RADIX_TREE * tree, RT_PTR_ALLOC * parent_slot, uint64 key, int shift)
 {
 	RT_PTR_ALLOC *slot;
-	RT_CHILD_PTR node;
+	RT_NODE    *node = RT_PTR_GET_LOCAL(tree, *parent_slot);
 	uint8		chunk = RT_GET_KEY_CHUNK(key, shift);
 
-	node.alloc = *parent_slot;
-	RT_PTR_SET_LOCAL(tree, &node);
-	slot = RT_NODE_SEARCH(node.local, chunk);
+	slot = RT_NODE_SEARCH(node, chunk);
 
 	if (slot == NULL)
 		return false;
@@ -2954,7 +2958,6 @@ RT_DUMP_NODE(RT_NODE * node)
 /* type declarations */
 #undef RT_RADIX_TREE
 #undef RT_RADIX_TREE_CONTROL
-#undef RT_CHILD_PTR
 #undef RT_PTR_ALLOC
 #undef RT_INVALID_PTR_ALLOC
 #undef RT_HANDLE
@@ -3014,6 +3017,7 @@ RT_DUMP_NODE(RT_NODE * node)
 #undef RT_GET_SLOT_RECURSIVE
 #undef RT_DELETE_RECURSIVE
 #undef RT_ALLOC_NODE
+#undef RT_INIT_NODE
 #undef RT_ALLOC_LEAF
 #undef RT_FREE_NODE
 #undef RT_FREE_LEAF
@@ -3021,7 +3025,7 @@ RT_DUMP_NODE(RT_NODE * node)
 #undef RT_EXTEND_UP
 #undef RT_EXTEND_DOWN
 #undef RT_COPY_COMMON
-#undef RT_PTR_SET_LOCAL
+#undef RT_PTR_GET_LOCAL
 #undef RT_PTR_ALLOC_IS_VALID
 #undef RT_NODE_16_SEARCH_EQ
 #undef RT_NODE_4_GET_INSERTPOS
-- 
2.50.0