Antonin Houska <a...@cybertec.at> wrote:

> I'm still testing it, especially the concurrent access. There are probably
> bugs in the code, but it can help to understand what I mean.
I've traced the most important cases of concurrent insertion into a bucket
whose split is in progress, and fixed a few related bugs. A tool to check the
index structure is still needed before performance testing makes sense; that
might include an enhancement of contrib/pageinspect (a rough sketch of such a
per-bucket check is appended below the patch).

--
Antonin Houska
Cybertec Schönig & Schönig GmbH
Gröhrmühlgasse 26
A-2700 Wiener Neustadt
Web: http://www.postgresql-support.de, http://www.cybertec.at
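For reviewers, here is a minimal sketch (not part of the patch; the function
name is made up, and locking, pin handling and tuple matching are simplified)
of how a backend is meant to walk a bucket while a split involving it is in
progress: it just follows hasho_nextblkno, and the page flagged
LH_BUCKET_SPLIT_LAST transparently redirects it into the not-yet-processed
part of the bucket being split, which is why the patch expects scans to
re-check keys and to tolerate seeing an already-copied, not-yet-deleted tuple
on both sides.

#include "postgres.h"
#include "access/hash.h"

/*
 * Sketch only, not part of the patch: walk a bucket that may currently be
 * the target of a split.  The page flagged LH_BUCKET_SPLIT_LAST has its
 * hasho_nextblkno pointing at the first not-yet-processed page of the
 * bucket being split, so the walk simply continues there.
 */
static void
sketch_walk_bucket_chain(Relation rel, BlockNumber bucket_blkno)
{
	BlockNumber blkno = bucket_blkno;
	int			flags = LH_BUCKET_PAGE;

	while (BlockNumberIsValid(blkno))
	{
		Buffer		buf = _hash_getbuf(rel, blkno, HASH_READ, flags);
		Page		page = BufferGetPage(buf);
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/*
		 * ... examine the tuples on this page here; pages reached through
		 * the LH_BUCKET_SPLIT_LAST link still belong to the bucket being
		 * split, so each key has to be re-checked before it is returned,
		 * and duplicates (copied but not yet deleted tuples) skipped ...
		 */

		blkno = opaque->hasho_nextblkno;

		/*
		 * After the primary page, both overflow pages of this bucket and
		 * (behind the redirection) pages of the bucket being split may be
		 * encountered.
		 */
		flags = LH_BUCKET_PAGE | LH_OVERFLOW_PAGE;

		_hash_relbuf(rel, buf);
	}
}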
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c new file mode 100644 index 24b06a5..149bbcf *** a/src/backend/access/hash/hash.c --- b/src/backend/access/hash/hash.c *************** loop_top: *** 572,577 **** --- 572,599 ---- opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); + /* + * If the bucket participates in a split, give up. + * + * (Unlike the metapage copy, the flags at bucket level should + * always be up-to-date.) + * + * TODO + * + * 1. Analyze if both buckets participating in the split impose + * too severe restrictions, and if it makes sense to introduce + * separate flags for "old" and "new" bucket. Also, would such a + * restricted VACUUM still make sense? + * + * 2. Consider how statistics should reflect the fact that some + * buckets are skipped because of split. + */ + if (opaque->hasho_flag & LH_BUCKET_SPLIT) + { + _hash_relbuf(rel, buf); + break; + } + /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c new file mode 100644 index 63aaec9..de445c9 *** a/src/backend/access/hash/hashinsert.c --- b/src/backend/access/hash/hashinsert.c *************** _hash_doinsert(Relation rel, IndexTuple *** 37,42 **** --- 37,43 ---- Page page; HashPageOpaque pageopaque; Size itemsz; + uint16 buckets_total; bool do_expand; uint32 hashkey; Bucket bucket; *************** _hash_doinsert(Relation rel, IndexTuple *** 123,129 **** */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; ! if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll --- 124,131 ---- */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; ! if (BlockNumberIsValid(nextblkno) && ! !(pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll *************** _hash_doinsert(Relation rel, IndexTuple *** 136,142 **** else { /* ! * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. */ --- 138,145 ---- else { /* ! * we're at the end of the bucket chain, or (during a split) right ! * before redirection to the "old bucket", and we haven't found a * page with enough room. allocate a new overflow page. */ *************** _hash_doinsert(Relation rel, IndexTuple *** 151,157 **** Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); ! Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } --- 154,160 ---- Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); ! Assert(pageopaque->hasho_flag & LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } *************** _hash_doinsert(Relation rel, IndexTuple *** 173,180 **** metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ ! do_expand = metap->hashm_ntuples > ! (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); --- 176,184 ---- metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ ! buckets_total = metap->hashm_maxbucket + 1 + metap->hashm_split_count; ! do_expand = metap->hashm_split_count < HASH_MAX_SPLITS && ! 
metap->hashm_ntuples > (double) metap->hashm_ffactor * buckets_total; /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c new file mode 100644 index b775164..4345f29 *** a/src/backend/access/hash/hashovfl.c --- b/src/backend/access/hash/hashovfl.c *************** *** 21,27 **** #include "utils/rel.h" - static Buffer _hash_getovflpage(Relation rel, Buffer metabuf); static uint32 _hash_firstfreebit(uint32 map); --- 21,26 ---- *************** _hash_addovflpage(Relation rel, Buffer m *** 127,133 **** pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno = pageopaque->hasho_nextblkno; ! if (!BlockNumberIsValid(nextblkno)) break; /* we assume we do not need to write the unmodified page */ --- 126,133 ---- pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno = pageopaque->hasho_nextblkno; ! if (!BlockNumberIsValid(nextblkno) || ! (pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST)) break; /* we assume we do not need to write the unmodified page */ *************** _hash_addovflpage(Relation rel, Buffer m *** 140,150 **** ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); - ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = pageopaque->hasho_bucket; ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(ovflbuf); /* logically chain overflow page to previous page */ --- 140,163 ---- ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); ovflopaque->hasho_bucket = pageopaque->hasho_bucket; ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; + if (pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST) + { + /* + * The bucket is being created, so it may point to part of the bucket + * being split (but the link may already be terminated - we don't + * care, just copy it.) + */ + ovflopaque->hasho_nextblkno = pageopaque->hasho_nextblkno; + ovflopaque->hasho_flag |= LH_BUCKET_SPLIT_LAST; + pageopaque->hasho_flag &= ~LH_BUCKET_SPLIT_LAST; + } + else + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + MarkBufferDirty(ovflbuf); /* logically chain overflow page to previous page */ *************** _hash_addovflpage(Relation rel, Buffer m *** 164,170 **** * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is left in the same state at exit. */ ! static Buffer _hash_getovflpage(Relation rel, Buffer metabuf) { HashMetaPage metap; --- 177,183 ---- * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is left in the same state at exit. */ ! Buffer _hash_getovflpage(Relation rel, Buffer metabuf) { HashMetaPage metap; *************** found: *** 334,339 **** --- 347,385 ---- } /* + * Return buffer containing the last page of a bucket being created (during + * split). The search starts at page contained in 'start' buffer. This buffer + * is expected to be pinned and write-locked (caller can't unlock it). The + * result is also pinned and write-locked. 
+ */ + Buffer + _hash_get_new_bucket_end(Relation rel, Buffer start) + { + Buffer buf; + Page page; + HashPageOpaque pageopaque; + BlockNumber blkno = InvalidBlockNumber; + + buf = start; + for (;;) + { + if (BlockNumberIsValid(blkno)) + buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + if (pageopaque->hasho_flag & LH_BUCKET_SPLIT_LAST) + return buf; + + blkno = pageopaque->hasho_nextblkno; + /* If reached end of the old bucket, something is pretty bad. */ + Assert(BlockNumberIsValid(blkno)); + + if (buf != start) + _hash_relbuf(rel, buf); + } + } + + /* * _hash_firstfreebit() * * Return the number of the first bit that is not set in the word 'map'. diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c new file mode 100644 index 46c6c96..c169dcd *** a/src/backend/access/hash/hashpage.c --- b/src/backend/access/hash/hashpage.c *************** _hash_metapinit(Relation rel, double num *** 424,432 **** --- 424,434 ---- */ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; metap->hashm_highmask = (num_buckets << 1) - 1; + metap->hashm_split_count = 0; MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); + MemSet(metap->hashm_splits, 0, sizeof(metap->hashm_splits)); /* Set up mapping for one spare page after the initial splitpoints */ metap->hashm_spares[log2_num_buckets] = 1; *************** void *** 498,511 **** --- 500,521 ---- _hash_expandtable(Relation rel, Buffer metabuf) { HashMetaPage metap; + uint16 buckets_total; Bucket old_bucket; Bucket new_bucket; + HashSplit new_split; uint32 spare_ndx; BlockNumber start_oblkno; BlockNumber start_nblkno; uint32 maxbucket; uint32 highmask; uint32 lowmask; + Buffer obuf; + Page opage, npage; + HashPageOpaque oopaque, nopaque; + uint16 i; + size_t tail; + Buffer nbuf; /* * Write-lock the meta page. It used to be necessary to acquire a *************** _hash_expandtable(Relation rel, Buffer m *** 517,529 **** metap = HashPageGetMeta(BufferGetPage(metabuf)); /* ! * Check to see if split is still needed; someone else might have already ! * done one while we waited for the lock. * * Make sure this stays in sync with _hash_doinsert() */ ! if (metap->hashm_ntuples <= ! (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) goto fail; /* --- 527,542 ---- metap = HashPageGetMeta(BufferGetPage(metabuf)); /* ! * Check to see if we can handle one more split and if it's still needed; ! * someone else might have already done one or more splits while we waited ! * for the lock. (At this point we assume that the splits in progress will ! * all succeed.) * * Make sure this stays in sync with _hash_doinsert() */ ! buckets_total = metap->hashm_maxbucket + 1 + metap->hashm_split_count; ! if (metap->hashm_split_count >= HASH_MAX_SPLITS || ! metap->hashm_ntuples <= (double) metap->hashm_ffactor * buckets_total) goto fail; /* *************** _hash_expandtable(Relation rel, Buffer m *** 545,569 **** goto fail; /* ! * Determine which bucket is to be split, and attempt to lock the old ! * bucket. If we can't get the lock, give up. ! * ! * The lock protects us against other backends, but not against our own ! * backend. Must check for active scans separately. */ new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); ! 
start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); if (_hash_has_active_scan(rel, old_bucket)) goto fail; if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) goto fail; /* * Likewise lock the new bucket (should never fail). * * Note: it is safe to compute the new bucket's blkno here, even though we --- 558,619 ---- goto fail; /* ! * Determine which bucket is to be split. */ new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); ! /* ! * If the masks changed too recently, another new_bucket might map to the ! * same old_bucket. ! */ ! for (i = 0; i < metap->hashm_split_count; i++) ! { ! if (metap->hashm_splits[i].old_bucket == old_bucket) ! goto fail; ! } + /* + * Lock the old bucket. If we can't get the lock, give up (else we could + * cause deadlock with backends looking for the bucket). + * + * The lock protects us against other backends, but not against our own + * backend. Must check for active scans separately. + */ if (_hash_has_active_scan(rel, old_bucket)) goto fail; + start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) goto fail; /* + * Make sure that backends in need of old_bucket during the split are + * aware of the restriction that split-in-progress imposes. + */ + obuf = _hash_getbuf(rel, start_oblkno, HASH_WRITE, LH_BUCKET_PAGE); + opage = BufferGetPage(obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + /* + * If hashm_splits indicates no split for this bucket, the bucket itself + * shouldn't be marked neither: when the split ends, these flags are unset + * first, then the item is removed from hashm_splits. + */ + Assert(!(oopaque->hasho_flag & LH_BUCKET_SPLIT)); + oopaque->hasho_flag |= LH_BUCKET_SPLIT; + + /* + * Do not drop pin, we'll definitely need the buffer soon again. + */ + _hash_chgbufaccess(rel, obuf, HASH_WRITE, HASH_NOLOCK); + /* + * It's o.k. to let other backends use the bucket now. LH_BUCKET_SPLITS + * ensures that they are aware of the relevant restrictions. + */ + _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); + + /* * Likewise lock the new bucket (should never fail). * * Note: it is safe to compute the new bucket's blkno here, even though we *************** _hash_expandtable(Relation rel, Buffer m *** 596,610 **** */ if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { ! /* can't split due to BlockNumber overflow */ ! _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); goto fail; } } /* ! * Okay to proceed with split. Update the metapage bucket mapping info. * * Since we are scribbling on the metapage data right in the shared * buffer, any failure in this next little bit leaves us with a big --- 646,664 ---- */ if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { ! /* ! * can't split due to BlockNumber overflow ! * ! * Meta page is not updated yet, no other backend should be ! * aware of the new bucket. ! */ _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); goto fail; } } /* ! * Okay to initiate the split. Update the metapage bucket mapping info. * * Since we are scribbling on the metapage data right in the shared * buffer, any failure in this next little bit leaves us with a big *************** _hash_expandtable(Relation rel, Buffer m *** 613,619 **** * establish a critical section. 
*/ START_CRIT_SECTION(); - metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) --- 667,672 ---- *************** _hash_expandtable(Relation rel, Buffer m *** 635,640 **** --- 688,700 ---- metap->hashm_ovflpoint = spare_ndx; } + /* + * Publish information about the split in progress. + */ + new_split = &metap->hashm_splits[i]; + new_split->new_bucket = new_bucket; + new_split->old_bucket = old_bucket; + metap->hashm_split_count += 1; /* Done mucking with metapage */ END_CRIT_SECTION(); *************** _hash_expandtable(Relation rel, Buffer m *** 649,665 **** highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; ! /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, ! start_oblkno, start_nblkno, ! maxbucket, highmask, lowmask); ! /* Release bucket locks, allowing others to access them */ ! _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); ! _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); return; --- 709,832 ---- highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; ! /* ! * Initialize the new bucket's primary page while holding the meta page ! * lock (_hash_getnewbuf should not be called concurrently). ! * ! * XXX This is rather a misuse of the metabuf content lock. Can we ! * synchronize this call in any better way? ! */ ! nbuf = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); ! ! /* ! * No need to hold the meta page lock while initializing page(s) of the ! * new bucket. ! * ! * Note: As soon as the meta page lock is released, any concurrent backend ! * can see the new bucket, but can't lock it. We hold the exclusive lock ! * on the new bucket until its ready to accept both reads and writes. ! * ! * TODO (WAL) ! * ! * Before WAL-replaying the above meta page changes, exclusive lock of the ! * new bucket must be acquired (probably before even locking the metapage, ! * to avoid deadlock with backends looking for the bucket; note that ! * conditional lock acquisition does not help here, we really need the ! * lock). The problem is, that if crash happens exactly after the meta ! * page changes have been WAL-logged, recovery must not let other backends ! * know about (and possibly lock) the new bucket until the replaying ! * backend (startup process) has it locked exclusively (to complete its ! * initial setup). ! */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + /* + * Neither initialization nor writing of the new page requires the metabuf + * lock. + * + * The last overflow page in the chain must point to the first page of the + * old bucket. During the split this ensures that scans of the new bucket + * will always succeed, although the new bucket has not received all the + * matching tuples from the old bucket yet. + */ + npage = BufferGetPage(nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nopaque->hasho_prevblkno = InvalidBlockNumber; + nopaque->hasho_nextblkno = start_oblkno; + nopaque->hasho_bucket = new_bucket; + + /* + * LH_BUCKET_SPLIT_LAST - initially the primary page is also the last page + * of the bucket. This flag will be cleared as soon as the first overflow + * page is added. + */ + nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_SPLIT | + LH_BUCKET_SPLIT_LAST; + nopaque->hasho_page_id = HASHO_PAGE_ID; + _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); + + /* Release lock of the new bucket, allowing others to use it. 
*/ + _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, ! start_oblkno, start_nblkno, maxbucket, ! highmask, lowmask); ! ! /* ! * Mark the new buckets as no longer participating in the split. ! */ ! _hash_chgbufaccess(rel, nbuf, HASH_NOLOCK, HASH_WRITE); ! nopaque->hasho_flag &= ~LH_BUCKET_SPLIT; ! _hash_wrtbuf(rel, nbuf); ! ! _hash_chgbufaccess(rel, obuf, HASH_NOLOCK, HASH_WRITE); ! oopaque->hasho_flag &= ~LH_BUCKET_SPLIT; ! _hash_wrtbuf(rel, obuf); ! ! /* ! * Remove the "split record" from the metapage. ! * ! * (Use critical section for the same reasons as explained above.) ! */ ! START_CRIT_SECTION(); ! _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); ! _hash_checkpage(rel, metabuf, LH_META_PAGE); ! metap = HashPageGetMeta(BufferGetPage(metabuf)); ! Assert(metap->hashm_split_count > 0); ! for (i = 0; i < metap->hashm_split_count; i++) ! { ! if (metap->hashm_splits[i].old_bucket == old_bucket) ! break; ! } ! if (i == metap->hashm_split_count) ! /* ! * This should not happen unless the meta page is corrupt. ! */ ! elog(ERROR, "could not find info on split of bucket %u", old_bucket); ! ! /* Number of items following the one we're gonna delete. */ ! tail = metap->hashm_split_count - (i + 1); ! if (tail > 0) ! { ! HashSplit current, next; ! ! /* ! * The item being removed is not the last one. Shift the following ! * items one position lower. ! */ ! current = metap->hashm_splits + i; ! next = current + 1; ! memmove(current, next, tail * sizeof(HashSplitData)); ! } ! metap->hashm_split_count -= 1; ! END_CRIT_SECTION(); ! ! /* ! * Write the meta page. ! */ ! _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); return; *************** _hash_alloc_buckets(Relation rel, BlockN *** 718,723 **** --- 885,949 ---- return true; } + /* + * Set redirection from the end of "new" bucket (i.e. just being created) to + * the appropriate page of "old" (i.e. bucket being split). Search for the + * last page of "new" bucket starts at page contained in 'buf'. + * + * 'buf' must be pinned and write-locked, 'target' points to the page of the + * "old" bucket. 'to_unlock' is a list of pages that caller will need to + * unlock sometime. + */ + static void + _hash_set_new_bucket_end_link(Relation rel, Buffer buf, BlockNumber target, + List **to_unlock) + { + Page page; + HashPageOpaque opaque; + + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->hasho_flag & LH_BUCKET_SPLIT_LAST)) + { + buf = _hash_get_new_bucket_end(rel, buf); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + /* It will need to be unlocked too. */ + *to_unlock = lappend_int(*to_unlock, buf); + } + + Assert(opaque->hasho_flag & LH_BUCKET_SPLIT_LAST); + opaque->hasho_nextblkno = target; + /* The special case of terminating the new bucket. */ + if (!BlockNumberIsValid(opaque->hasho_nextblkno)) + opaque->hasho_flag &= ~LH_BUCKET_SPLIT_LAST; + } + + + /* + * Unlock buffers in the order we locked them. 'keep_pin', if valid, is a + * buffer that should only by written, but its pin should not be dropped. 
+ */ + static void + _hash_write_buffers(Relation rel, List **bufs, Buffer keep_pin) + { + ListCell *nc; + + foreach(nc, *bufs) + { + Buffer buf; + + buf = (Buffer) lfirst_int(nc); + if (!BufferIsInvalid(keep_pin) && buf == keep_pin) + _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK); + else + _hash_wrtbuf(rel, buf); + + } + list_free(*bufs); + *bufs = NIL; + } + /* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' *************** _hash_alloc_buckets(Relation rel, BlockN *** 727,734 **** * belong in the new bucket, and compress out any free space in the old * bucket. * ! * The caller must hold exclusive locks on both buckets to ensure that ! * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only --- 953,961 ---- * belong in the new bucket, and compress out any free space in the old * bucket. * ! * Neither bucket needs to be locked. Any concurrent backend is expected to ! * check flags of primary pages of the buckets and follow the related ! * restrictions. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only *************** _hash_splitbucket(Relation rel, *** 748,796 **** BlockNumber oblkno; BlockNumber nblkno; Buffer obuf; - Buffer nbuf; Page opage; - Page npage; HashPageOpaque oopaque; ! HashPageOpaque nopaque; - /* - * It should be okay to simultaneously write-lock pages from each bucket, - * since no one else can be trying to acquire buffer lock on pages of - * either bucket. - */ oblkno = start_oblkno; - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE); - opage = BufferGetPage(obuf); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - nblkno = start_nblkno; ! nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM); ! npage = BufferGetPage(nbuf); ! ! /* initialize the new bucket's primary page */ ! nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); ! nopaque->hasho_prevblkno = InvalidBlockNumber; ! nopaque->hasho_nextblkno = InvalidBlockNumber; ! nopaque->hasho_bucket = nbucket; ! nopaque->hasho_flag = LH_BUCKET_PAGE; ! nopaque->hasho_page_id = HASHO_PAGE_ID; /* ! * Partition the tuples in the old bucket between the old bucket and the ! * new bucket, advancing along the old bucket's overflow bucket chain and ! * adding overflow pages to the new bucket as needed. Outer loop iterates ! * once per page in old bucket. */ for (;;) { OffsetNumber ooffnum; OffsetNumber omaxoffnum; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) --- 975,1032 ---- BlockNumber oblkno; BlockNumber nblkno; Buffer obuf; Page opage; HashPageOpaque oopaque; ! Page npage = NULL; ! HashPageOpaque nopaque = NULL; ! char *tuples_new; ! Buffer nbuf = InvalidBuffer; ! int nbuf_inserts = 0; oblkno = start_oblkno; nblkno = start_nblkno; ! /* BLCKSZ is safe size for any number of matching tuples. */ ! tuples_new = (char *) palloc(BLCKSZ); /* ! * Copy matching tuples from the old to the new bucket, advancing along ! * the old bucket's overflow bucket chain and adding overflow pages to the ! * new bucket as needed. Backends scanning the old bucket during the split ! * are aware that deletion of the tuples copied is not immediate, so they ! * do re-check each key before returning it. ! * ! 
* Outer loop iterates once per page in old bucket. */ for (;;) { OffsetNumber ooffnum; OffsetNumber omaxoffnum; + BlockNumber oblkno_next; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; + char *tuple_copy; + IndexTuple itup; + Size itemsz; + int item_count, i; + Buffer keep_pin = InvalidBuffer; + List *new_write = NIL; + + if (!BlockNumberIsValid(oblkno)) + break; + + /* + * 1. Copy the interesting tuples from the source page aside and + * release its content lock. + */ + obuf = _hash_getbuf(rel, oblkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + opage = BufferGetPage(obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); + tuple_copy = tuples_new; + item_count = 0; for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) *************** _hash_splitbucket(Relation rel, *** 810,862 **** if (bucket == nbucket) { - /* - * insert the tuple into the new bucket. if it doesn't fit on - * the current page in the new bucket, we must allocate a new - * overflow page and place the tuple on that page instead. - */ itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); ! if (PageGetFreeSpace(npage) < itemsz) { ! /* write out nbuf and drop lock, but keep pin */ ! _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); ! /* chain to a new overflow page */ ! nbuf = _hash_addovflpage(rel, metabuf, nbuf); ! npage = BufferGetPage(nbuf); ! /* we don't need nblkno or nopaque within the loop */ } /* ! * Insert tuple on new page, using _hash_pgaddtup to ensure ! * correct ordering by hashkey. This is a tad inefficient ! * since we may have to shuffle itempointers repeatedly. ! * Possible future improvement: accumulate all the items for ! * the new page and qsort them before insertion. */ ! (void) _hash_pgaddtup(rel, nbuf, itemsz, itup); /* * Mark tuple for deletion from old page. */ deletable[ndeletable++] = ooffnum; } else - { - /* - * the tuple stays on this page, so nothing to do. - */ Assert(bucket == obucket); - } } - oblkno = oopaque->hasho_nextblkno; - - /* - * Done scanning this old page. If we moved any tuples, delete them - * from the old page. - */ if (ndeletable > 0) { PageIndexMultiDelete(opage, deletable, ndeletable); --- 1046,1305 ---- if (bucket == nbucket) { itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); ! memcpy(tuple_copy, itup, itemsz); ! tuple_copy += itemsz; ! item_count++; ! } ! else ! { ! /* ! * the tuple stays on this page, so nothing to do. ! */ ! Assert(bucket == obucket); ! } ! } ! ! oblkno_next = oopaque->hasho_nextblkno; ! _hash_chgbufaccess(rel, obuf, HASH_READ, HASH_NOLOCK); ! ! /* ! * 2. Insert them into the new bucket. ! */ ! if (BufferIsInvalid(nbuf)) ! { ! nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE, LH_BUCKET_PAGE); ! npage = BufferGetPage(nbuf); ! nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); ! } ! else ! _hash_chgbufaccess(rel, nbuf, HASH_NOLOCK, HASH_WRITE); ! ! ! if (item_count == 0) ! { ! /* ! * No matching tuples on this page. ! * ! * Write even though no items could be added to nbuf: the link to ! * the old bucket will be updated. ! */ ! new_write = lappend_int(new_write, nbuf); ! /* ! * If this is the last outer loop, oblkno_next is ! * InvalidBlockNumber, so the link will be terminated. Otherwise ! * it will point to the next page of the old bucket. ! */ ! _hash_set_new_bucket_end_link(rel, nbuf, oblkno_next, &new_write); ! ! /* ! * Write, this time only nbuf, and - if distinct - the buffer ! 
* containing the link to the old bucket. Keep pin of nbuf iff at ! * least one more outer loop is expected. ! */ ! keep_pin = BlockNumberIsValid(oblkno_next) ? nbuf : InvalidBuffer; ! _hash_write_buffers(rel, &new_write, keep_pin); ! ! _hash_dropbuf(rel, obuf); ! oblkno = oblkno_next; ! continue; ! } ! ! tuple_copy = tuples_new; ! for (i = 0; i < item_count;) ! { ! itup = (IndexTuple) tuple_copy; ! itemsz = IndexTupleDSize(*itup); ! itemsz = MAXALIGN(itemsz); ! ! /* ! * insert the tuple into the new bucket. if it doesn't fit on the ! * current page in the new bucket, try to proceed to the next one ! * - it's possible that it overflow page has already been added ! * for concurrent insertions. ! */ ! if (PageGetFreeSpace(npage) < itemsz) ! { ! BlockNumber nblkno_next; ! ! nblkno_next = nopaque->hasho_nextblkno; ! ! /* ! * The split must have been initialized by now. Thus, even if ! * we are at the end of the new bucket, a valid link should be ! * there (to the old bucket). ! */ ! Assert(BlockNumberIsValid(nblkno_next)); ! ! if (nbuf_inserts > 0) { ! /* ! * The page should be written, but not unlocked! As the ! * new bucket eventually links to the current page of the ! * old bucket, the tuples already copied from the current ! * old page could be found twice - once on this page and ! * once on the old one. (On the other hand, the link must ! * not be redirected to oblkno_next yet, that would make ! * readers omit some not-yet-copied tuples.) ! * ! * TODO The hash index seems to have no API to only sync ! * buffer, w/o unlocking. Let's do nothing so far (the ! * buffers will be written when processing the current old ! * page is done) and resolve it when implementing WAL ! */ ! ! /* ! * Remember to write and unlock when the current old ! * bucket page is processed. ! */ ! new_write = lappend_int(new_write, nbuf); } /* ! * The current (already full) page links either to a page of ! * the new bucket (probably created for concurrent insertions) ! * or to one of the old bucket. */ ! if (!(nopaque->hasho_flag & LH_BUCKET_SPLIT_LAST)) ! { ! if (nbuf_inserts == 0) ! { ! /* No reason to prevent others from using the page. */ ! _hash_relbuf(rel, nbuf); ! } ! ! /* Follow the link. */ ! nbuf = _hash_getbuf(rel, nblkno_next, HASH_WRITE, ! LH_OVERFLOW_PAGE); ! } ! else ! { ! /* ! * A link to the old bucket - add a new page before it. ! */ ! Buffer ovflbuf; ! Page ovflpage; ! HashPageOpaque ovflopaque; ! ! /* ! * _hash_addovflpage can't be used here because it expects ! * the buffer unlocked. We can't afford unlocking as ! * explained above. ! */ ! ovflbuf = _hash_getovflpage(rel, metabuf); ! ovflpage = BufferGetPage(ovflbuf); ! ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ! ovflopaque->hasho_bucket = nopaque->hasho_bucket; ! ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; ! ovflopaque->hasho_page_id = HASHO_PAGE_ID; ! /* ovflpage takes the link over. */ ! ovflopaque->hasho_nextblkno = nopaque->hasho_nextblkno; ! nopaque->hasho_flag &= ~LH_BUCKET_SPLIT_LAST; ! ovflopaque->hasho_flag |= LH_BUCKET_SPLIT_LAST; ! ovflopaque->hasho_prevblkno = nblkno; ! /* The current new page is no longer the last. */ ! nopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); ! ! if (nbuf_inserts == 0) ! { ! /* ! * No reason to prevent others from using the ! * page. (Could not do it earlier as we needed some ! * info from nopaque. to setup the links. ! */ ! _hash_relbuf(rel, nbuf); ! } ! nbuf = ovflbuf; ! } ! /* Keep nblkno always in-sync with nbuf. */ ! nblkno = BufferGetBlockNumber(nbuf); ! 
npage = BufferGetPage(nbuf); ! nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); ! nbuf_inserts = 0; ! continue; ! } ! ! /* ! * Insert tuple on new page, using _hash_pgaddtup to ensure ! * correct ordering by hashkey. This is a tad inefficient ! * since we may have to shuffle itempointers repeatedly. ! * Possible future improvement: accumulate all the items for ! * the new page and qsort them before insertion. ! */ ! (void) _hash_pgaddtup(rel, nbuf, itemsz, itup); ! ! nbuf_inserts++; ! tuple_copy += itemsz; ! i++; ! } ! ! Assert(nbuf_inserts > 0); ! new_write = lappend_int(new_write, nbuf); ! /* ! * If the next outer loop will add some more items to 'nbuf', they ! * should trigger writing again. ! */ ! nbuf_inserts = 0; ! ! /* ! * 3. Before the affected pages of the new bucket get unlocked, ! * redirect scans of the new bucket to the first not-yet-processed ! * page of the old bucket. ! * ! * If we're at the last page of the old bucket, this effectively ! * terminates the new bucket. ! */ ! _hash_set_new_bucket_end_link(rel, nbuf, oblkno_next, &new_write); ! ! /* ! * Write all buffers modified for the current old bucket page. ! * ! * Modification means tuple insertion or update of the link to the old ! * bucket. ! * ! * nbuf (the last buffer we inserted data into) might be needed for ! * the next outer loop. Keep its pin unless this is the last outer ! * loop. (However the page _hash_set_new_bucket_end_link might have ! * added can be dropped for now.) ! */ ! keep_pin = BlockNumberIsValid(oblkno_next) ? nbuf : InvalidBuffer; ! _hash_write_buffers(rel, &new_write, keep_pin); ! ! /* ! * 4. Now that the copying has succeeded, write-lock page of the old ! * bucket and delete the tuples just copied. ! */ ! _hash_chgbufaccess(rel, obuf, HASH_NOLOCK, HASH_WRITE); ! omaxoffnum = PageGetMaxOffsetNumber(opage); ! for (ooffnum = FirstOffsetNumber; ! ooffnum <= omaxoffnum; ! ooffnum = OffsetNumberNext(ooffnum)) ! { ! IndexTuple itup; ! Bucket bucket; ! ! /* ! * Fetch the item's hash key (conveniently stored in the item) and ! * determine which bucket it now belongs in. ! */ ! itup = (IndexTuple) PageGetItem(opage, ! PageGetItemId(opage, ooffnum)); ! bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), ! maxbucket, highmask, lowmask); + if (bucket == nbucket) + { /* * Mark tuple for deletion from old page. */ deletable[ndeletable++] = ooffnum; } else Assert(bucket == obucket); } if (ndeletable > 0) { PageIndexMultiDelete(opage, deletable, ndeletable); *************** _hash_splitbucket(Relation rel, *** 865,887 **** else _hash_relbuf(rel, obuf); ! /* Exit loop if no more overflow pages in old bucket */ ! if (!BlockNumberIsValid(oblkno)) ! break; ! ! /* Else, advance to next old page */ ! obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE); ! opage = BufferGetPage(obuf); ! oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* ! * We're at the end of the old bucket chain, so we're done partitioning ! * the tuples. Before quitting, call _hash_squeezebucket to ensure the ! * tuples remaining in the old bucket (including the overflow pages) are ! * packed as tightly as possible. The new bucket is already tight. */ - _hash_wrtbuf(rel, nbuf); - - _hash_squeezebucket(rel, obucket, start_oblkno, NULL); } --- 1308,1343 ---- else _hash_relbuf(rel, obuf); ! /* Go for the next unprocessed page. */ ! oblkno = oblkno_next; } + pfree(tuples_new); + /* ! * TODO Squeeze the old bucket ! * ! * Whether it should happen here or elsewhere, consider how to interact ! 
* with concurrent calls of _hash_addovflpage. That function claims: "at ! * least share lock on the bucket, to ensure that no one else tries to ! * compact the bucket meanwhile. This guarantees that 'buf' won't stop ! * being part of the bucket while it's unlocked." ! * ! * The problem is that we don't hold exclusive lock on either bucket ! * during the split anymore (except for short time during the ! * preparation), so the shared lock held by caller of _hash_addovflpage ! * is useless. We can either: ! * ! * 1. Squeeze the bucket here, but consider that LH_BUCKET_SPLIT is still ! * set. That is, be "careful enough" not to disrupt any concurrent ! * execution of _hash_addovflpage (during insertion). (And remove the ! * mentioned comment about "at least share lock"). Does such a "tentative" ! * squeezing still make sense? ! * ! * 2. Move the "squeeze" phase elsewhere (to VACUUM ?) and be careful ! * about LH_BUCKET_SPLIT flag before doing anything. ! * ! * Note: whichever approach we take, VACUUM must also respect ! * LH_BUCKET_SPLIT. */ } diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c new file mode 100644 index 3d66216..2321890 *** a/src/backend/access/hash/hashutil.c --- b/src/backend/access/hash/hashutil.c *************** _hash_checkpage(Relation rel, Buffer buf *** 185,192 **** if (flags) { HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page); ! if ((opaque->hasho_flag & flags) == 0) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", --- 185,196 ---- if (flags) { HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page); + uint16 oflags = opaque->hasho_flag; ! if (((oflags & flags) == 0) || ! ((oflags & LH_BUCKET_SPLIT) && !(oflags & LH_BUCKET_PAGE)) || ! ((oflags & LH_BUCKET_SPLIT_LAST) && ! !(oflags & (LH_BUCKET_PAGE | LH_OVERFLOW_PAGE)))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", diff --git a/src/include/access/hash.h b/src/include/access/hash.h new file mode 100644 index fc3c7f4..956e5a2 *** a/src/include/access/hash.h --- b/src/include/access/hash.h *************** typedef uint32 Bucket; *** 46,57 **** * available to other buckets by calling _hash_freeovflpage(). If all * the tuples are deleted from a bucket page, no additional action is * necessary. */ #define LH_UNUSED_PAGE (0) #define LH_OVERFLOW_PAGE (1 << 0) #define LH_BUCKET_PAGE (1 << 1) ! #define LH_BITMAP_PAGE (1 << 2) ! #define LH_META_PAGE (1 << 3) typedef struct HashPageOpaqueData { --- 46,77 ---- * available to other buckets by calling _hash_freeovflpage(). If all * the tuples are deleted from a bucket page, no additional action is * necessary. + * + * LH_BUCKET_SPLIT can only be set if LH_BUCKET_PAGE is already set. It + * indicates that bucket participates in a split - whether the bucket being + * split or a new one, receiving tuples from an existing bucket. This flag + * compensates for the fact that no lock is held at bucket level for major + * part of the split. Whoever wants to move tuples across pages (VACUUM, + * _hash_squeezebucket) should check for this flags *in addition* to having + * acquired exclusive lock on the bucket. (The exclusive lock itself is + * sufficient protection for running scans.) + * + * XXX Do we need 2 separate flags, one for the "old" and one for the "new" + * bucket? 
+ * + * LH_BUCKET_SPLIT_LAST can be set either on primary page of a bucket being + * created during a split (such page already has LH_BUCKET_SPLIT set), or - if + * the bucket already contains overflow pages - on the last overflow page of + * such bucket. It tells that hasho_nextblkno of the page points to the first + * not-yet-processed page of the bucket being split. */ #define LH_UNUSED_PAGE (0) #define LH_OVERFLOW_PAGE (1 << 0) #define LH_BUCKET_PAGE (1 << 1) ! #define LH_BUCKET_SPLIT (1 << 2) ! #define LH_BUCKET_SPLIT_LAST (1 << 3) ! #define LH_BITMAP_PAGE (1 << 4) ! #define LH_META_PAGE (1 << 5) typedef struct HashPageOpaqueData { *************** typedef struct HashScanOpaqueData *** 88,94 **** bool hashso_bucket_valid; /* ! * If we have a share lock on the bucket, we record it here. When * hashso_bucket_blkno is zero, we have no such lock. */ BlockNumber hashso_bucket_blkno; --- 108,114 ---- bool hashso_bucket_valid; /* ! * If we have a share lock on the bucket, we record it here. When * hashso_bucket_blkno is zero, we have no such lock. */ BlockNumber hashso_bucket_blkno; *************** typedef struct HashScanOpaqueData *** 111,116 **** --- 131,152 ---- typedef HashScanOpaqueData *HashScanOpaque; /* + * Details of a split in progress. + * + * old_bucket is the bucket being split, by moving some tuples into + * new_bucket. + * + * TODO Consider renaming to BucketSplitData. + */ + typedef struct HashSplitData + { + Bucket old_bucket; + Bucket new_bucket; + } HashSplitData; + + typedef HashSplitData *HashSplit; + + /* * Definitions for metapage. */ *************** typedef HashScanOpaqueData *HashScanOpaq *** 138,145 **** --- 174,185 ---- * There is no particular upper limit on the size of mapp[], other than * needing to fit into the metapage. (With 8K block size, 128 bitmaps * limit us to 64 Gb of overflow space...) + * + * splits contains list of bucket splits in progress. Concurrent scans and + * insertions need to be aware of them. */ #define HASH_MAX_SPLITPOINTS 32 + #define HASH_MAX_SPLITS 8 /* TODO How many can we afford? */ #define HASH_MAX_BITMAPS 128 typedef struct HashMetaPageData *************** typedef struct HashMetaPageData *** 153,158 **** --- 193,199 ---- * of 2 */ uint16 hashm_bmshift; /* log2(bitmap array size in BITS) */ uint32 hashm_maxbucket; /* ID of maximum bucket in use */ + uint16 hashm_split_count; /* number of splits in progress */ uint32 hashm_highmask; /* mask to modulo into entire table */ uint32 hashm_lowmask; /* mask to modulo into lower half of table */ uint32 hashm_ovflpoint;/* splitpoint from which ovflpgs being *************** typedef struct HashMetaPageData *** 163,168 **** --- 204,210 ---- uint32 hashm_spares[HASH_MAX_SPLITPOINTS]; /* spare pages before * each splitpoint */ BlockNumber hashm_mapp[HASH_MAX_BITMAPS]; /* blknos of ovfl bitmaps */ + HashSplitData hashm_splits[HASH_MAX_SPLITS]; /* splits in progress */ } HashMetaPageData; typedef HashMetaPageData *HashMetaPage; *************** extern OffsetNumber _hash_pgaddtup(Relat *** 291,296 **** --- 333,340 ---- /* hashovfl.c */ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf); + extern Buffer _hash_getovflpage(Relation rel, Buffer metabuf); + extern Buffer _hash_get_new_bucket_end(Relation rel, Buffer start); extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy); extern void _hash_initbitmap(Relation rel, HashMetaPage metap,
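(Appended for illustration, not part of the patch.) The structure-checking
tool mentioned at the top could start from a per-bucket walk roughly like the
sketch below. The function name and the WARNING-based reporting are made up;
a real pageinspect enhancement would also need to cross-check hashm_splits in
the metapage.

#include "postgres.h"
#include "access/hash.h"

/*
 * Hypothetical consistency check, sketch only: verify the split flags of one
 * bucket chain.  'split_recorded' says whether hashm_splits[] in the metapage
 * currently contains an entry involving this bucket.
 */
static void
sketch_check_split_flags(Relation rel, BlockNumber bucket_blkno,
						 bool split_recorded)
{
	BlockNumber blkno = bucket_blkno;
	int			expected = LH_BUCKET_PAGE;

	while (BlockNumberIsValid(blkno))
	{
		Buffer		buf = _hash_getbuf(rel, blkno, HASH_READ, expected);
		Page		page = BufferGetPage(buf);
		HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		uint16		flags = opaque->hasho_flag;

		/*
		 * When a split ends, the flags are cleared before the entry is
		 * removed from hashm_splits, so a recorded split is a precondition
		 * for either flag being set.
		 */
		if ((flags & (LH_BUCKET_SPLIT | LH_BUCKET_SPLIT_LAST)) &&
			!split_recorded)
			elog(WARNING, "split flags set on page %u but no split recorded",
				 blkno);

		/*
		 * Stop at the redirection page: the pages behind it belong to the
		 * bucket being split, not to the bucket being checked.
		 */
		blkno = (flags & LH_BUCKET_SPLIT_LAST) ? InvalidBlockNumber :
			opaque->hasho_nextblkno;
		expected = LH_OVERFLOW_PAGE;

		_hash_relbuf(rel, buf);
	}
}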