On 10/13/18, Amit Kapila <amit.kapil...@gmail.com> wrote: > On Sat, Oct 6, 2018 at 12:17 AM John Naylor <jcnay...@gmail.com> wrote: >> -For normal mode, I taught fsm_set_and_search() to switch to a >> non-extending buffer call, but the biggest missing piece is WAL >> replay. >> > > fsm_set_and_search() > { > .. > + /* > + * For heaps we prevent extension of the FSM unless the number of pages > + * exceeds > HEAP_FSM_EXTENSION_THRESHOLD. For tables that don't already > + * have a FSM, this will save an inode and a few kB > of space. > + * For sane threshold values, the FSM address will be zero, so we > + * don't bother dealing with > anything else. > + */ > + if (rel->rd_rel->relkind == RELKIND_RELATION > + && addr.logpageno == 0) > > I am not sure if this is a solid way to avoid creating FSM. What if > fsm_set_and_search gets called for the level other than 0?
Thanks for taking a look. As for levels other than 0, I think that only happens when fsm_set_and_search() is called by fsm_search(), which will not cause extension. > Also, > when the relation has blocks more than HEAP_FSM_EXTENSION_THRESHOLD, > then first time when vacuum will try to record the free space in the > page, won't it skip recording free space for first > HEAP_FSM_EXTENSION_THRESHOLD pages? Hmm, that's a good point. > I think you have found a good way to avoid creating FSM, but can't we > use some simpler technique like if the FSM fork for a relation doesn't > exist, then check the heapblk number for which we try to update the > FSM and if it is lesser than HEAP_FSM_EXTENSION_THRESHOLD, then avoid > creating the FSM. I think I see what you mean, but to avoid the vacuum problem you just mentioned, we'd need to check the relation size, too. I've attached an unpolished revision to do this. It seems to work, but I haven't tested the vacuum issue yet. I'll do that and some COPY performance testing in the next day or so. There's a bit more repetition than I would like, so I'm not sure it's simpler - perhaps RecordPageWithFreeSpace() could be turned into a wrapper around RecordAndGetPageWithFreeSpace(). Also new in this version, some non-functional improvements to hio.c: -debugging calls that are #ifdef'd out. -move some code out into a function instead of adding another goto. >> I couldn't find a non-extending equivalent of >> XLogReadBufferExtended(), so I might have to create one. >> > > I think it would be better if we can find a common way to avoid > creating FSM both during DO and REDO time. It might be possible if > somethin like what I have said above is feasible. That would be ideal. -John Naylor
From e40e94071f998d0b8a04a1116de5d22aa8cd355c Mon Sep 17 00:00:00 2001 From: John Naylor <jcnay...@gmail.com> Date: Sun, 14 Oct 2018 02:25:34 +0700 Subject: [PATCH v3] Avoid creation of the free space map for small tables. The FSM isn't created if the heap has fewer than 10 blocks. If the last known good block has insufficient space, try every block before extending the heap. If a heap with a FSM is truncated back to below the threshold, the FSM stays around and can be used as usual. --- contrib/pageinspect/expected/page.out | 77 ++++++------ contrib/pageinspect/sql/page.sql | 33 ++++-- src/backend/access/heap/hio.c | 137 +++++++++++++++++----- src/backend/storage/freespace/freespace.c | 95 +++++++++++---- src/include/storage/freespace.h | 4 + 5 files changed, 244 insertions(+), 102 deletions(-) diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 3fcd9fbe6d..83e5910453 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -1,48 +1,69 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); -VACUUM test1; -- set up FSM +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT g from generate_series(1,10000) g; +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. -SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; main_0 -------- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; -ERROR: block number 1 is out of range for relation "test1" -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; +ERROR: block number 100 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; fsm_0 ------- 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; - fsm_1 -------- - 8192 -(1 row) - -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10; +ERROR: block number 10 is out of range for relation "test_rel_forks" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; vm_0 ------ 8192 (1 row) -SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; -ERROR: block number 1 is out of range for relation "test1" +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1; +ERROR: block number 1 is out of range for relation "test_rel_forks" SELECT octet_length(get_raw_page('xxx', 'main', 0)); ERROR: relation "xxx" does not exist -SELECT octet_length(get_raw_page('test1', 'xxx', 0)); +SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0)); ERROR: invalid fork name HINT: Valid fork names are "main", "fsm", "vm", and "init". -SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0); +SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0)); + fsm_page_contents +------------------- + 0: 192 + + 1: 192 + + 3: 192 + + 7: 192 + + 15: 192 + + 31: 192 + + 63: 192 + + 127: 192 + + 255: 192 + + 511: 192 + + 1023: 192 + + 2047: 192 + + 4095: 192 + + fp_next_slot: 0 + + +(1 row) + +SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0); ?column? ---------- t (1 row) +DROP TABLE test_rel_forks; +CREATE TABLE test1 (a int, b int); +INSERT INTO test1 VALUES (16777217, 131584); SELECT pagesize, version FROM page_header(get_raw_page('test1', 0)); pagesize | version ----------+--------- @@ -62,26 +83,6 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi {"\\x01000001","\\x00020200"} (1 row) -SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); - fsm_page_contents -------------------- - 0: 254 + - 1: 254 + - 3: 254 + - 7: 254 + - 15: 254 + - 31: 254 + - 63: 254 + - 127: 254 + - 255: 254 + - 511: 254 + - 1023: 254 + - 2047: 254 + - 4095: 254 + - fp_next_slot: 0 + - -(1 row) - DROP TABLE test1; -- check that using any of these functions with a partitioned table or index -- would fail diff --git a/contrib/pageinspect/sql/page.sql b/contrib/pageinspect/sql/page.sql index 8ac9991837..ee811759d5 100644 --- a/contrib/pageinspect/sql/page.sql +++ b/contrib/pageinspect/sql/page.sql @@ -1,26 +1,35 @@ CREATE EXTENSION pageinspect; -CREATE TABLE test1 (a int, b int); -INSERT INTO test1 VALUES (16777217, 131584); +CREATE TABLE test_rel_forks (a int); +-- Make sure there are enough blocks in the heap for the FSM to be created. +INSERT INTO test_rel_forks SELECT g from generate_series(1,10000) g; -VACUUM test1; -- set up FSM +-- set up FSM and VM +VACUUM test_rel_forks; -- The page contents can vary, so just test that it can be read -- successfully, but don't keep the output. -SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0; -SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100; -SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0; -SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10; -SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0; -SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1; +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0; +SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1; SELECT octet_length(get_raw_page('xxx', 'main', 0)); -SELECT octet_length(get_raw_page('test1', 'xxx', 0)); +SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0)); + +SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0)); + +SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0); -SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0); +DROP TABLE test_rel_forks; + +CREATE TABLE test1 (a int, b int); +INSERT INTO test1 VALUES (16777217, 131584); SELECT pagesize, version FROM page_header(get_raw_page('test1', 0)); @@ -29,8 +38,6 @@ SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits) FROM heap_page_items(get_raw_page('test1', 0)); -SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); - DROP TABLE test1; -- check that using any of these functions with a partitioned table or index diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index b8b5871559..dd221b8f5d 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -24,6 +24,12 @@ #include "storage/lmgr.h" #include "storage/smgr.h" +/*#define TRACE_TARGETBLOCK */ + +static BlockNumber get_page_no_fsm(Relation relation, + BlockNumber prevBlockAttempted, + bool *try_every_page); + /* * RelationPutHeapTuple - place tuple at specified page @@ -315,13 +321,15 @@ RelationGetBufferForTuple(Relation relation, Size len, BulkInsertState bistate, Buffer *vmbuffer, Buffer *vmbuffer_other) { - bool use_fsm = !(options & HEAP_INSERT_SKIP_FSM); + bool always_extend = (options & HEAP_INSERT_SKIP_FSM), + try_every_page = false; Buffer buffer = InvalidBuffer; Page page; Size pageFreeSpace = 0, saveFreeSpace = 0; BlockNumber targetBlock, - otherBlock; + otherBlock, + prevBlockAttempted; bool needLock; len = MAXALIGN(len); /* be conservative */ @@ -355,47 +363,41 @@ RelationGetBufferForTuple(Relation relation, Size len, * loop around and retry multiple times. (To insure this isn't an infinite * loop, we must update the FSM with the correct amount of free space on * each page that proves not to be suitable.) If the FSM has no record of - * a page with enough free space, we give up and extend the relation. + * a page with enough free space, we try every page if the heap is small, + * or give up and extend the relation. * - * When use_fsm is false, we either put the tuple onto the existing target - * page or extend the relation. + * When always_extend is true, we either put the tuple onto the existing + * target page or extend the relation. */ if (len + saveFreeSpace > MaxHeapTupleSize) { /* can't fit, don't bother asking FSM */ targetBlock = InvalidBlockNumber; - use_fsm = false; + always_extend = true; } else if (bistate && bistate->current_buf != InvalidBuffer) targetBlock = BufferGetBlockNumber(bistate->current_buf); else targetBlock = RelationGetTargetBlock(relation); - if (targetBlock == InvalidBlockNumber && use_fsm) + if (targetBlock == InvalidBlockNumber && !always_extend) { /* * We have no cached target page, so ask the FSM for an initial * target. */ targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace); - - /* - * If the FSM knows nothing of the rel, try the last page before we - * give up and extend. This avoids one-tuple-per-page syndrome during - * bootstrapping or in a recently-started system. - */ if (targetBlock == InvalidBlockNumber) - { - BlockNumber nblocks = RelationGetNumberOfBlocks(relation); - - if (nblocks > 0) - targetBlock = nblocks - 1; - } + targetBlock = get_page_no_fsm(relation, InvalidBlockNumber, + &try_every_page); } loop: while (targetBlock != InvalidBlockNumber) { +#ifdef TRACE_TARGETBLOCK + elog(DEBUG1, "Attempting block %d", targetBlock); +#endif /* * Read and exclusive-lock the target block, as well as the other * block if one was given, taking suitable care with lock ordering and @@ -482,6 +484,9 @@ loop: pageFreeSpace = PageGetHeapFreeSpace(page); if (len + saveFreeSpace <= pageFreeSpace) { +#ifdef TRACE_TARGETBLOCK + elog(DEBUG1, "Returning buffer for block %d", targetBlock); +#endif /* use this page as future insert target, too */ RelationSetTargetBlock(relation, targetBlock); return buffer; @@ -502,18 +507,36 @@ loop: ReleaseBuffer(buffer); } - /* Without FSM, always fall out of the loop and extend */ - if (!use_fsm) + if (always_extend) break; - /* - * Update FSM as to condition of this page, and ask for another page - * to try. - */ - targetBlock = RecordAndGetPageWithFreeSpace(relation, - targetBlock, - pageFreeSpace, - len + saveFreeSpace); + if (try_every_page) + { + /* We've tried every page; extend. */ + if (targetBlock == 0) + break; + + /* Try the next lower block number. */ + targetBlock--; +#ifdef TRACE_TARGETBLOCK + elog(DEBUG1, "Trying next lower block number"); +#endif + } + else + { + /* + * Update FSM as to condition of this page, and ask for another + * page to try. + */ + prevBlockAttempted = targetBlock; + targetBlock = RecordAndGetPageWithFreeSpace(relation, + targetBlock, + pageFreeSpace, + len + saveFreeSpace); + if (targetBlock == InvalidBlockNumber) + targetBlock = get_page_no_fsm(relation, prevBlockAttempted, + &try_every_page); + } } /* @@ -534,7 +557,7 @@ loop: */ if (needLock) { - if (!use_fsm) + if (always_extend || try_every_page) LockRelationForExtension(relation, ExclusiveLock); else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock)) { @@ -554,6 +577,10 @@ loop: if (targetBlock != InvalidBlockNumber) { UnlockRelationForExtension(relation, ExclusiveLock); + + /* This shouldn't be true, but let's make sure it isn't. */ + try_every_page = false; + goto loop; } @@ -627,3 +654,53 @@ loop: return buffer; } + +/* + * If the FSM has no information, first try the last page in the relation + * if we haven't already. This avoids one-tuple-per-page syndrome during + * bootstrapping or in a recently-started system. + * + * If the heap is small enough, it likely has no FSM (or a truncated one), + * but even if it does, just try every page. + * + * If InvalidBlockNumber is returned, extend the relation. + */ +static BlockNumber +get_page_no_fsm(Relation relation, + BlockNumber prevBlockAttempted, + bool *try_every_page) +{ + BlockNumber nblocks = RelationGetNumberOfBlocks(relation), + targetBlock = InvalidBlockNumber; + + if (nblocks > 0) + { + targetBlock = nblocks - 1; + + if (nblocks <= HEAP_FSM_EXTENSION_THRESHOLD) + { + *try_every_page = true; +#ifdef TRACE_TARGETBLOCK + elog(DEBUG1, "Setting try_every_page"); +#endif + /* If we already tried the last page, skip it or extend. */ + if (targetBlock == prevBlockAttempted) + { + if (nblocks > 1) + { + Assert(targetBlock != InvalidBlockNumber); + targetBlock--; + } + else + targetBlock = InvalidBlockNumber; + } + } + else + { + /* If we already tried the last page, extend. */ + if (targetBlock == prevBlockAttempted) + targetBlock = InvalidBlockNumber; + } + } + return targetBlock; +} diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 7c4ad1c449..b62456c59f 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -111,6 +111,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat); static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, BlockNumber start, BlockNumber end, bool *eof); +static void check_fsm_size(Relation rel, BlockNumber blkno, bool *fsm_exists); /******** Public API ********/ @@ -125,8 +126,7 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, * will turn out to have too little space available by the time the caller * gets a lock on it. In that case, the caller should report the actual * amount of free space available on that page and then try again (see - * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned, - * extend the relation. + * RecordAndGetPageWithFreeSpace). */ BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded) @@ -153,12 +153,28 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, int search_cat = fsm_space_needed_to_cat(spaceNeeded); FSMAddress addr; uint16 slot; - int search_slot; + int search_slot = -1; + bool fsm_exists; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(oldPage, &slot); - search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat); + RelationOpenSmgr(rel); + check_fsm_size(rel, oldPage, &fsm_exists); + + if (!fsm_exists && rel->rd_rel->relkind == RELKIND_RELATION + && oldPage <= HEAP_FSM_EXTENSION_THRESHOLD) + { + /* See explanation in RecordPageWithFreeSpace */ + /* TODO: use a cached value and/or cache this value? */ + BlockNumber heap_nblocks = RelationGetNumberOfBlocks(rel); + if (heap_nblocks > HEAP_FSM_EXTENSION_THRESHOLD) + search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat); + else + return InvalidBlockNumber; + } + else + search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat); /* * If fsm_set_and_search found a suitable new block, return that. @@ -183,16 +199,36 @@ RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail) int new_cat = fsm_space_avail_to_cat(spaceAvail); FSMAddress addr; uint16 slot; + bool fsm_exists; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(heapBlk, &slot); - fsm_set_and_search(rel, addr, slot, new_cat, 0); + RelationOpenSmgr(rel); + check_fsm_size(rel, heapBlk, &fsm_exists); + + if (!fsm_exists && rel->rd_rel->relkind == RELKIND_RELATION + && heapBlk <= HEAP_FSM_EXTENSION_THRESHOLD) + { + /* + * For heaps we prevent extension of the FSM unless the number + * of pages exceeds HEAP_FSM_EXTENSION_THRESHOLD. For tables + * that don't already have a FSM, this will save an inode + * and a few kB of space. + */ + /* TODO: use a cached value and/or cache this value? */ + BlockNumber heap_nblocks = RelationGetNumberOfBlocks(rel); + if (heap_nblocks > HEAP_FSM_EXTENSION_THRESHOLD) + fsm_set_and_search(rel, addr, slot, new_cat, 0); + } + else + fsm_set_and_search(rel, addr, slot, new_cat, 0); } /* * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in * WAL replay + * TODO: Avoid creating a FSM here, also. */ void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, @@ -546,24 +582,10 @@ fsm_readbuf(Relation rel, FSMAddress addr, bool extend) { BlockNumber blkno = fsm_logical_to_physical(addr); Buffer buf; + bool dummy; RelationOpenSmgr(rel); - - /* - * If we haven't cached the size of the FSM yet, check it first. Also - * recheck if the requested block seems to be past end, since our cached - * value might be stale. (We send smgr inval messages on truncation, but - * not on extension.) - */ - if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber || - blkno >= rel->rd_smgr->smgr_fsm_nblocks) - { - if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) - rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr, - FSM_FORKNUM); - else - rel->rd_smgr->smgr_fsm_nblocks = 0; - } + check_fsm_size(rel, blkno, &dummy); /* Handle requests beyond EOF */ if (blkno >= rel->rd_smgr->smgr_fsm_nblocks) @@ -676,6 +698,9 @@ fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, int newslot = -1; buf = fsm_readbuf(rel, addr, true); + if (buf == InvalidBuffer) + return newslot; + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); @@ -904,3 +929,31 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, return max_avail; } + +/* + * If we haven't cached the size of the FSM yet, check it first. Also + * recheck if the requested block seems to be past end, since our cached + * value might be stale. (We send smgr inval messages on truncation, but + * not on extension.) + */ +static void +check_fsm_size(Relation rel, BlockNumber blkno, bool *fsm_exists) +{ + if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber || + blkno >= rel->rd_smgr->smgr_fsm_nblocks) + { + if (smgrexists(rel->rd_smgr, FSM_FORKNUM)) + { + *fsm_exists = true; + rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr, + FSM_FORKNUM); + } + else + { + *fsm_exists = false; + rel->rd_smgr->smgr_fsm_nblocks = 0; + } + } + else + *fsm_exists = true; +} \ No newline at end of file diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index 726eb30fb8..68e4d27818 100644 --- a/src/include/storage/freespace.h +++ b/src/include/storage/freespace.h @@ -18,6 +18,10 @@ #include "storage/relfilenode.h" #include "utils/relcache.h" +/* Only extend a heap's FSM if the heap has greater than this many blocks */ +/* TODO: Performance-test different values. */ +#define HEAP_FSM_EXTENSION_THRESHOLD 10 + /* prototypes for public functions in freespace.c */ extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk); extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded); -- 2.17.1