On Fri, Nov 27, 2015 at 3:42 PM, Michael Paquier wrote: > I am still investigating for a correct fix, looking at reinit.c the > code in charge of copying the init fork as the main fork for a > relation at the end of recovery looks to be doing its job correctly...
Attached is a patch that fixes the issue for me on master and 9.5. Actually, in the last patch I forgot a call to smgrwrite to ensure that the INIT_FORKNUM is correctly synced to disk when those pages are replayed at recovery, which lets the reset routines for unlogged relations do their job correctly. I have noticed as well that we need to do the same for gin and brin relations. In this case I think that we could limit the flush to unlogged relations; my patch does it unconditionally, though, to generalize the logic. Thoughts? -- Michael
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 99337b0..d7964ac 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -682,7 +682,12 @@ brinbuildempty(PG_FUNCTION_ARGS) brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index), BRIN_CURRENT_VERSION); MarkBufferDirty(metabuf); - log_newpage_buffer(metabuf, false); + /* + * When this full-page image is replayed, there is no guarantee that + * the page will be present on disk after replay, particularly for + * unlogged relations, hence enforce it to be flushed to disk. + */ + log_newpage_buffer(metabuf, false, true); END_CRIT_SECTION(); UnlockReleaseBuffer(metabuf); diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index f876f62..572fe20 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -865,7 +865,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer) page = BufferGetPage(buffer); brin_page_init(page, BRIN_PAGETYPE_REGULAR); MarkBufferDirty(buffer); - log_newpage_buffer(buffer, true); + log_newpage_buffer(buffer, true, false); END_CRIT_SECTION(); /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 49e9185..755c983 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -450,14 +450,21 @@ ginbuildempty(PG_FUNCTION_ARGS) ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE); - /* Initialize and xlog metabuffer and root buffer. */ + /*
 * Initialize and xlog metabuffer and root buffer. 
When those full + * pages are replayed, it is not guaranteed that those relation + * init forks will be flushed to disk after replaying them, hence + * enforce those pages to be flushed to disk at replay, only the + * last record will enforce a flush for performance reasons and + * because it is actually unnecessary to do it multiple times. + */ START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); - log_newpage_buffer(MetaBuffer, false); + log_newpage_buffer(MetaBuffer, false, false); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); - log_newpage_buffer(RootBuffer, false); + log_newpage_buffer(RootBuffer, false, true); END_CRIT_SECTION(); /* Unlock and release the buffers. */ diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 53bccf6..fd53268 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -84,7 +84,7 @@ gistbuildempty(PG_FUNCTION_ARGS) START_CRIT_SECTION(); GISTInitBuffer(buffer, F_LEAF); MarkBufferDirty(buffer); - log_newpage_buffer(buffer, true); + log_newpage_buffer(buffer, true, true); END_CRIT_SECTION(); /* Unlock and release the buffer */ diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 6a6fc3b..e9a9a8f 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -336,7 +336,8 @@ end_heap_rewrite(RewriteState state) MAIN_FORKNUM, state->rs_blockno, state->rs_buffer, - true); + true, + false); RelationOpenSmgr(state->rs_new_rel); PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); @@ -685,7 +686,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) MAIN_FORKNUM, state->rs_blockno, page, - true); + true, + false); /* * Now write the page. 
We say isTemp = true even if it's not a diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index cf4a6dc..d211a98 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -206,7 +206,7 @@ btbuildempty(PG_FUNCTION_ARGS) (char *) metapage, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - BTREE_METAPAGE, metapage, false); + BTREE_METAPAGE, metapage, false, true); /* * An immediate sync is required even if we xlog'd the page, because the diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index f95f67a..faf611c 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -277,7 +277,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) if (wstate->btws_use_wal) { /* We use the heap NEWPAGE record type for this */ - log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); + log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, + true, false); } /* diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 83cc9e8..5f6fed5 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -77,7 +77,9 @@ xlog_desc(StringInfo buf, XLogReaderState *record) appendStringInfoString(buf, xlrec->rp_name); } - else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + else if (info == XLOG_FPI || + info == XLOG_FPI_FOR_HINT || + info == XLOG_FPI_FOR_SYNC) { /* no further information to print */ } @@ -181,6 +183,9 @@ xlog_identify(uint8 info) case XLOG_FPI_FOR_HINT: id = "FPI_FOR_HINT"; break; + case XLOG_FPI_FOR_SYNC: + id = "FPI_FOR_SYNC"; + break; } return id; diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index bceee8d..15d148c 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -173,7 +173,7 @@ 
spgbuildempty(PG_FUNCTION_ARGS) (char *) page, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_METAPAGE_BLKNO, page, false); + SPGIST_METAPAGE_BLKNO, page, false, false); /* Likewise for the root page. */ SpGistInitPage(page, SPGIST_LEAF); @@ -183,7 +183,7 @@ spgbuildempty(PG_FUNCTION_ARGS) (char *) page, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_ROOT_BLKNO, page, true); + SPGIST_ROOT_BLKNO, page, true, false); /* Likewise for the null-tuples root page. */ SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); @@ -193,12 +193,15 @@ spgbuildempty(PG_FUNCTION_ARGS) (char *) page, true); if (XLogIsNeeded()) log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, - SPGIST_NULL_BLKNO, page, true); + SPGIST_NULL_BLKNO, page, true, true); /* * An immediate sync is required even if we xlog'd the pages, because the * writes did not go through shared buffers and therefore a concurrent * checkpoint may have moved the redo pointer past our xlog record. + * When the pages are WAL-logged, the sync of the relation is requested + * only once, by the last record emitted in this sequence. 
*/ smgrimmedsync(index->rd_smgr, INIT_FORKNUM); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f17f834..1cafe48 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9184,7 +9184,9 @@ xlog_redo(XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */ - Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || + Assert(info == XLOG_FPI || + info == XLOG_FPI_FOR_HINT || + info == XLOG_FPI_FOR_SYNC || !XLogRecHasAnyBlockRefs(record)); if (info == XLOG_NEXTOID) @@ -9374,7 +9376,9 @@ xlog_redo(XLogReaderState *record) { /* nothing to do here */ } - else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) + else if (info == XLOG_FPI || + info == XLOG_FPI_FOR_HINT || + info == XLOG_FPI_FOR_SYNC) { Buffer buffer; @@ -9392,9 +9396,28 @@ xlog_redo(XLogReaderState *record) * when checksums are enabled. There is no difference in handling * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info * code just to distinguish them for statistics purposes. + * + * XLOG_FPI_FOR_SYNC records are generated when a relation needs to + * be sync'ed just after replaying a full page. This is important + * to match the master behavior in the case where a page is written + * directly to disk without going through shared buffers, particularly + * when writing init forks for index relations. 
*/ if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); + + if (info == XLOG_FPI_FOR_SYNC) + { + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + SMgrRelation srel; + + (void) XLogRecGetBlockTag(record, 0, &rnode, &forknum, &blkno); + srel = smgropen(rnode, InvalidBackendId); + smgrwrite(srel, forknum, blkno, BufferGetPage(buffer), false); + } + UnlockReleaseBuffer(buffer); } else if (info == XLOG_BACKUP_END) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 925255f..ef0cc17 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -932,10 +932,13 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) * If the page follows the standard page layout, with a PageHeader and unused * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows * the unused space to be left out from the WAL record, making it smaller. + * + * If 'is_sync' is set to TRUE, relation will be requested to sync immediately + * its data at replay after replaying this full page image. */ XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, - Page page, bool page_std) + Page page, bool page_std, bool is_sync) { int flags; XLogRecPtr recptr; @@ -946,7 +949,7 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, XLogBeginInsert(); XLogRegisterBlock(0, rnode, forkNum, blkno, page, flags); - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + recptr = XLogInsert(RM_XLOG_ID, is_sync ? XLOG_FPI_FOR_SYNC : XLOG_FPI); /* * The page may be uninitialized. If so, we can't set the LSN because that @@ -969,9 +972,12 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, * If the page follows the standard page layout, with a PageHeader and unused * space between pd_lower and pd_upper, set 'page_std' to TRUE. 
That allows * the unused space to be left out from the WAL record, making it smaller. + * + * If 'is_sync' is set to TRUE, relation will be requested to sync immediately + * its data at replay after replaying this full page image. */ XLogRecPtr -log_newpage_buffer(Buffer buffer, bool page_std) +log_newpage_buffer(Buffer buffer, bool page_std, bool is_sync) { Page page = BufferGetPage(buffer); RelFileNode rnode; @@ -983,7 +989,7 @@ log_newpage_buffer(Buffer buffer, bool page_std) BufferGetTag(buffer, &rnode, &forkNum, &blkno); - return log_newpage(&rnode, forkNum, blkno, page, page_std); + return log_newpage(&rnode, forkNum, blkno, page, page_std, is_sync); } /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0ddde72..eb22a51 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -9920,7 +9920,8 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst, * space. */ if (use_wal) - log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false); + log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, + false, false); PageSetChecksumInplace(page, blkno); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 2429889..b0e3901 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -736,7 +736,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, */ if (RelationNeedsWAL(onerel) && PageGetLSN(page) == InvalidXLogRecPtr) - log_newpage_buffer(buf, true); + log_newpage_buffer(buf, true, false); PageSetAllVisible(page); visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 9f60687..fff0c7a 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -174,6 +174,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case 
XLOG_FPI: + case XLOG_FPI_FOR_SYNC: break; default: elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 31b45ba..bc91802 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -53,8 +53,10 @@ extern void XLogResetInsertion(void); extern bool XLogCheckBufferNeedsBackup(Buffer buffer); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, - BlockNumber blk, char *page, bool page_std); -extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std); + BlockNumber blk, char *page, bool page_std, + bool is_sync); +extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std, + bool is_sync); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); extern void InitXLogInsert(void); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index ad1eb4b..91445f1 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -73,6 +73,7 @@ typedef struct CheckPoint #define XLOG_END_OF_RECOVERY 0x90 #define XLOG_FPI_FOR_HINT 0xA0 #define XLOG_FPI 0xB0 +#define XLOG_FPI_FOR_SYNC 0xC0 /*
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers