I just noticed that ALTER TABLE foo SET TABLESPACE new_tablespace;
doesn't optimise writing WAL, so when streaming enabled it will copy the whole table to WAL and across the wire. My understanding was that ALTER TABLE had been optimised, though not as much as could be in this case. Following patch writes a new WAL record that just says "copy foo to newts" and during replay we flush buffers and then re-execute the copy (but only when InArchiveRecovery). So the copy happens locally on the standby, not copying from primary to standby. We do this just with a little refactoring and a simple new WAL message. So about 64 bytes rather than potentially gigabytes of data, which seems important when using SR. Simple patch, no new concepts, so I figure to apply this now. (Yes, I need to bump WAL format id as well). Objections? -- Simon Riggs www.2ndQuadrant.com
*** a/src/backend/access/heap/heapam.c --- b/src/backend/access/heap/heapam.c *************** *** 4763,4768 **** heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record) --- 4763,4802 ---- UnlockReleaseBuffer(buffer); } + static void + heap_xlog_copyrelfile(XLogRecPtr lsn, XLogRecord *record) + { + xl_heap_copyrelfile *xlrec = (xl_heap_copyrelfile *) XLogRecGetData(record); + + /* + * No conflict processing required for XLOG_HEAP2_COPYFILE records + */ + + /* + * If this is just crash recovery then we already synced the + * changes to disk, so we don't need to replay the copy. That + * is equivalent to skipping writing WAL in other cases. + */ + if (InArchiveRecovery) + { + SMgrRelation src = smgropen(xlrec->node); + SMgrRelation dst = smgropen(xlrec->dstnode); + + Relation srcrel = CreateFakeRelcacheEntry(src->smgr_rnode); + + /* + * Flush all buffers for the source relfilenode before + * we do the copy outside of shared buffers. + */ + FlushRelationBuffers(srcrel); + + /* + * Replay the copy and sync + */ + heap_copy_relfilenode(src, dst, xlrec->forkNum, false); + } + } + void heap_redo(XLogRecPtr lsn, XLogRecord *record) { *************** *** 4824,4829 **** heap2_redo(XLogRecPtr lsn, XLogRecord *record) --- 4858,4866 ---- case XLOG_HEAP2_CLEANUP_INFO: heap_xlog_cleanup_info(lsn, record); break; + case XLOG_HEAP2_COPYFILE: + heap_xlog_copyrelfile(lsn, record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } *************** *** 4952,4961 **** heap2_desc(StringInfo buf, uint8 xl_info, char *rec) --- 4989,5090 ---- appendStringInfo(buf, "cleanup info: remxid %u", xlrec->latestRemovedXid); } + else if (info == XLOG_HEAP2_COPYFILE) + { + xl_heap_copyrelfile *xlrec = (xl_heap_copyrelfile *) rec; + + appendStringInfo(buf, "copy relfilenode: source rel %u/%u/%u;" + " destination rel %u/%u/%u; fork %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, + xlrec->dstnode.spcNode, xlrec->dstnode.dbNode, + xlrec->dstnode.relNode, + xlrec->forkNum); + } else appendStringInfo(buf, "UNKNOWN"); } + static void + log_copyrelfiles(SMgrRelation src, SMgrRelation dst, + ForkNumber forkNum) + { + xl_heap_copyrelfile xlrec; + XLogRecPtr recptr; + XLogRecData rdata[1]; + + START_CRIT_SECTION(); + + xlrec.node = src->smgr_rnode; + xlrec.dstnode = dst->smgr_rnode; + xlrec.forkNum = forkNum; + + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfHeapCopyRelFile; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = NULL; + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_COPYFILE, rdata); + + END_CRIT_SECTION(); + } + + /* + * heap_copy_relfilenode - copy source to destination relfilenode + * + * Copy data, block by block and then sync the file. All done outside + * shared buffers, so can only be executed with AccessExclusiveLock. + * + * Both smgrs must already exist + */ + void + heap_copy_relfilenode(SMgrRelation src, SMgrRelation dst, + ForkNumber forkNum, bool istemp) + { + BlockNumber nblocks; + BlockNumber blkno; + char buf[BLCKSZ]; + + /* + * We log a single instruction to truncate the destination and copy + * from source, to allow us to avoid writing WAL for every block. + * The WAL message is a no-op unless we are InArchiveRecovery. + */ + if (!InRecovery && !istemp) + log_copyrelfiles(src, dst, forkNum); + + nblocks = smgrnblocks(src, forkNum); + + for (blkno = 0; blkno < nblocks; blkno++) + { + smgrread(src, forkNum, blkno, buf); + + /* + * Now write the page. We say isTemp = true even if it's not a temp + * rel, because there's no need for smgr to schedule an fsync for this + * write; we'll do it ourselves below. + */ + smgrextend(dst, forkNum, blkno, buf, true); + } + + /* + * If the rel isn't temp, we must fsync it down to disk before it's safe + * to commit the transaction. (For a temp rel we don't care since the rel + * will be uninteresting after a crash anyway.) + * + * It's obvious that we must do this when not WAL-logging the copy. It's + * less obvious that we have to do it even if we did WAL-log the copied + * pages. The reason is that since we're copying outside shared buffers, a + * CHECKPOINT occurring during the copy has no way to flush the previously + * written data to disk (indeed it won't know the new rel even exists). A + * crash later on would replay WAL from the checkpoint, therefore it + * wouldn't replay our earlier WAL entries. If we do not fsync those pages + * here, they might still not be on disk when the crash occurs. + */ + if (!istemp) + smgrimmedsync(dst, forkNum); + } + /* * heap_sync - sync a heap, for use when no WAL has been written * *** a/src/backend/access/transam/xlog.c --- b/src/backend/access/transam/xlog.c *************** *** 163,170 **** static bool LocalRecoveryInProgress = true; */ static int LocalXLogInsertAllowed = -1; ! /* Are we recovering using offline XLOG archives? */ ! static bool InArchiveRecovery = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; --- 163,170 ---- */ static int LocalXLogInsertAllowed = -1; ! /* Are we doing more than a crash recovery? */ ! bool InArchiveRecovery = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; *** a/src/backend/commands/tablecmds.c --- b/src/backend/commands/tablecmds.c *************** *** 329,336 **** static void ATExecEnableDisableRule(Relation rel, char *rulename, char fires_when); static void ATExecAddInherit(Relation rel, RangeVar *parent); static void ATExecDropInherit(Relation rel, RangeVar *parent); - static void copy_relation_data(SMgrRelation rel, SMgrRelation dst, - ForkNumber forkNum, bool istemp); static const char *storage_name(char c); --- 329,334 ---- *************** *** 6995,7001 **** ATExecSetTableSpace(Oid tableOid, Oid newTableSpace) RelationCreateStorage(newrnode, rel->rd_istemp); /* copy main fork */ ! copy_relation_data(rel->rd_smgr, dstrel, MAIN_FORKNUM, rel->rd_istemp); /* copy those extra forks that exist */ for (forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) --- 6993,6999 ---- RelationCreateStorage(newrnode, rel->rd_istemp); /* copy main fork */ ! heap_copy_relfilenode(rel->rd_smgr, dstrel, MAIN_FORKNUM, rel->rd_istemp); /* copy those extra forks that exist */ for (forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) *************** *** 7003,7009 **** ATExecSetTableSpace(Oid tableOid, Oid newTableSpace) if (smgrexists(rel->rd_smgr, forkNum)) { smgrcreate(dstrel, forkNum, false); ! copy_relation_data(rel->rd_smgr, dstrel, forkNum, rel->rd_istemp); } } --- 7001,7007 ---- if (smgrexists(rel->rd_smgr, forkNum)) { smgrcreate(dstrel, forkNum, false); ! heap_copy_relfilenode(rel->rd_smgr, dstrel, forkNum, rel->rd_istemp); } } *************** *** 7021,7040 **** ATExecSetTableSpace(Oid tableOid, Oid newTableSpace) heap_close(pg_class, RowExclusiveLock); - /* - * Write an XLOG UNLOGGED record if WAL-logging was skipped because WAL - * archiving is not enabled. - */ - if (!XLogIsNeeded() && !rel->rd_istemp) - { - char reason[NAMEDATALEN + 40]; - - snprintf(reason, sizeof(reason), "ALTER TABLE SET TABLESPACE on \"%s\"", - RelationGetRelationName(rel)); - - XLogReportUnloggedStatement(reason); - } - relation_close(rel, NoLock); /* Make sure the reltablespace change is visible */ --- 7019,7024 ---- *************** *** 7048,7112 **** ATExecSetTableSpace(Oid tableOid, Oid newTableSpace) } /* - * Copy data, block by block - */ - static void - copy_relation_data(SMgrRelation src, SMgrRelation dst, - ForkNumber forkNum, bool istemp) - { - bool use_wal; - BlockNumber nblocks; - BlockNumber blkno; - char buf[BLCKSZ]; - Page page = (Page) buf; - - /* - * We need to log the copied data in WAL iff WAL archiving/streaming is - * enabled AND it's not a temp rel. - * - * Note: If you change the conditions here, update the conditions in - * ATExecSetTableSpace() for when an XLOG UNLOGGED record is written to - * match. - */ - use_wal = XLogIsNeeded() && !istemp; - - nblocks = smgrnblocks(src, forkNum); - - for (blkno = 0; blkno < nblocks; blkno++) - { - smgrread(src, forkNum, blkno, buf); - - /* XLOG stuff */ - if (use_wal) - log_newpage(&dst->smgr_rnode, forkNum, blkno, page); - - /* - * Now write the page. We say isTemp = true even if it's not a temp - * rel, because there's no need for smgr to schedule an fsync for this - * write; we'll do it ourselves below. - */ - smgrextend(dst, forkNum, blkno, buf, true); - } - - /* - * If the rel isn't temp, we must fsync it down to disk before it's safe - * to commit the transaction. (For a temp rel we don't care since the rel - * will be uninteresting after a crash anyway.) - * - * It's obvious that we must do this when not WAL-logging the copy. It's - * less obvious that we have to do it even if we did WAL-log the copied - * pages. The reason is that since we're copying outside shared buffers, a - * CHECKPOINT occurring during the copy has no way to flush the previously - * written data to disk (indeed it won't know the new rel even exists). A - * crash later on would replay WAL from the checkpoint, therefore it - * wouldn't replay our earlier WAL entries. If we do not fsync those pages - * here, they might still not be on disk when the crash occurs. - */ - if (!istemp) - smgrimmedsync(dst, forkNum); - } - - /* * ALTER TABLE ENABLE/DISABLE TRIGGER * * We just pass this off to trigger.c. --- 7032,7037 ---- *** a/src/include/access/heapam.h --- b/src/include/access/heapam.h *************** *** 119,124 **** extern void simple_heap_update(Relation relation, ItemPointer otid, --- 119,126 ---- extern void heap_markpos(HeapScanDesc scan); extern void heap_restrpos(HeapScanDesc scan); + extern void heap_copy_relfilenode(SMgrRelation src, SMgrRelation dst, + ForkNumber forkNum, bool istemp); extern void heap_sync(Relation relation); extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr); *** a/src/include/access/htup.h --- b/src/include/access/htup.h *************** *** 18,23 **** --- 18,24 ---- #include "access/tupmacs.h" #include "storage/itemptr.h" #include "storage/relfilenode.h" + #include "storage/smgr.h" /* * MaxTupleAttributeNumber limits the number of (user) columns in a tuple. *************** *** 583,588 **** typedef HeapTupleData *HeapTuple; --- 584,590 ---- #define XLOG_HEAP2_CLEAN 0x10 /* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */ #define XLOG_HEAP2_CLEANUP_INFO 0x30 + #define XLOG_HEAP2_COPYFILE 0x40 /* * All what we need to find changed tuple *************** *** 727,732 **** typedef struct xl_heap_freeze --- 729,744 ---- #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId)) + /* This is for logging the command to copy a whole relfilenode fork */ + typedef struct xl_heap_copyrelfile + { + RelFileNode node; /* node of source */ + RelFileNode dstnode; /* node of destination */ + ForkNumber forkNum; + } xl_heap_copyrelfile; + + #define SizeOfHeapCopyRelFile (sizeof(xl_heap_copyrelfile)) + extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId *latestRemovedXid); *** a/src/include/access/xlog.h --- b/src/include/access/xlog.h *************** *** 142,147 **** extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */ --- 142,149 ---- * see XLogCtl notes in xlog.c */ extern bool InRecovery; + /* Is this more than just a crash recovery? */ + extern bool InArchiveRecovery; /* * Like InRecovery, standbyState is only valid in the startup process.
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers