Alvaro Herrera wrote: > Alvaro Herrera wrote: > > > Hmm, oh I see another problem here -- the bit is not restored when > > replaying heap_update's WAL record. I'm now wondering what other bits > > are set without much care about correctly restoring them during replay. > > I'm now wondering whether it'd be easier to just ignore pd_flags in > calculating the checksum.
Okay, so this is what I've done. pd_flags is skipped. Also the WAL routine logs both HeapTupleHeader infomasks and ItemId->lp_flags. On the latter point I'm not 100% sure of the cases where lp_flags must be logged; right now I'm only logging if the item is marked as "having storage" (the logic being that if an item does not have storage, then making it have storage requires a WAL entry, and vice versa). (This version has some debugging log entries which are obviously only WIP material.) -- Alvaro Herrera http://www.CommandPrompt.com/ PostgreSQL Replication, Consulting, Custom Development, 24x7 support
Index: src/backend/access/gist/gistget.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/gist/gistget.c,v retrieving revision 1.79 diff -c -p -r1.79 gistget.c *** src/backend/access/gist/gistget.c 22 Oct 2008 12:53:56 -0000 1.79 --- src/backend/access/gist/gistget.c 5 Nov 2008 21:17:40 -0000 *************** killtuple(Relation r, GISTScanOpaque so, *** 43,48 **** --- 43,49 ---- /* page unchanged, so all is simple */ offset = ItemPointerGetOffsetNumber(iptr); ItemIdMarkDead(PageGetItemId(p, offset)); + PageSetUnloggedChange(p); SetBufferCommitInfoNeedsSave(so->curbuf); } else *************** killtuple(Relation r, GISTScanOpaque so, *** 57,62 **** --- 58,64 ---- { /* found */ ItemIdMarkDead(PageGetItemId(p, offset)); + PageSetUnloggedChange(p); SetBufferCommitInfoNeedsSave(so->curbuf); break; } Index: src/backend/access/hash/hash.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/hash/hash.c,v retrieving revision 1.106 diff -c -p -r1.106 hash.c *** src/backend/access/hash/hash.c 17 Oct 2008 23:50:57 -0000 1.106 --- src/backend/access/hash/hash.c 5 Nov 2008 21:17:47 -0000 *************** hashgettuple(PG_FUNCTION_ARGS) *** 239,244 **** --- 239,245 ---- offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos)); page = BufferGetPage(so->hashso_curbuf); ItemIdMarkDead(PageGetItemId(page, offnum)); + PageSetUnloggedChange(page); /* * Since this can be redone later if needed, it's treated the same Index: src/backend/access/heap/heapam.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/heap/heapam.c,v retrieving revision 1.268 diff -c -p -r1.268 heapam.c *** src/backend/access/heap/heapam.c 31 Oct 2008 19:40:26 -0000 1.268 --- src/backend/access/heap/heapam.c 7 Nov 2008 19:01:57 -0000 *************** log_newpage(RelFileNode *rnode, ForkNumb *** 
4008,4013 **** --- 4008,4102 ---- } /* + * Perform XLogInsert for hint bits changes in a page. This handles hint + * bits set in HeapTupleHeaderData (t_infomask and t_infomask2) as well as + * those set in ItemIdData->lp_flags. + * + * This is intended to be called right before writing a page from shared + * buffers to disk. + * + * The approach used here, instead of WAL-logging every change, is to produce + * a complete record of the current state of hint bits in a page just before + * flushing it. There are two downsides to this approach: first, it stores + * all hint bits in the page, not only those that changed; and second, that + * the flusher of the page needs to flush a lot more of the WAL (namely up + * to this new record's LSN) than the original LSN marked on the page. + */ + XLogRecPtr + log_hintbits(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page) + { + xl_heap_hintbits xlrec; + OffsetNumber i; + XLogRecPtr recptr; + XLogRecData rdata[2]; + char *bits; + int pos = 0; + StringInfoData buf; + + /* + * 1 byte for line pointer bits, 2 bytes for infomask, + * 2 bytes for infomask2 + */ + bits = palloc(MaxHeapTuplesPerPage * 5); + + initStringInfo(&buf); + appendStringInfo(&buf, "page %u: ", blkno); + + for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); + i = OffsetNumberNext(i)) + { + HeapTupleHeader htup; + ItemId lp = PageGetItemId(page, i); + + if (!ItemIdHasStorage(lp)) + continue; + + appendStringInfo(&buf, "offset %d: flags %02x ", i, lp->lp_flags); + + bits[pos++] = lp->lp_flags; + htup = (HeapTupleHeader) PageGetItem(page, lp); + + *((uint16 *) (bits + pos)) = htup->t_infomask & HEAP_XACT_MASK; + appendStringInfo(&buf, "infomask %04x/%04x ", htup->t_infomask, + htup->t_infomask & HEAP_XACT_MASK); + pos += 2; + *((uint16 *) (bits + pos)) = htup->t_infomask2 & HEAP2_XACT_MASK; + appendStringInfo(&buf, "infomask2 %04x/%04x\n", htup->t_infomask2, + htup->t_infomask2 & HEAP2_XACT_MASK); + pos += 2; + } + + elog(LOG, "%s", 
buf.data); + pfree(buf.data); + + /* NO ELOG(ERROR) from here till hint bits are logged */ + START_CRIT_SECTION(); + + xlrec.node = *rnode; + xlrec.block = blkno; + + rdata[0].data = (char *) &xlrec; + rdata[0].len = SizeOfHeapHintbits; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + + rdata[1].data = (char *) bits; + rdata[1].len = pos; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_HINTBITS, rdata); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + + END_CRIT_SECTION(); + + return recptr; + } + + /* * Handles CLEAN and CLEAN_MOVE record types */ static void *************** heap_xlog_freeze(XLogRecPtr lsn, XLogRec *** 4125,4130 **** --- 4214,4302 ---- } static void + heap_xlog_hintbits(XLogRecPtr lsn, XLogRecord *record) + { + xl_heap_hintbits *xlrec = (xl_heap_hintbits *) XLogRecGetData(record); + Buffer buffer; + Page page; + + buffer = XLogReadBuffer(xlrec->node, xlrec->block, false); + if (!BufferIsValid(buffer)) + return; + page = (Page) BufferGetPage(buffer); + + if (XLByteLE(lsn, PageGetLSN(page))) + { + UnlockReleaseBuffer(buffer); + return; + } + + if (record->xl_len > SizeOfHeapHintbits) + { + char *bits; + char *bits_end; + OffsetNumber offset = FirstOffsetNumber; + StringInfoData buf; + + + bits = (char *) xlrec + SizeOfHeapHintbits; + bits_end = (char *) xlrec + record->xl_len; + + initStringInfo(&buf); + appendStringInfo(&buf, "page %u: ", xlrec->block); + + while (bits < bits_end) + { + + for (;;) + { + HeapTupleHeader htup; + ItemId lp = PageGetItemId(page, offset); + + if (!ItemIdHasStorage(lp)) + { + offset++; + continue; + } + + /* set the page flags */ + lp->lp_flags = *bits; + bits++; + appendStringInfo(&buf, "offset %d: flags %02x ", offset, + lp->lp_flags); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* set the right bits in infomask */ + htup->t_infomask = *(uint16 *) bits | + (htup->t_infomask & ~HEAP_XACT_MASK); + 
appendStringInfo(&buf, "infomask %04x/%04x ", htup->t_infomask, + *(uint16 *) bits); + bits += 2; + + /* set the right bits in infomask2 */ + htup->t_infomask2 = *(uint16 *) bits | + (htup->t_infomask2 & ~HEAP2_XACT_MASK); + appendStringInfo(&buf, "infomask2 %04x/%04x\n", htup->t_infomask2, + *(uint16 *) bits); + bits += 2; + + offset++; + + break; + } + } + elog(LOG, "%s", buf.data); + pfree(buf.data); + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + } + + static void heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record) { xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record); *************** heap_xlog_update(XLogRecPtr lsn, XLogRec *** 4389,4394 **** --- 4561,4568 ---- */ if (samepage) goto newsame; + if (!hot_update && !move) + PageSetFull(page); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); *************** heap2_redo(XLogRecPtr lsn, XLogRecord *r *** 4664,4669 **** --- 4838,4846 ---- case XLOG_HEAP2_CLEAN_MOVE: heap_xlog_clean(lsn, record, true); break; + case XLOG_HEAP2_HINTBITS: + heap_xlog_hintbits(lsn, record); + break; default: elog(PANIC, "heap2_redo: unknown op code %u", info); } *************** heap2_desc(StringInfo buf, uint8 xl_info *** 4805,4810 **** --- 4982,4995 ---- xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, xlrec->block); } + else if (info == XLOG_HEAP2_HINTBITS) + { + xl_heap_hintbits *xlrec = (xl_heap_hintbits *) rec; + + appendStringInfo(buf, "hintbits: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } else appendStringInfo(buf, "UNKNOWN"); } Index: src/backend/access/nbtree/nbtinsert.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/nbtree/nbtinsert.c,v retrieving revision 1.168 diff -c -p -r1.168 nbtinsert.c *** src/backend/access/nbtree/nbtinsert.c 3 Nov 2008 
20:47:48 -0000 1.168 --- src/backend/access/nbtree/nbtinsert.c 5 Nov 2008 21:18:28 -0000 *************** _bt_check_unique(Relation rel, IndexTupl *** 308,313 **** --- 308,314 ---- * killed. */ ItemIdMarkDead(curitemid); + PageSetUnloggedChange(page); opaque->btpo_flags |= BTP_HAS_GARBAGE; /* be sure to mark the proper buffer dirty... */ if (nbuf != InvalidBuffer) Index: src/backend/access/nbtree/nbtutils.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/nbtree/nbtutils.c,v retrieving revision 1.91 diff -c -p -r1.91 nbtutils.c *** src/backend/access/nbtree/nbtutils.c 19 Jun 2008 00:46:03 -0000 1.91 --- src/backend/access/nbtree/nbtutils.c 5 Nov 2008 21:20:11 -0000 *************** _bt_killitems(IndexScanDesc scan, bool h *** 1153,1158 **** --- 1153,1159 ---- { /* found the item */ ItemIdMarkDead(iid); + PageSetUnloggedChange(page); killedsomething = true; break; /* out of inner search loop */ } Index: src/backend/storage/buffer/bufmgr.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/buffer/bufmgr.c,v retrieving revision 1.240 diff -c -p -r1.240 bufmgr.c *** src/backend/storage/buffer/bufmgr.c 31 Oct 2008 15:05:00 -0000 1.240 --- src/backend/storage/buffer/bufmgr.c 7 Nov 2008 19:03:51 -0000 *************** *** 33,38 **** --- 33,39 ---- #include <sys/file.h> #include <unistd.h> + #include "access/heapam.h" #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" *************** *** 42,47 **** --- 43,49 ---- #include "storage/ipc.h" #include "storage/proc.h" #include "storage/smgr.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/resowner.h" *************** BgBufferSync(void) *** 1464,1470 **** * BUF_REUSABLE: buffer is available for replacement, ie, it has * pin count 0 and usage count 0. * ! 
* (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean * after locking it, but we don't care all that much.) * * Note: caller must have done ResourceOwnerEnlargeBuffers. --- 1466,1472 ---- * BUF_REUSABLE: buffer is available for replacement, ie, it has * pin count 0 and usage count 0. * ! * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean * after locking it, but we don't care all that much.) * * Note: caller must have done ResourceOwnerEnlargeBuffers. *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1774,1779 **** --- 1776,1789 ---- { XLogRecPtr recptr; ErrorContextCallback errcontext; + static char *dblbuf = NULL; + bool done = false; + + if (enable_block_checksums && dblbuf == NULL) + { + dblbuf = MemoryContextAlloc(TopMemoryContext, BLCKSZ + ALIGNOF_BUFFER); + dblbuf = (char *) BUFFERALIGN(dblbuf); + } /* * Acquire the buffer's io_in_progress lock. If StartBufferIO returns *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1798,1803 **** --- 1808,1837 ---- reln->smgr_rnode.relNode); /* + * We make a copy of the buffer to write. + */ + if (enable_block_checksums) + memcpy(dblbuf, BufHdrGetBlock(buf), BLCKSZ); + + /* + * If the page has been modified by a hint bit setter, ensure we WAL-log + * their changes before actually writing the page; otherwise the CRC we're + * about to store could be invalid if the page is torn. Note: we check + * the flag on the shared-memory copy of the buffer, not the private copy + * we just made, to forestall the possibility that hint bits could have + * been set in the later parts of the page after we copied the flag in + * unset state. + */ + if (enable_block_checksums && PageHasUnloggedChange(BufHdrGetBlock(buf)) && + !InRecovery) + { + /* XXX cast away the "volatile" qualifier */ + log_hintbits(&((BufferDesc *) buf)->tag.rnode, buf->tag.forkNum, + buf->tag.blockNum, BufHdrGetBlock(buf)); + done = true; + } + + /* * Force XLOG flush up to buffer's LSN.
This implements the basic WAL * rule that log updates must hit disk before any of the data-file changes * they describe do. *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1819,1825 **** smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, ! (char *) BufHdrGetBlock(buf), false); BufferFlushCount++; --- 1853,1859 ---- smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, ! enable_block_checksums ? dblbuf : BufHdrGetBlock(buf), false); BufferFlushCount++; Index: src/backend/storage/page/bufpage.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/page/bufpage.c,v retrieving revision 1.81 diff -c -p -r1.81 bufpage.c *** src/backend/storage/page/bufpage.c 3 Nov 2008 20:47:48 -0000 1.81 --- src/backend/storage/page/bufpage.c 3 Nov 2008 22:37:02 -0000 *************** PageInit(Page page, Size pageSize, Size *** 41,46 **** --- 41,47 ---- MemSet(p, 0, pageSize); /* p->pd_flags = 0; done by above MemSet */ + p->pd_checksum = PAGE_INVALID_CHECKSUM; p->pd_lower = SizeOfPageHeaderData; p->pd_upper = pageSize - specialSize; p->pd_special = pageSize - specialSize; *************** PageHeaderIsValid(PageHeader page) *** 84,92 **** page->pd_special == MAXALIGN(page->pd_special)) return true; ! /* Check all-zeroes case */ pagebytes = (char *) page; ! for (i = 0; i < BLCKSZ; i++) { if (pagebytes[i] != 0) return false; --- 85,93 ---- page->pd_special == MAXALIGN(page->pd_special)) return true; ! /* Check all-zeroes case (skipping the checksum) */ pagebytes = (char *) page; ! 
for (i = sizeof(PAGE_CHECKSUM_TYPE); i < BLCKSZ; i++) { if (pagebytes[i] != 0) return false; Index: src/backend/storage/smgr/smgr.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/smgr/smgr.c,v retrieving revision 1.112 diff -c -p -r1.112 smgr.c *** src/backend/storage/smgr/smgr.c 30 Sep 2008 10:52:13 -0000 1.112 --- src/backend/storage/smgr/smgr.c 3 Nov 2008 22:37:02 -0000 *************** *** 27,32 **** --- 27,35 ---- #include "utils/memutils.h" + /* Perform block checksumming for corruption detection */ + bool enable_block_checksums = false; + /* * This struct of function pointers defines the API between smgr.c and * any individual storage manager module. Note that smgr subfunctions are *************** void *** 503,508 **** --- 506,515 ---- smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp) { + /* Perform block checksumming for corruption detection */ + if (enable_block_checksums) + WritePageChecksum(buffer); + (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum, buffer, isTemp); } *************** smgrread(SMgrRelation reln, ForkNumber f *** 520,525 **** --- 527,551 ---- char *buffer) { (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer); + + /* Perform block checksumming for corruption detection */ + if (enable_block_checksums && PageGetChecksum(buffer) != PAGE_INVALID_CHECKSUM) + { + PAGE_CHECKSUM_TYPE chksum; + + CalcPageChecksum(buffer, chksum); + + if (chksum != PageGetChecksum(buffer)) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid checksum on read of block %u of relation %u/%u/%u", + blocknum, + reln->smgr_rnode.spcNode, + reln->smgr_rnode.dbNode, + reln->smgr_rnode.relNode))); + } + } } /* *************** void *** 541,546 **** --- 567,578 ---- smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp) { + /* + * Perform block 
checksumming before writing. + */ + if (enable_block_checksums) + WritePageChecksum(buffer); + (*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum, buffer, isTemp); } Index: src/backend/utils/misc/guc.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/guc.c,v retrieving revision 1.475 diff -c -p -r1.475 guc.c *** src/backend/utils/misc/guc.c 6 Oct 2008 13:05:36 -0000 1.475 --- src/backend/utils/misc/guc.c 3 Nov 2008 22:37:02 -0000 *************** *** 57,62 **** --- 57,63 ---- #include "regex/regex.h" #include "storage/bufmgr.h" #include "storage/fd.h" + #include "storage/smgr.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" *************** static struct config_bool ConfigureNames *** 762,767 **** --- 763,778 ---- false, NULL, NULL }, { + {"perform_checksum", PGC_SIGHUP, UNGROUPED, + gettext_noop("Forces checksumming of blocks to/from disk."), + gettext_noop("The server will perform a checksum on the block " + "when read from or written to disk in order to detect storage-related " + "corruption.") + }, + &enable_block_checksums, + false, NULL, NULL + }, + { {"log_duration", PGC_SUSET, LOGGING_WHAT, gettext_noop("Logs the duration of each completed SQL statement."), NULL Index: src/backend/utils/misc/postgresql.conf.sample =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/postgresql.conf.sample,v retrieving revision 1.246 diff -c -p -r1.246 postgresql.conf.sample *** src/backend/utils/misc/postgresql.conf.sample 30 Sep 2008 10:52:13 -0000 1.246 --- src/backend/utils/misc/postgresql.conf.sample 3 Nov 2008 22:37:02 -0000 *************** *** 480,485 **** --- 480,490 ---- #transform_null_equals = off + #------------------------------------------------------------------------------ + # CORRUPTION DETECTION + 
#------------------------------------------------------------------------------ + + #perform_checksum = off # Perform block checksumming to/from disk #------------------------------------------------------------------------------ # CUSTOMIZED OPTIONS Index: src/backend/utils/time/tqual.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/time/tqual.c,v retrieving revision 1.110 diff -c -p -r1.110 tqual.c *** src/backend/utils/time/tqual.c 26 Mar 2008 16:20:47 -0000 1.110 --- src/backend/utils/time/tqual.c 3 Nov 2008 22:37:02 -0000 *************** *** 44,49 **** --- 44,50 ---- #include "access/xact.h" #include "storage/bufmgr.h" #include "storage/procarray.h" + #include "storage/smgr.h" #include "utils/tqual.h" *************** SetHintBits(HeapTupleHeader tuple, Buffe *** 96,101 **** --- 97,104 ---- } tuple->t_infomask |= infomask; + if (enable_block_checksums) + PageSetUnloggedChange(BufferGetPage(buffer)); SetBufferCommitInfoNeedsSave(buffer); } Index: src/include/pg_config_manual.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/pg_config_manual.h,v retrieving revision 1.35 diff -c -p -r1.35 pg_config_manual.h *** src/include/pg_config_manual.h 12 Jul 2008 02:28:43 -0000 1.35 --- src/include/pg_config_manual.h 3 Nov 2008 22:37:02 -0000 *************** *** 195,201 **** * Enable debugging print statements for WAL-related operations; see * also the wal_debug GUC var. */ ! /* #define WAL_DEBUG */ /* * Enable tracing of resource consumption during sort operations; --- 195,201 ---- * Enable debugging print statements for WAL-related operations; see * also the wal_debug GUC var. */ ! 
#define WAL_DEBUG 1 /* * Enable tracing of resource consumption during sort operations; Index: src/include/access/heapam.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/heapam.h,v retrieving revision 1.139 diff -c -p -r1.139 heapam.h *** src/include/access/heapam.h 8 Oct 2008 01:14:44 -0000 1.139 --- src/include/access/heapam.h 3 Nov 2008 22:37:02 -0000 *************** extern XLogRecPtr log_heap_freeze(Relati *** 131,136 **** --- 131,138 ---- OffsetNumber *offsets, int offcnt); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blk, Page page); + extern XLogRecPtr log_hintbits(RelFileNode *rnode, ForkNumber forkNum, + BlockNumber blk, Page page); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer, Index: src/include/access/htup.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/htup.h,v retrieving revision 1.103 diff -c -p -r1.103 htup.h *** src/include/access/htup.h 2 Nov 2008 01:45:28 -0000 1.103 --- src/include/access/htup.h 3 Nov 2008 22:37:02 -0000 *************** typedef HeapTupleData *HeapTuple; *** 580,585 **** --- 580,586 ---- #define XLOG_HEAP2_FREEZE 0x00 #define XLOG_HEAP2_CLEAN 0x10 #define XLOG_HEAP2_CLEAN_MOVE 0x20 + #define XLOG_HEAP2_HINTBITS 0x30 /* * All what we need to find changed tuple *************** typedef struct xl_heap_freeze *** 714,719 **** --- 715,730 ---- #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId)) + /* This is what we need to know about hint bits */ + typedef struct xl_heap_hintbits + { + RelFileNode node; + BlockNumber block; + /* HINT BIT ARRAY FOLLOWS AT THE END */ + } xl_heap_hintbits; + + #define SizeOfHeapHintbits (offsetof(xl_heap_hintbits, block) + sizeof(BlockNumber)) + /* HeapTupleHeader functions implemented in utils/time/combocid.c */ extern CommandId 
HeapTupleHeaderGetCmin(HeapTupleHeader tup); extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup); Index: src/include/storage/bufpage.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/bufpage.h,v retrieving revision 1.84 diff -c -p -r1.84 bufpage.h *** src/include/storage/bufpage.h 3 Nov 2008 20:47:49 -0000 1.84 --- src/include/storage/bufpage.h 3 Nov 2008 22:37:02 -0000 *************** *** 17,22 **** --- 17,23 ---- #include "access/xlogdefs.h" #include "storage/item.h" #include "storage/off.h" + #include "utils/pg_crc.h" /* * A postgres disk page is an abstraction layered on top of a postgres *************** typedef uint16 LocationIndex; *** 87,92 **** --- 88,94 ---- * * space management information generic to any page * + * pd_checksum - the checksum of the page * pd_lsn - identifies xlog record for last change to this page. * pd_tli - ditto. * pd_flags - flag bits. *************** typedef uint16 LocationIndex; *** 118,136 **** * the constraint on pagesize mod 256 is not an important restriction. * On the high end, we can only support pages up to 32KB because lp_off/lp_len * are 15 bits. */ typedef struct PageHeaderData { ! /* XXX LSN is member of *any* block, not only page-organized ones */ XLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog * record for last change to this page */ - uint16 pd_tli; /* least significant bits of the TimeLineID - * containing the LSN */ - uint16 pd_flags; /* flag bits, see below */ LocationIndex pd_lower; /* offset to start of free space */ LocationIndex pd_upper; /* offset to end of free space */ LocationIndex pd_special; /* offset to start of special space */ uint16 pd_pagesize_version; TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ ItemIdData pd_linp[1]; /* beginning of line pointer array */ } PageHeaderData; --- 120,143 ---- * the constraint on pagesize mod 256 is not an important restriction. 
* On the high end, we can only support pages up to 32KB because lp_off/lp_len * are 15 bits. + * + * Note that pd_tli appears in a rather awkward position in the struct; + * this is because we moved it to accommodate pd_checksum without changing + * pd_pagesize_version's offset. */ typedef struct PageHeaderData { ! /* XXX CRC & LSN are members of *any* block, not only page-organized ones */ ! pg_crc32 pd_checksum; /* The block-level checksum */ XLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog * record for last change to this page */ LocationIndex pd_lower; /* offset to start of free space */ LocationIndex pd_upper; /* offset to end of free space */ LocationIndex pd_special; /* offset to start of special space */ uint16 pd_pagesize_version; + uint16 pd_tli; /* least significant bits of the TimeLineID + * containing the LSN */ + uint16 pd_flags; /* flag bits, see below */ TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ ItemIdData pd_linp[1]; /* beginning of line pointer array */ } PageHeaderData; *************** typedef PageHeaderData *PageHeader; *** 148,159 **** * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the * page for its new tuple version; this suggests that a prune is needed. * Again, this is just a hint. */ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ ! #define PD_VALID_FLAG_BITS 0x0003 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. --- 155,172 ---- * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the * page for its new tuple version; this suggests that a prune is needed. * Again, this is just a hint. + * + * PD_UNLOGGED_CHANGE indicates whether a process has set hint bits on the + * page. This is used to determine whether a WAL message needs to be emitted + * before writing the page to disk when page checksums are enabled.
*/ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ #define PD_PAGE_FULL 0x0002 /* not enough free space for new * tuple? */ + #define PD_UNLOGGED_CHANGE 0x0004 /* does the page have unlogged hint + bits? */ ! #define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. *************** typedef PageHeaderData *PageHeader; *** 163,170 **** * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and * added the pd_flags field (by stealing some bits from pd_tli), * as well as adding the pd_prune_xid field (which enlarges the header). */ ! #define PG_PAGE_LAYOUT_VERSION 4 /* ---------------------------------------------------------------- --- 176,186 ---- * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and * added the pd_flags field (by stealing some bits from pd_tli), * as well as adding the pd_prune_xid field (which enlarges the header). + * Release 8.4 uses 5; it added a checksum to the page header, and moved + * pd_tli and pd_flags so that the page version would keep the same + * offset. */ ! 
#define PG_PAGE_LAYOUT_VERSION 5 /* ---------------------------------------------------------------- *************** do { \ *** 352,357 **** --- 368,410 ---- #define PageClearPrunable(page) \ (((PageHeader) (page))->pd_prune_xid = InvalidTransactionId) + /* ---------------------------------------------------------------- + * CRC support + * ---------------------------------------------------------------- + */ + #define PAGE_CHECKSUM_TYPE pg_crc32 + #define SIZEOF_PAGE_CHECKSUM sizeof(PAGE_CHECKSUM_TYPE) + #define PAGE_INVALID_CHECKSUM 0xb79a6e9c + + #define CalcPageChecksum(buffer, sum) \ + do { \ + INIT_CRC32(sum); \ + COMP_CRC32(sum, &buffer[sizeof(pg_crc32)], \ + offsetof(PageHeaderData, pd_flags) - sizeof(pg_crc32)); \ + COMP_CRC32(sum, &buffer[offsetof(PageHeaderData, pd_flags) + sizeof(uint16)], \ + BLCKSZ - (offsetof(PageHeaderData, pd_flags) + sizeof(uint16))); \ + FIN_CRC32(sum); \ + } while (0) + + /* beware multiple evaluation of argument */ + #define WritePageChecksum(buffer) \ + do { \ + PAGE_CHECKSUM_TYPE chksum; \ + CalcPageChecksum(buffer, chksum); \ + PageSetChecksum(buffer, chksum); \ + } while (0) + + #define PageGetChecksum(page) \ + (((PageHeader) (page))->pd_checksum) + #define PageSetChecksum(page, checksum) \ + (((PageHeader) (page))->pd_checksum = (checksum)) + + #define PageHasUnloggedChange(page) \ + (((PageHeader) (page))->pd_flags & PD_UNLOGGED_CHANGE) + #define PageSetUnloggedChange(page) \ + (((PageHeader) (page))->pd_flags |= PD_UNLOGGED_CHANGE) + #define PageClearUnloggedChange(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_UNLOGGED_CHANGE) /* ---------------------------------------------------------------- * extern declarations Index: src/include/storage/smgr.h =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/smgr.h,v retrieving revision 1.63 diff -c -p -r1.63 smgr.h *** src/include/storage/smgr.h 11 Aug 2008 11:05:11 -0000 1.63 --- 
src/include/storage/smgr.h 3 Nov 2008 22:37:02 -0000 *************** *** 20,25 **** --- 20,28 ---- #include "storage/relfilenode.h" + /* Perform block checksumming for corruption detection */ + bool enable_block_checksums; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present)
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers