Alvaro Herrera wrote:
> Alvaro Herrera wrote:
> > Hmm, oh I see another problem here -- the bit is not restored when
> > replayed heap_update's WAL record.  I'm now wondering what other bits
> > are set without much care about correctly restoring them during replay.
> I'm now wondering whether it'd be easier to just ignore pd_flags in
> calculating the checksum.

Okay, so this is what I've done.  pd_flags is skipped.  Also the WAL
routine logs both HeapTupleHeader infomasks and ItemId->lp_flags.  On
the latter point I'm not 100% sure of the cases where lp_flags must be
logged; right now I'm only logging if the item is marked as "having
storage" (the logic being that if an item does not have storage, then
making it have storage requires a WAL entry, and vice versa).

(This version has some debugging log entries which are obviously only
WIP material.)

Alvaro Herrera                      
PostgreSQL Replication, Consulting, Custom Development, 24x7 support
Index: src/backend/access/gist/gistget.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/gist/gistget.c,v
retrieving revision 1.79
diff -c -p -r1.79 gistget.c
*** src/backend/access/gist/gistget.c	22 Oct 2008 12:53:56 -0000	1.79
--- src/backend/access/gist/gistget.c	5 Nov 2008 21:17:40 -0000
*************** killtuple(Relation r, GISTScanOpaque so,
*** 43,48 ****
--- 43,49 ----
  		/* page unchanged, so all is simple */
  		offset = ItemPointerGetOffsetNumber(iptr);
  		ItemIdMarkDead(PageGetItemId(p, offset));
+ 		PageSetUnloggedChange(p);
*************** killtuple(Relation r, GISTScanOpaque so,
*** 57,62 ****
--- 58,64 ----
  				/* found */
  				ItemIdMarkDead(PageGetItemId(p, offset));
+ 				PageSetUnloggedChange(p);
Index: src/backend/access/hash/hash.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/hash/hash.c,v
retrieving revision 1.106
diff -c -p -r1.106 hash.c
*** src/backend/access/hash/hash.c	17 Oct 2008 23:50:57 -0000	1.106
--- src/backend/access/hash/hash.c	5 Nov 2008 21:17:47 -0000
*************** hashgettuple(PG_FUNCTION_ARGS)
*** 239,244 ****
--- 239,245 ----
  			offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
  			page = BufferGetPage(so->hashso_curbuf);
  			ItemIdMarkDead(PageGetItemId(page, offnum));
+ 			PageSetUnloggedChange(page);
  			 * Since this can be redone later if needed, it's treated the same
Index: src/backend/access/heap/heapam.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/heap/heapam.c,v
retrieving revision 1.268
diff -c -p -r1.268 heapam.c
*** src/backend/access/heap/heapam.c	31 Oct 2008 19:40:26 -0000	1.268
--- src/backend/access/heap/heapam.c	7 Nov 2008 19:01:57 -0000
*************** log_newpage(RelFileNode *rnode, ForkNumb
*** 4008,4013 ****
--- 4008,4102 ----
+  * Perform XLogInsert for hint bits changes in a page.  This handles hint
+  * bits set in HeapTupleHeaderData (t_infomask and t_infomask2) as well as
+  * those set in ItemIdData->lp_flags.
+  *
+  * This is intended to be called right before writing a page from shared
+  * buffers to disk.
+  *
+  * The approach used here, instead of WAL-logging every change, is to produce
+  * a complete record of the current state of hint bits in a page just before
+  * flushing it.  There are two downsides to this approach: first, it stores
+  * all hint bits in the page, not only those that changed; and second, that
+  * the flusher of the page needs to flush a lot more of the WAL (namely up
+  * to this new record's LSN) than the original LSN marked on the page.
+  */
+ XLogRecPtr
+ log_hintbits(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
+ 			 Page page)
+ {
+ 	xl_heap_hintbits xlrec;
+ 	OffsetNumber	i;
+ 	XLogRecPtr		recptr;
+ 	XLogRecData		rdata[2];
+ 	char		   *bits;
+ 	int				pos = 0;
+ 	StringInfoData	buf;		/* WIP debugging output only */
+ 	/*
+ 	 * Per logged item: 1 byte for line pointer bits, 2 bytes for infomask,
+ 	 * 2 bytes for infomask2.  Note that forkNum is currently not recorded
+ 	 * in the WAL record.
+ 	 *
+ 	 * NOTE(review): the buffer is sized for a heap page; this routine
+ 	 * appears to be reachable for any page carrying PD_UNLOGGED_CHANGE --
+ 	 * confirm that only heap pages get here.
+ 	 */
+ 	bits = palloc(MaxHeapTuplesPerPage * 5);
+ 	initStringInfo(&buf);
+ 	appendStringInfo(&buf, "page %u: ", blkno);
+ 	for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page);
+ 		 i = OffsetNumberNext(i))
+ 	{
+ 		HeapTupleHeader	htup;
+ 		ItemId		lp = PageGetItemId(page, i);
+ 		uint16		mask;
+ 		/* only items with storage are logged; replay skips the rest too */
+ 		if (!ItemIdHasStorage(lp))
+ 			continue;
+ 		appendStringInfo(&buf, "offset %d: flags %02x ", i, lp->lp_flags);
+ 		bits[pos++] = lp->lp_flags;
+ 		htup = (HeapTupleHeader) PageGetItem(page, lp);
+ 		/*
+ 		 * The 1-byte flag above leaves "pos" at an odd offset, so the
+ 		 * 16-bit masks must be copied bytewise rather than stored through
+ 		 * a uint16 pointer (which would be a misaligned access).
+ 		 */
+ 		mask = htup->t_infomask & HEAP_XACT_MASK;
+ 		memcpy(bits + pos, &mask, sizeof(uint16));
+ 		appendStringInfo(&buf, "infomask %04x/%04x ", htup->t_infomask,
+ 			 htup->t_infomask & HEAP_XACT_MASK);
+ 		pos += 2;
+ 		mask = htup->t_infomask2 & HEAP2_XACT_MASK;
+ 		memcpy(bits + pos, &mask, sizeof(uint16));
+ 		appendStringInfo(&buf, "infomask2 %04x/%04x\n", htup->t_infomask2,
+ 			 htup->t_infomask2 & HEAP2_XACT_MASK);
+ 		pos += 2;
+ 	}
+ 	elog(LOG, "%s", buf.data);
+ 	pfree(buf.data);
+ 	/* NO ELOG(ERROR) from here till hint bits are logged */
+ 	xlrec.node = *rnode;
+ 	xlrec.block = blkno;
+ 	/* first chunk: fixed-size header identifying the page */
+ 	rdata[0].data = (char *) &xlrec;
+ 	rdata[0].len = SizeOfHeapHintbits;
+ 	rdata[0].buffer = InvalidBuffer;
+ 	rdata[0].next = &(rdata[1]);
+ 	/* second chunk: the packed per-item hint bits collected above */
+ 	rdata[1].data = (char *) bits;
+ 	rdata[1].len = pos;
+ 	rdata[1].buffer = InvalidBuffer;
+ 	rdata[1].next = NULL;
+ 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_HINTBITS, rdata);
+ 	PageSetLSN(page, recptr);
+ 	PageSetTLI(page, ThisTimeLineID);
+ 	/* the record has been inserted; the scratch buffer is no longer needed */
+ 	pfree(bits);
+ 	return recptr;
+ }
+ /*
   * Handles CLEAN and CLEAN_MOVE record types
  static void
*************** heap_xlog_freeze(XLogRecPtr lsn, XLogRec
*** 4125,4130 ****
--- 4214,4302 ----
  static void
+ heap_xlog_hintbits(XLogRecPtr lsn, XLogRecord *record)
+ {
+ 	/*
+ 	 * Replay a hint-bits record: the record body is a sequence of
+ 	 * (1-byte lp_flags, 2-byte infomask bits, 2-byte infomask2 bits)
+ 	 * entries, one per item with storage, in offset order.
+ 	 */
+ 	xl_heap_hintbits *xlrec = (xl_heap_hintbits *) XLogRecGetData(record);
+ 	Buffer		buffer;
+ 	Page		page;
+ 	buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+ 	if (!BufferIsValid(buffer))
+ 		return;
+ 	page = (Page) BufferGetPage(buffer);
+ 	/* page LSN is at or past this record: nothing to redo */
+ 	if (XLByteLE(lsn, PageGetLSN(page)))
+ 	{
+ 		UnlockReleaseBuffer(buffer);
+ 		return;
+ 	}
+ 	if (record->xl_len > SizeOfHeapHintbits)
+ 	{
+ 		char *bits;
+ 		char *bits_end;
+ 		OffsetNumber offset = FirstOffsetNumber;
+ 		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+ 		StringInfoData	buf;		/* WIP debugging output only */
+ 		bits = (char *) xlrec + SizeOfHeapHintbits;
+ 		bits_end = (char *) xlrec + record->xl_len;
+ 		initStringInfo(&buf);
+ 		appendStringInfo(&buf, "page %u: ", xlrec->block);
+ 		while (bits < bits_end)
+ 		{
+ 			HeapTupleHeader	htup;
+ 			ItemId		lp;
+ 			uint16		mask;
+ 			/* each entry is one flag byte plus two 16-bit masks */
+ 			if (bits_end - bits < (int) (1 + 2 * sizeof(uint16)))
+ 				elog(PANIC, "heap_xlog_hintbits: truncated record");
+ 			/*
+ 			 * Advance to the next item with storage, mirroring the skip
+ 			 * done when the record was built.  Never walk past the line
+ 			 * pointer array: a record that does not match the page is
+ 			 * an unrecoverable inconsistency.
+ 			 */
+ 			while (offset <= maxoff &&
+ 				   !ItemIdHasStorage(PageGetItemId(page, offset)))
+ 				offset++;
+ 			if (offset > maxoff)
+ 				elog(PANIC, "heap_xlog_hintbits: invalid lp");
+ 			lp = PageGetItemId(page, offset);
+ 			/* set the page flags */
+ 			lp->lp_flags = *bits;
+ 			bits++;
+ 			appendStringInfo(&buf, "offset %d: flags %02x ", offset,
+ 							 lp->lp_flags);
+ 			htup = (HeapTupleHeader) PageGetItem(page, lp);
+ 			/*
+ 			 * set the right bits in infomask; the logged value sits at an
+ 			 * odd offset, so fetch it bytewise instead of dereferencing a
+ 			 * misaligned uint16 pointer
+ 			 */
+ 			memcpy(&mask, bits, sizeof(uint16));
+ 			htup->t_infomask = mask |
+ 				(htup->t_infomask & ~HEAP_XACT_MASK);
+ 			appendStringInfo(&buf, "infomask %04x/%04x ", htup->t_infomask,
+ 							 mask);
+ 			bits += 2;
+ 			/* set the right bits in infomask2 */
+ 			memcpy(&mask, bits, sizeof(uint16));
+ 			htup->t_infomask2 = mask |
+ 				(htup->t_infomask2 & ~HEAP2_XACT_MASK);
+ 			appendStringInfo(&buf, "infomask2 %04x/%04x\n", htup->t_infomask2,
+ 							 mask);
+ 			bits += 2;
+ 			offset++;
+ 		}
+ 		elog(LOG, "%s", buf.data);
+ 		pfree(buf.data);
+ 	}
+ 	PageSetLSN(page, lsn);
+ 	PageSetTLI(page, ThisTimeLineID);
+ 	MarkBufferDirty(buffer);
+ 	UnlockReleaseBuffer(buffer);
+ }
+ static void
  heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
  	xl_heap_newpage *xlrec = (xl_heap_newpage *) XLogRecGetData(record);
*************** heap_xlog_update(XLogRecPtr lsn, XLogRec
*** 4389,4394 ****
--- 4561,4568 ----
  	if (samepage)
  		goto newsame;
+ 	if (!hot_update && !move)
+ 		PageSetFull(page);
  	PageSetLSN(page, lsn);
  	PageSetTLI(page, ThisTimeLineID);
*************** heap2_redo(XLogRecPtr lsn, XLogRecord *r
*** 4664,4669 ****
--- 4838,4846 ----
  			heap_xlog_clean(lsn, record, true);
+ 			heap_xlog_hintbits(lsn, record);
+ 			break;
  			elog(PANIC, "heap2_redo: unknown op code %u", info);
*************** heap2_desc(StringInfo buf, uint8 xl_info
*** 4805,4810 ****
--- 4982,4995 ----
  						 xlrec->node.spcNode, xlrec->node.dbNode,
  						 xlrec->node.relNode, xlrec->block);
+ 	else if (info == XLOG_HEAP2_HINTBITS)
+ 	{
+ 		xl_heap_hintbits *xlrec = (xl_heap_hintbits *) rec;
+ 		appendStringInfo(buf, "hintbits: rel %u/%u/%u; blk %u",
+ 						 xlrec->node.spcNode, xlrec->node.dbNode,
+ 						 xlrec->node.relNode, xlrec->block);
+ 	}
  		appendStringInfo(buf, "UNKNOWN");
Index: src/backend/access/nbtree/nbtinsert.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/nbtree/nbtinsert.c,v
retrieving revision 1.168
diff -c -p -r1.168 nbtinsert.c
*** src/backend/access/nbtree/nbtinsert.c	3 Nov 2008 20:47:48 -0000	1.168
--- src/backend/access/nbtree/nbtinsert.c	5 Nov 2008 21:18:28 -0000
*************** _bt_check_unique(Relation rel, IndexTupl
*** 308,313 ****
--- 308,314 ----
  					 * killed.
+ 					PageSetUnloggedChange(page);
  					opaque->btpo_flags |= BTP_HAS_GARBAGE;
  					/* be sure to mark the proper buffer dirty... */
  					if (nbuf != InvalidBuffer)
Index: src/backend/access/nbtree/nbtutils.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/access/nbtree/nbtutils.c,v
retrieving revision 1.91
diff -c -p -r1.91 nbtutils.c
*** src/backend/access/nbtree/nbtutils.c	19 Jun 2008 00:46:03 -0000	1.91
--- src/backend/access/nbtree/nbtutils.c	5 Nov 2008 21:20:11 -0000
*************** _bt_killitems(IndexScanDesc scan, bool h
*** 1153,1158 ****
--- 1153,1159 ----
  				/* found the item */
+ 				PageSetUnloggedChange(page);
  				killedsomething = true;
  				break;			/* out of inner search loop */
Index: src/backend/storage/buffer/bufmgr.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.240
diff -c -p -r1.240 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	31 Oct 2008 15:05:00 -0000	1.240
--- src/backend/storage/buffer/bufmgr.c	7 Nov 2008 19:03:51 -0000
*** 33,38 ****
--- 33,39 ----
  #include <sys/file.h>
  #include <unistd.h>
+ #include "access/heapam.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
  #include "pgstat.h"
*** 42,47 ****
--- 43,49 ----
  #include "storage/ipc.h"
  #include "storage/proc.h"
  #include "storage/smgr.h"
+ #include "utils/memutils.h"
  #include "utils/rel.h"
  #include "utils/resowner.h"
*************** BgBufferSync(void)
*** 1464,1470 ****
   *	BUF_REUSABLE: buffer is available for replacement, ie, it has
   *		pin count 0 and usage count 0.
!  * (BUF_WRITTEN could be set in error if FlushBuffers finds the buffer clean
   * after locking it, but we don't care all that much.)
   * Note: caller must have done ResourceOwnerEnlargeBuffers.
--- 1466,1472 ----
   *	BUF_REUSABLE: buffer is available for replacement, ie, it has
   *		pin count 0 and usage count 0.
!  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
   * after locking it, but we don't care all that much.)
   * Note: caller must have done ResourceOwnerEnlargeBuffers.
*************** FlushBuffer(volatile BufferDesc *buf, SM
*** 1774,1779 ****
--- 1776,1789 ----
  	XLogRecPtr	recptr;
  	ErrorContextCallback errcontext;
+ 	static char *dblbuf = NULL;
+ 	bool	done = false;
+ 	if (enable_block_checksums && dblbuf == NULL)
+ 	{
+ 		dblbuf = MemoryContextAlloc(TopMemoryContext, BLCKSZ + ALIGNOF_BUFFER);
+ 		dblbuf = (char *) BUFFERALIGN(dblbuf);
+ 	}
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(volatile BufferDesc *buf, SM
*** 1798,1803 ****
--- 1808,1837 ----
+ 	 * We make a copy of the buffer to write.
+ 	 */
+ 	if (enable_block_checksums)
+ 		memcpy(dblbuf, BufHdrGetBlock(buf), BLCKSZ);
+ 	/*
+ 	 * If the page has been modified by a hint bit setter, ensure we WAL-log
+ 	 * their changes before actually writing the page; otherwise the CRC we're
+ 	 * about to store could be invalid if the page is torn.  Note: we check
+ 	 * the flag on the shared-memory copy of the buffer, not the private copy
+ 	 * we just made, to forestall the possibility that hints bits could have
+ 	 * been set in the later parts of the page after we copied the flag in
+ 	 * unset state.
+ 	 */
+ 	if (enable_block_checksums && PageHasUnloggedChange(BufHdrGetBlock(buf)) &&
+ 		!InRecovery)
+ 	{
+ 		/* XXX cast away the "volatile" qualifier */
+ 		log_hintbits(&((BufferDesc *) buf)->tag.rnode, buf->tag.forkNum,
+ 					 buf->tag.blockNum, BufHdrGetBlock(buf));
+ 		done = true;
+ 	}
+ 	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
  	 * rule that log updates must hit disk before any of the data-file changes
  	 * they describe do.
*************** FlushBuffer(volatile BufferDesc *buf, SM
*** 1819,1825 ****
! 			  (char *) BufHdrGetBlock(buf),
--- 1853,1859 ----
! 			  enable_block_checksums ? dblbuf : BufHdrGetBlock(buf),
Index: src/backend/storage/page/bufpage.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/page/bufpage.c,v
retrieving revision 1.81
diff -c -p -r1.81 bufpage.c
*** src/backend/storage/page/bufpage.c	3 Nov 2008 20:47:48 -0000	1.81
--- src/backend/storage/page/bufpage.c	3 Nov 2008 22:37:02 -0000
*************** PageInit(Page page, Size pageSize, Size 
*** 41,46 ****
--- 41,47 ----
  	MemSet(p, 0, pageSize);
  	/* p->pd_flags = 0;								done by above MemSet */
+ 	p->pd_checksum = PAGE_INVALID_CHECKSUM;
  	p->pd_lower = SizeOfPageHeaderData;
  	p->pd_upper = pageSize - specialSize;
  	p->pd_special = pageSize - specialSize;
*************** PageHeaderIsValid(PageHeader page)
*** 84,92 ****
  		page->pd_special == MAXALIGN(page->pd_special))
  		return true;
! 	/* Check all-zeroes case */
  	pagebytes = (char *) page;
! 	for (i = 0; i < BLCKSZ; i++)
  		if (pagebytes[i] != 0)
  			return false;
--- 85,93 ----
  		page->pd_special == MAXALIGN(page->pd_special))
  		return true;
! 	/* Check all-zeroes case (skipping the checksum) */
  	pagebytes = (char *) page;
! 	for (i = sizeof(PAGE_CHECKSUM_TYPE); i < BLCKSZ; i++)
  		if (pagebytes[i] != 0)
  			return false;
Index: src/backend/storage/smgr/smgr.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/smgr/smgr.c,v
retrieving revision 1.112
diff -c -p -r1.112 smgr.c
*** src/backend/storage/smgr/smgr.c	30 Sep 2008 10:52:13 -0000	1.112
--- src/backend/storage/smgr/smgr.c	3 Nov 2008 22:37:02 -0000
*** 27,32 ****
--- 27,35 ----
  #include "utils/memutils.h"
+ /* Perform block checksumming for corruption detection */
+ bool enable_block_checksums = false;
   * This struct of function pointers defines the API between smgr.c and
   * any individual storage manager module.  Note that smgr subfunctions are
*************** void
*** 503,508 ****
--- 506,515 ----
  smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, 
  		   char *buffer, bool isTemp)
+ 	/* Perform block checksumming for corruption detection */
+ 	if (enable_block_checksums)
+ 		WritePageChecksum(buffer);
  	(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum,
  											   buffer, isTemp);
*************** smgrread(SMgrRelation reln, ForkNumber f
*** 520,525 ****
--- 527,551 ----
  		 char *buffer)
  	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
+ 	/* Perform block checksumming for corruption detection */
+ 	if (enable_block_checksums && PageGetChecksum(buffer) != PAGE_INVALID_CHECKSUM)
+ 	{
+ 		CalcPageChecksum(buffer, chksum);
+ 		if (chksum != PageGetChecksum(buffer))
+ 		{
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_DATA_CORRUPTED),
+ 					 errmsg("invalid checksum on read of block %u of relation %u/%u/%u",
+ 							blocknum,
+ 							reln->smgr_rnode.spcNode,
+ 							reln->smgr_rnode.dbNode,
+ 							reln->smgr_rnode.relNode)));
+ 		}
+ 	}
*************** void
*** 541,546 ****
--- 567,578 ----
  smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, 
  		  char *buffer, bool isTemp)
+ 	/*
+ 	 * Perform block checksumming before writing.
+ 	 */
+ 	if (enable_block_checksums)
+ 		WritePageChecksum(buffer);
  	(*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum,
  											  buffer, isTemp);
Index: src/backend/utils/misc/guc.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.475
diff -c -p -r1.475 guc.c
*** src/backend/utils/misc/guc.c	6 Oct 2008 13:05:36 -0000	1.475
--- src/backend/utils/misc/guc.c	3 Nov 2008 22:37:02 -0000
*** 57,62 ****
--- 57,63 ----
  #include "regex/regex.h"
  #include "storage/bufmgr.h"
  #include "storage/fd.h"
+ #include "storage/smgr.h"
  #include "tcop/tcopprot.h"
  #include "tsearch/ts_cache.h"
  #include "utils/builtins.h"
*************** static struct config_bool ConfigureNames
*** 762,767 ****
--- 763,778 ----
  		false, NULL, NULL
+ 		{"perform_checksum", PGC_SIGHUP, UNGROUPED,
+ 			gettext_noop("Forces checksumming of blocks to/from disk."),
+ 			gettext_noop("The server will perform a checksum on the block "
+ 				"when read from or written to disk in order to detect storage-related "
+ 				"corruption.")
+ 		},
+ 		&enable_block_checksums,
+ 		false, NULL, NULL
+ 	},
+ 	{
  		{"log_duration", PGC_SUSET, LOGGING_WHAT,
  			gettext_noop("Logs the duration of each completed SQL statement."),
Index: src/backend/utils/misc/postgresql.conf.sample
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.246
diff -c -p -r1.246 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample	30 Sep 2008 10:52:13 -0000	1.246
--- src/backend/utils/misc/postgresql.conf.sample	3 Nov 2008 22:37:02 -0000
*** 480,485 ****
--- 480,490 ----
  #transform_null_equals = off
+ #------------------------------------------------------------------------------
+ #------------------------------------------------------------------------------
+ #perform_checksum = off		# Perform block checksumming to/from disk
Index: src/backend/utils/time/tqual.c
RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/utils/time/tqual.c,v
retrieving revision 1.110
diff -c -p -r1.110 tqual.c
*** src/backend/utils/time/tqual.c	26 Mar 2008 16:20:47 -0000	1.110
--- src/backend/utils/time/tqual.c	3 Nov 2008 22:37:02 -0000
*** 44,49 ****
--- 44,50 ----
  #include "access/xact.h"
  #include "storage/bufmgr.h"
  #include "storage/procarray.h"
+ #include "storage/smgr.h"
  #include "utils/tqual.h"
*************** SetHintBits(HeapTupleHeader tuple, Buffe
*** 96,101 ****
--- 97,104 ----
  	tuple->t_infomask |= infomask;
+ 	if (enable_block_checksums)
+ 		PageSetUnloggedChange(BufferGetPage(buffer));
Index: src/include/pg_config_manual.h
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/pg_config_manual.h,v
retrieving revision 1.35
diff -c -p -r1.35 pg_config_manual.h
*** src/include/pg_config_manual.h	12 Jul 2008 02:28:43 -0000	1.35
--- src/include/pg_config_manual.h	3 Nov 2008 22:37:02 -0000
*** 195,201 ****
   * Enable debugging print statements for WAL-related operations; see
   * also the wal_debug GUC var.
! /* #define WAL_DEBUG */
   * Enable tracing of resource consumption during sort operations;
--- 195,201 ----
   * Enable debugging print statements for WAL-related operations; see
   * also the wal_debug GUC var.
! #define WAL_DEBUG 1
   * Enable tracing of resource consumption during sort operations;
Index: src/include/access/heapam.h
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/heapam.h,v
retrieving revision 1.139
diff -c -p -r1.139 heapam.h
*** src/include/access/heapam.h	8 Oct 2008 01:14:44 -0000	1.139
--- src/include/access/heapam.h	3 Nov 2008 22:37:02 -0000
*************** extern XLogRecPtr log_heap_freeze(Relati
*** 131,136 ****
--- 131,138 ----
  				OffsetNumber *offsets, int offcnt);
  extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
  							  BlockNumber blk, Page page);
+ extern XLogRecPtr log_hintbits(RelFileNode *rnode, ForkNumber forkNum,
+ 							   BlockNumber blk, Page page);
  /* in heap/pruneheap.c */
  extern void heap_page_prune_opt(Relation relation, Buffer buffer,
Index: src/include/access/htup.h
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/access/htup.h,v
retrieving revision 1.103
diff -c -p -r1.103 htup.h
*** src/include/access/htup.h	2 Nov 2008 01:45:28 -0000	1.103
--- src/include/access/htup.h	3 Nov 2008 22:37:02 -0000
*************** typedef HeapTupleData *HeapTuple;
*** 580,585 ****
--- 580,586 ----
  #define XLOG_HEAP2_FREEZE		0x00
  #define XLOG_HEAP2_CLEAN		0x10
  #define XLOG_HEAP2_CLEAN_MOVE	0x20
+ #define XLOG_HEAP2_HINTBITS		0x30
   * All what we need to find changed tuple
*************** typedef struct xl_heap_freeze
*** 714,719 ****
--- 715,730 ----
  #define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+ /*
+  * This is what we need to know about hint bits.
+  *
+  * Fixed-size header of an XLOG_HEAP2_HINTBITS record.  The packed per-item
+  * hint bit data follows it in the record, starting SizeOfHeapHintbits bytes
+  * after the start of this struct.
+  */
+ typedef struct xl_heap_hintbits
+ {
+ 	RelFileNode node;			/* relation the page belongs to */
+ 	BlockNumber block;			/* block number of the page */
+ } xl_heap_hintbits;
+ /* size of the header up to and including "block", without tail padding */
+ #define SizeOfHeapHintbits (offsetof(xl_heap_hintbits, block) + sizeof(BlockNumber))
  /* HeapTupleHeader functions implemented in utils/time/combocid.c */
  extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
  extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
Index: src/include/storage/bufpage.h
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/bufpage.h,v
retrieving revision 1.84
diff -c -p -r1.84 bufpage.h
*** src/include/storage/bufpage.h	3 Nov 2008 20:47:49 -0000	1.84
--- src/include/storage/bufpage.h	3 Nov 2008 22:37:02 -0000
*** 17,22 ****
--- 17,23 ----
  #include "access/xlogdefs.h"
  #include "storage/item.h"
  #include "storage/off.h"
+ #include "utils/pg_crc.h"
   * A postgres disk page is an abstraction layered on top of a postgres
*************** typedef uint16 LocationIndex;
*** 87,92 ****
--- 88,94 ----
   * space management information generic to any page
+  *		pd_checksum	- the checksum of the page
   *		pd_lsn		- identifies xlog record for last change to this page.
   *		pd_tli		- ditto.
   *		pd_flags	- flag bits.
*************** typedef uint16 LocationIndex;
*** 118,136 ****
   * the constraint on pagesize mod 256 is not an important restriction.
   * On the high end, we can only support pages up to 32KB because lp_off/lp_len
   * are 15 bits.
  typedef struct PageHeaderData
! 	/* XXX LSN is member of *any* block, not only page-organized ones */
  	XLogRecPtr	pd_lsn;			/* LSN: next byte after last byte of xlog
  								 * record for last change to this page */
- 	uint16		pd_tli;			/* least significant bits of the TimeLineID
- 								 * containing the LSN */
- 	uint16		pd_flags;		/* flag bits, see below */
  	LocationIndex pd_lower;		/* offset to start of free space */
  	LocationIndex pd_upper;		/* offset to end of free space */
  	LocationIndex pd_special;	/* offset to start of special space */
  	uint16		pd_pagesize_version;
  	TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
  	ItemIdData	pd_linp[1];		/* beginning of line pointer array */
  } PageHeaderData;
--- 120,143 ----
   * the constraint on pagesize mod 256 is not an important restriction.
   * On the high end, we can only support pages up to 32KB because lp_off/lp_len
   * are 15 bits.
+  *
+  * Note that pd_tli appears in a rather awkward position in the struct;
+  * this is because we moved it to accommodate pd_checksum without changing
+  * pd_pagesize_version's offset.
  typedef struct PageHeaderData
! 	/* XXX CRC & LSN are members of *any* block, not only page-organized ones */
! 	pg_crc32	pd_checksum;    /* The block-level checksum */
  	XLogRecPtr	pd_lsn;			/* LSN: next byte after last byte of xlog
  								 * record for last change to this page */
  	LocationIndex pd_lower;		/* offset to start of free space */
  	LocationIndex pd_upper;		/* offset to end of free space */
  	LocationIndex pd_special;	/* offset to start of special space */
  	uint16		pd_pagesize_version;
+ 	uint16		pd_tli;			/* least significant bits of the TimeLineID
+ 								 * containing the LSN */
+ 	uint16		pd_flags;		/* flag bits, see below */
  	TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
  	ItemIdData	pd_linp[1];		/* beginning of line pointer array */
  } PageHeaderData;
*************** typedef PageHeaderData *PageHeader;
*** 148,159 ****
   * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
   * page for its new tuple version; this suggests that a prune is needed.
   * Again, this is just a hint.
  #define PD_HAS_FREE_LINES	0x0001		/* are there any unused line pointers? */
  #define PD_PAGE_FULL		0x0002		/* not enough free space for new
  										 * tuple? */
! #define PD_VALID_FLAG_BITS	0x0003		/* OR of all valid pd_flags bits */
   * Page layout version number 0 is for pre-7.3 Postgres releases.
--- 155,172 ----
   * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
   * page for its new tuple version; this suggests that a prune is needed.
   * Again, this is just a hint.
+  *
+  * PD_UNLOGGED_CHANGE indicates whether a process has set hint bits on the
+  * page.  This is used to determine whether a WAL message needs to be emitted
+  * before writing the page to disk when page checksums are enabled.
  #define PD_HAS_FREE_LINES	0x0001		/* are there any unused line pointers? */
  #define PD_PAGE_FULL		0x0002		/* not enough free space for new
  										 * tuple? */
+ #define PD_UNLOGGED_CHANGE	0x0004		/* does the page have unlogged hint
+ 										   bits? */
! #define PD_VALID_FLAG_BITS	0x0007		/* OR of all valid pd_flags bits */
   * Page layout version number 0 is for pre-7.3 Postgres releases.
*************** typedef PageHeaderData *PageHeader;
*** 163,170 ****
   * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
   *		added the pd_flags field (by stealing some bits from pd_tli),
   *		as well as adding the pd_prune_xid field (which enlarges the header).
  /* ----------------------------------------------------------------
--- 176,186 ----
   * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
   *		added the pd_flags field (by stealing some bits from pd_tli),
   *		as well as adding the pd_prune_xid field (which enlarges the header).
+  * Release 8.4 uses 5; it added a checksum to the page header, and moved
+  *		pd_tli and pd_flags so that the page version would keep the same
+  *		offset.
  /* ----------------------------------------------------------------
*************** do { \
*** 352,357 ****
--- 368,410 ----
  #define PageClearPrunable(page) \
  	(((PageHeader) (page))->pd_prune_xid = InvalidTransactionId)
+ /* ----------------------------------------------------------------
+  *      CRC support
+  * ----------------------------------------------------------------
+  */
+ #define PAGE_CHECKSUM_TYPE		pg_crc32
+ /*
+  * Sentinel stored in pd_checksum by PageInit; a page read back with this
+  * value is not verified by smgrread.
+  */
+ #define PAGE_INVALID_CHECKSUM	0xb79a6e9c
+ /*
+  * Compute the page CRC into "sum" (an lvalue of PAGE_CHECKSUM_TYPE).
+  *
+  * Two ranges are covered: everything after the leading pd_checksum field
+  * up to pd_flags, and everything after pd_flags up to the end of the
+  * block.  pd_checksum itself and pd_flags are left out.
+  *
+  * "buffer" is evaluated exactly once; "sum" is evaluated several times.
+  */
+ #define CalcPageChecksum(buffer, sum)			\
+ 	do {										\
+ 		char   *cpc_page_ = (char *) (buffer);	\
+ 		INIT_CRC32(sum);						\
+ 		COMP_CRC32(sum, cpc_page_ + sizeof(pg_crc32),	\
+ 				   offsetof(PageHeaderData, pd_flags) - sizeof(pg_crc32)); \
+ 		COMP_CRC32(sum, cpc_page_ + offsetof(PageHeaderData, pd_flags) + sizeof(uint16),	\
+ 				   BLCKSZ - (offsetof(PageHeaderData, pd_flags) + sizeof(uint16)));	\
+ 		FIN_CRC32(sum);							\
+ 	} while (0)
+ /* Compute the page CRC and store it in pd_checksum; evaluates "buffer" once */
+ #define WritePageChecksum(buffer)				\
+ 	do {										\
+ 		char   *wpc_page_ = (char *) (buffer);	\
+ 		PAGE_CHECKSUM_TYPE    wpc_sum_;			\
+ 		CalcPageChecksum(wpc_page_, wpc_sum_);	\
+ 		PageSetChecksum(wpc_page_, wpc_sum_);	\
+ 	} while (0)
+ /* Accessors for the stored checksum */
+ #define PageGetChecksum(page) \
+ 	(((PageHeader) (page))->pd_checksum)
+ #define PageSetChecksum(page, checksum) \
+ 	(((PageHeader) (page))->pd_checksum = (checksum))
+ /* Test, set and clear the PD_UNLOGGED_CHANGE bit in pd_flags */
+ #define PageHasUnloggedChange(page) \
+ 	(((PageHeader) (page))->pd_flags & PD_UNLOGGED_CHANGE)
+ #define PageSetUnloggedChange(page) \
+ 	(((PageHeader) (page))->pd_flags |= PD_UNLOGGED_CHANGE)
+ #define PageClearUnloggedChange(page) \
+ 	(((PageHeader) (page))->pd_flags &= ~PD_UNLOGGED_CHANGE)
  /* ----------------------------------------------------------------
   *		extern declarations
Index: src/include/storage/smgr.h
RCS file: /home/alvherre/Code/cvs/pgsql/src/include/storage/smgr.h,v
retrieving revision 1.63
diff -c -p -r1.63 smgr.h
*** src/include/storage/smgr.h	11 Aug 2008 11:05:11 -0000	1.63
--- src/include/storage/smgr.h	3 Nov 2008 22:37:02 -0000
*** 20,25 ****
--- 20,28 ----
  #include "storage/relfilenode.h"
+ /*
+  * Perform block checksumming for corruption detection.
+  * Declaration only: the variable itself is defined in smgr.c.
+  */
+ extern bool enable_block_checksums;
   * smgr.c maintains a table of SMgrRelation objects, which are essentially
   * cached file handles.  An SMgrRelation is created (if not already present)
Sent via pgsql-hackers mailing list (
To make changes to your subscription:

Reply via email to