Updated patch attached, thanks.
Amit: what's your conclusion from the review?
-- Abhijit
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 5f9fc49..dc90c02 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -501,33 +501,9 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
* here during crash recovery.
*/
if (HotStandbyActiveInReplay())
- {
- BlockNumber blkno;
-
- for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
- {
- /*
- * We use RBM_NORMAL_NO_LOG mode because it's not an error
- * condition to see all-zero pages. The original btvacuumpage
- * scan would have skipped over all-zero pages, noting them in FSM
- * but not bothering to initialize them just yet; so we mustn't
- * throw an error here. (We could skip acquiring the cleanup lock
- * if PageIsNew, but it's probably not worth the cycles to test.)
- *
- * XXX we don't actually need to read the block, we just need to
- * confirm it is unpinned. If we had a special call into the
- * buffer manager we could optimise this so that if the block is
- * not in shared_buffers we confirm it as unpinned.
- */
- buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
- RBM_NORMAL_NO_LOG);
- if (BufferIsValid(buffer))
- {
- LockBufferForCleanup(buffer);
- UnlockReleaseBuffer(buffer);
- }
- }
- }
+ XLogLockBlockRangeForCleanup(xlrec->node, MAIN_FORKNUM,
+ xlrec->lastBlockVacuumed + 1,
+ xlrec->block);
/*
* If we have a full-page image, restore it (using a cleanup lock) and
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index b7829ff..d84f83a 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,10 +287,6 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
*
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
* relation is extended with all-zeroes pages up to the given block number.
- *
- * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
- * exist, and we don't check for all-zeroes. Thus, no log entry is made
- * to imply that the page should be dropped or truncated later.
*/
Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -331,8 +327,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
log_invalid_page(rnode, forknum, blkno, false);
return InvalidBuffer;
}
- if (mode == RBM_NORMAL_NO_LOG)
- return InvalidBuffer;
/* OK to extend the file */
/* we do this in recovery only - no rel-extension lock needed */
Assert(InRecovery);
@@ -375,6 +369,62 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
return buffer;
}
+/*
+ * XLogBlockRangeForCleanup is used in Hot Standby mode to emulate the
+ * locking effects imposed by VACUUM for nbtrees.
+ */
+void
+XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber startBlkno,
+ BlockNumber uptoBlkno)
+{
+ BlockNumber blkno;
+ BlockNumber lastblock;
+ BlockNumber endingBlkno;
+ Buffer buffer;
+ SMgrRelation smgr;
+
+ Assert(startBlkno != P_NEW);
+ Assert(uptoBlkno != P_NEW);
+
+ /* Open the relation at smgr level */
+ smgr = smgropen(rnode, InvalidBackendId);
+
+ /*
+ * Create the target file if it doesn't already exist. This lets us cope
+ * if the replay sequence contains writes to a relation that is later
+ * deleted. (The original coding of this routine would instead suppress
+ * the writes, but that seems like it risks losing valuable data if the
+ * filesystem loses an inode during a crash. Better to write the data
+ * until we are actually told to delete the file.)
+ */
+ smgrcreate(smgr, forkNum, true);
+
+ lastblock = smgrnblocks(smgr, forkNum);
+
+ endingBlkno = uptoBlkno;
+ if (lastblock < endingBlkno)
+ endingBlkno = lastblock;
+
+ for (blkno = startBlkno; blkno < endingBlkno; blkno++)
+ {
+ /*
+ * All we need to do here is prove that we can lock each block
+ * with a cleanup lock. It's not an error to see all-zero pages
+ * here because the original btvacuumpage would not have thrown
+ * an error either.
+ *
+ * We don't actually need to read the block, we just need to
+ * confirm it is unpinned.
+ */
+ buffer = GetBufferWithoutRelcache(rnode, forkNum, blkno);
+ if (BufferIsValid(buffer))
+ {
+ LockBufferForCleanup(buffer);
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
/*
* Struct actually returned by XLogFakeRelcacheEntry, though the declared
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 07ea665..ce5ff70 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -222,8 +222,6 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
* current physical EOF; that is likely to cause problems in md.c when
* the page is modified and written out. P_NEW is OK, though.
*
- * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
- *
* If strategy is not NULL, a nondefault buffer access strategy is used.
* See buffer/README for details.
*/
@@ -284,6 +282,62 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
mode, strategy, &hit);
}
+/*
+ * GetBufferWithoutRelcache returns Buffer iff available in shared_buffers,
+ * otherwise returns InvalidBuffer. Buffer is pinned, if available.
+ *
+ * Special purpose routine only executed during recovery, so uses a cut-down
+ * execution path rather than complicate ReadBuffer/AllocBuffer.
+ */
+Buffer
+GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber blockNum)
+{
+ BufferTag bufTag; /* identity of requested block */
+ uint32 bufHash; /* hash value for newTag */
+ LWLock *bufPartitionLock; /* buffer partition lock for it */
+ int buf_id;
+ volatile BufferDesc *buf;
+ SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+ bool valid = false;
+
+ Assert(InRecovery);
+
+ /* Make sure we will have room to remember the buffer pin */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(bufTag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+ /* determine its hash code and partition lock ID */
+ bufHash = BufTableHashCode(&bufTag);
+ bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+ /* see if the block is in the buffer pool already */
+ LWLockAcquire(bufPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&bufTag, bufHash);
+ if (buf_id >= 0)
+ {
+ /*
+ * Found it. Now, try to pin the buffer if it's valid, but leave
+ * its usage count alone.
+ */
+ buf = &BufferDescriptors[buf_id];
+
+ LockBufHdr(buf);
+ valid = (buf->flags & BM_VALID) != 0;
+ if (valid)
+ PinBuffer_Locked(buf);
+ else
+ UnlockBufHdr(buf);
+ }
+ LWLockRelease(bufPartitionLock);
+
+ if (valid)
+ return BufferDescriptorGetBuffer(&BufferDescriptors[buf_id]);
+
+ return InvalidBuffer;
+}
/*
* ReadBuffer_common -- common logic for all ReadBuffer variants
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 58f11d9..1ecbeb3 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -25,7 +25,9 @@ extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init);
extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
BlockNumber blkno, ReadBufferMode mode);
-
+extern void XLogLockBlockRangeForCleanup(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber startBlkno,
+ BlockNumber uptoBlkno);
extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
extern void FreeFakeRelcacheEntry(Relation fakerel);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 89447d0..b827be0 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -38,9 +38,7 @@ typedef enum
RBM_NORMAL, /* Normal read */
RBM_ZERO, /* Don't read from disk, caller will
* initialize */
- RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
- RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
- * replay; otherwise same as RBM_NORMAL */
+ RBM_ZERO_ON_ERROR /* Read, but return an all-zeros page on error */
} ReadBufferMode;
/* in globals.c ... this duplicates miscadmin.h */
@@ -170,6 +168,8 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
ForkNumber forkNum, BlockNumber blockNum,
ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer GetBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber blockNum);
extern void ReleaseBuffer(Buffer buffer);
extern void UnlockReleaseBuffer(Buffer buffer);
extern void MarkBufferDirty(Buffer buffer);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers