On 11.02.2013 08:44, Jeevan Chalke wrote:
> Hi,
> Any review comments on this ?
Sorry for the delay.
I did some minor cleanup on this. I added code to pg_resetxlog and
pg_controldata to reset / display the current unlogged LSN value. I
moved the static counter for temporary relations back to gistutil.c,
so that the function in xlog.c deals only with unlogged relations. It's
debatable whether that's better, but IMHO it is. Also, the unloggedLSN
counter is now reset to 1 at crash recovery. There's no fundamental
reason it needs to be reset, rather than just continuing from the value
saved at the last shutdown as if nothing had happened, but it seems
cleaner that way.
I'm happy with this now, but please take one more look before I commit this.
- Heikki
diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml
index 8872920..af11eb0 100644
--- a/doc/src/sgml/ref/create_table.sgml
+++ b/doc/src/sgml/ref/create_table.sgml
@@ -182,8 +182,7 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI
automatically truncated after a crash or unclean shutdown. The contents
of an unlogged table are also not replicated to standby servers.
Any indexes created on an unlogged table are automatically unlogged as
- well; however, unlogged <link linkend="GiST">GiST indexes</link> are
- currently not supported and cannot be created on an unlogged table.
+ well.
</para>
</listitem>
</varlistentry>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index e2d3390..eba95f1 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -16,6 +16,7 @@
#include "access/genam.h"
#include "access/gist_private.h"
+#include "access/heapam_xlog.h"
#include "catalog/index.h"
#include "catalog/pg_collation.h"
#include "miscadmin.h"
@@ -71,9 +72,22 @@ createTempGistContext(void)
Datum
gistbuildempty(PG_FUNCTION_ARGS)
{
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("unlogged GiST indexes are not supported")));
+ Relation index = (Relation) PG_GETARG_POINTER(0);
+ Buffer buffer;
+
+ /* Initialize the root page */
+ buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* Initialize and xlog buffer */
+ START_CRIT_SECTION();
+ GISTInitBuffer(buffer, F_LEAF);
+ MarkBufferDirty(buffer);
+ log_newpage_buffer(buffer);
+ END_CRIT_SECTION();
+
+ /* Unlock and release the buffer */
+ UnlockReleaseBuffer(buffer);
PG_RETURN_VOID();
}
@@ -391,7 +405,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
dist, oldrlink, oldnsn, leftchildbuf,
markfollowright);
else
- recptr = GetXLogRecPtrForTemp();
+ recptr = gistGetFakeLSN(rel);
for (ptr = dist; ptr; ptr = ptr->next)
{
@@ -448,7 +462,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
}
else
{
- recptr = GetXLogRecPtrForTemp();
+ recptr = gistGetFakeLSN(rel);
PageSetLSN(page, recptr);
}
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index aec5b52..0cf22cd 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -158,16 +158,6 @@ gistbuild(PG_FUNCTION_ARGS)
elog(ERROR, "index \"%s\" already contains data",
RelationGetRelationName(index));
- /*
- * We can't yet handle unlogged GiST indexes, because we depend on LSNs.
- * This is duplicative of an error in gistbuildempty, but we want to check
- * here so as to throw error before doing all the index-build work.
- */
- if (heap->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("unlogged GiST indexes are not supported")));
-
/* no locking is needed */
buildstate.giststate = initGISTstate(index);
@@ -204,7 +194,7 @@ gistbuild(PG_FUNCTION_ARGS)
PageSetTLI(page, ThisTimeLineID);
}
else
- PageSetLSN(page, GetXLogRecPtrForTemp());
+ PageSetLSN(page, gistGetFakeLSN(heap));
UnlockReleaseBuffer(buffer);
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index e5c3d69..c5b2c87 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -798,16 +798,30 @@ gistoptions(PG_FUNCTION_ARGS)
}
/*
- * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect
- * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake
- * sequence of LSNs for that purpose. Each call generates an LSN that is
- * greater than any previous value returned by this function in the same
- * session.
+ * Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs
+ * to detect concurrent page splits anyway. This function provides a fake
+ * sequence of LSNs for that purpose.
*/
XLogRecPtr
-GetXLogRecPtrForTemp(void)
+gistGetFakeLSN(Relation rel)
{
static XLogRecPtr counter = 1;
- counter++;
- return counter;
+
+ if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
+ {
+ /*
+ * Temporary relations are only accessible in our session, so a
+ * simple counter in our backend will do.
+ */
+ return counter++;
+ }
+ else
+ {
+ /*
+ * Unlogged relations are accessible from other backends, and survive
+ * (clean) restarts. GetFakeLSNForUnloggedRel() handles that for us.
+ */
+ Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED);
+ return GetFakeLSNForUnloggedRel();
+ }
}
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index b5be676..1d9f832 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -238,7 +238,7 @@ gistbulkdelete(PG_FUNCTION_ARGS)
PageSetTLI(page, ThisTimeLineID);
}
else
- PageSetLSN(page, GetXLogRecPtrForTemp());
+ PageSetLSN(page, gistGetFakeLSN(rel));
END_CRIT_SECTION();
}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f0df297..6ef776d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -391,6 +391,10 @@ typedef struct XLogCtlData
XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
+ /* Protected by ulsn_lck: */
+ XLogRecPtr unloggedLSN; /* Fake LSN counter (for unlogged relations) */
+ slock_t ulsn_lck;
+
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
@@ -3688,6 +3692,32 @@ GetSystemIdentifier(void)
}
/*
+ * Returns a fake LSN for unlogged relations.
+ *
+ * Each call generates an LSN that is greater than any previous value
+ * returned. The current counter value is saved and restored across clean
+ * shutdowns, but like unlogged relations, does not survive a crash. The
+ * returned value can be used in lieu of real LSN values returned by
+ * XLogInsert, if you need an increasing sequence of numbers for unlogged
+ * relations.
+ */
+XLogRecPtr
+GetFakeLSNForUnloggedRel(void)
+{
+ XLogRecPtr nextUnloggedLSN;
+
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ /* increment the unloggedLSN counter, need SpinLock */
+ SpinLockAcquire(&xlogctl->ulsn_lck);
+ nextUnloggedLSN = xlogctl->unloggedLSN++;
+ SpinLockRelease(&xlogctl->ulsn_lck);
+
+ return nextUnloggedLSN;
+}
+
+/*
* Auto-tune the number of XLOG buffers.
*
* The preferred setting for wal_buffers is about 3% of shared_buffers, with
@@ -3835,6 +3865,7 @@ XLOGShmemInit(void)
XLogCtl->WalWriterSleeping = false;
XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
SpinLockInit(&XLogCtl->info_lck);
+ SpinLockInit(&XLogCtl->ulsn_lck);
InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
/*
@@ -3979,6 +4010,7 @@ BootStrapXLOG(void)
ControlFile->time = checkPoint.time;
ControlFile->checkPoint = checkPoint.redo;
ControlFile->checkPointCopy = checkPoint;
+ ControlFile->unloggedLSN = 1;
/* Set important parameter values for use when replaying WAL */
ControlFile->MaxConnections = MaxConnections;
@@ -5021,6 +5053,16 @@ StartupXLOG(void)
XLogCtl->ckptXid = checkPoint.nextXid;
/*
+ * Initialize unlogged LSN. On a clean shutdown, it's restored from
+ * the control file. On recovery, all unlogged relations are blown away,
+ * so unlogged LSN is reset too.
+ */
+ if (ControlFile->state == DB_SHUTDOWNED)
+ XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
+ else
+ XLogCtl->unloggedLSN = 1;
+
+ /*
* We must replay WAL entries using the same TimeLineID they were created
* under, so temporarily adopt the TLI indicated by the checkpoint (see
* also xlog_redo()).
@@ -6894,6 +6936,16 @@ CreateCheckPoint(int flags)
/* crash recovery should always recover to the end of WAL */
ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
ControlFile->minRecoveryPointTLI = 0;
+
+ /*
+ * Persist unloggedLSN value. It's reset on crash recovery, so this goes
+ * unused on non-shutdown checkpoints, but seems useful to store it always
+ * for debugging purposes.
+ */
+ SpinLockAcquire(&XLogCtl->ulsn_lck);
+ ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
+ SpinLockRelease(&XLogCtl->ulsn_lck);
+
UpdateControlFile();
LWLockRelease(ControlFileLock);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 13b80ae..405ff61 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1922,9 +1922,24 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
* Force XLOG flush up to buffer's LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file changes
* they describe do.
+ *
+ * However, this rule does not apply to unlogged relations, which will be
+ * lost after a crash anyway. Most unlogged relation pages do not bear
+ * LSNs since we never emit WAL records for them, and therefore flushing
+ * up through the buffer LSN would be useless, but harmless. However, GiST
+ * indexes use LSNs internally to track page-splits, and therefore unlogged
+ * GiST pages bear "fake" LSNs generated by GetFakeLSNForUnloggedRel. It
+ * is unlikely but possible that the fake LSN counter could advance past
+ * the WAL insertion point; and if it did happen, attempting to flush WAL
+ * through that location would fail, with disastrous system-wide
+ * consequences. To make sure that can't happen, skip the flush if the
+ * buffer isn't permanent.
*/
- recptr = BufferGetLSN(buf);
- XLogFlush(recptr);
+ if (buf->flags & BM_PERMANENT)
+ {
+ recptr = BufferGetLSN(buf);
+ XLogFlush(recptr);
+ }
/*
* Now it's safe to write buffer to disk. Note that no one else should
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 67ebc88..4763059 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -238,6 +238,9 @@ main(int argc, char *argv[])
ControlFile.checkPointCopy.oldestMultiDB);
printf(_("Time of latest checkpoint: %s\n"),
ckpttime_str);
+ printf(_("Fake LSN counter for unlogged rels: %X/%X\n"),
+ (uint32) (ControlFile.unloggedLSN >> 32),
+ (uint32) ControlFile.unloggedLSN);
printf(_("Min recovery ending location: %X/%X\n"),
(uint32) (ControlFile.minRecoveryPoint >> 32),
(uint32) ControlFile.minRecoveryPoint);
diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c
index 8e7fe7e..41f0ee4 100644
--- a/src/bin/pg_resetxlog/pg_resetxlog.c
+++ b/src/bin/pg_resetxlog/pg_resetxlog.c
@@ -506,6 +506,7 @@ GuessControlValues(void)
ControlFile.state = DB_SHUTDOWNED;
ControlFile.time = (pg_time_t) time(NULL);
ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
+ ControlFile.unloggedLSN = 1;
/* minRecoveryPoint, backupStartPoint and backupEndPoint can be left zero */
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index c2f9031..cae6dbc 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -512,7 +512,7 @@ extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
GISTENTRY *entry2, bool isnull2,
Datum *dst, bool *dstisnull);
-extern XLogRecPtr GetXLogRecPtrForTemp(void);
+extern XLogRecPtr gistGetFakeLSN(Relation rel);
/* gistvacuum.c */
extern Datum gistbulkdelete(PG_FUNCTION_ARGS);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 72e3242..8a65492 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -294,6 +294,7 @@ extern char *XLogFileNameP(TimeLineID tli, XLogSegNo segno);
extern void UpdateControlFile(void);
extern uint64 GetSystemIdentifier(void);
+extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
extern Size XLOGShmemSize(void);
extern void XLOGShmemInit(void);
extern void BootStrapXLOG(void);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index ec8cea7..28a6c11 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -124,6 +124,8 @@ typedef struct ControlFileData
CheckPoint checkPointCopy; /* copy of last check point record */
+ XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
+
/*
* These two values determine the minimum point we must recover up to
* before starting up:
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers