I'm working with a client that uses Postgres on what amounts to an appliance.
The database is therefore subject to occasional torture such as, in this particular case, running out of disk space while performing a million plus queries (of mixed varieties, many using plpgsql with exception handling -- more on that later), and eventually being power-cycled. Upon restart, clog_redo was called approx 885000 times (CLOG_ZEROPAGE) during recovery, which took almost 2 hours on their hardware. I should note that this is on Postgres 8.3.x. After studying the source, I can only see one possible way that this could have occurred: In varsup.c:GetNewTracsactionId(), ExtendCLOG() needs to succeed on a freshly zeroed clog page, and ExtendSUBTRANS() has to fail. Both of these calls can lead to a page being pushed out of shared buffers and to disk, so given a lack of disk space, sufficient clog buffers, but lack of subtrans buffers, this could happen. At that point the transaction id does not get advanced, so clog zeros the same page, extendSUBTRANS() fails again, rinse and repeat. I believe in the case above, subtrans buffers were exhausted due to the extensive use of plpgsql with exception handling. I can simulate this failure with the attached debug-clog patch which makes use of two pre-existing debug GUCs to selectively interject an ERROR in between calls to ExtendCLOG() and ExtendSUBTRANS(). If you want to test this yourself, apply this patch and use the function in test_clog.sql to generate a million or so transactions. After the first 32K or before (based on when clog gets its first opportunity to zero a new page) you should start seeing messages about injected ERRORs. Let a few hundred thousand ERRORs go by, then kill postgres. Recovery will take ages, because clog_redo is calling fsync hundreds of thousands of times in order to zero the same page over and over. The attached fix-clogredo diff is my proposal for a fix for this. Thoughts/alternatives, etc appreciated. Thanks, Joe -- Joe Conway credativ LLC: http://www.credativ.us Linux, PostgreSQL, and general Open Source Training, Service, Consulting, & 24x7 Support
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 52224b1..317bc2e 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -36,6 +36,7 @@ #include "access/slru.h" #include "access/transam.h" #include "postmaster/bgwriter.h" +#include "utils/guc.h" /* * Defines for CLOG page sizes. A page is the same BLCKSZ as is used @@ -355,6 +356,9 @@ ExtendCLOG(TransactionId newestXact) /* Zero the page and make an XLOG entry about it */ ZeroCLOGPage(pageno, true); + /* steal this variable for test -- means we've been here */ + Debug_print_rewritten = true; + LWLockRelease(CLogControlLock); } diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 8838d42..e55a67b 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -21,6 +21,7 @@ #include "storage/pmsignal.h" #include "storage/proc.h" #include "utils/builtins.h" +#include "utils/guc.h" /* Number of OIDs to prefetch (preallocate) per XLOG write */ @@ -107,6 +108,11 @@ GetNewTransactionId(bool isSubXact) * Extend pg_subtrans too. */ ExtendCLOG(xid); + if (Debug_print_rewritten && Debug_pretty_print) + { + Debug_print_rewritten = false; + elog(ERROR,"injected ERROR"); + } ExtendSUBTRANS(xid); /*
diff -cNr postgresql-8.3.13.orig/src/backend/access/transam/clog.c postgresql-8.3.13/src/backend/access/transam/clog.c *** postgresql-8.3.13.orig/src/backend/access/transam/clog.c Tue Dec 14 03:51:20 2010 --- postgresql-8.3.13/src/backend/access/transam/clog.c Thu Apr 28 12:04:52 2011 *************** *** 74,79 **** --- 75,81 ---- #define ClogCtl (&ClogCtlData) + static int last_pageno = -1; static int ZeroCLOGPage(int pageno, bool writeXlog); static bool CLOGPagePrecedes(int page1, int page2); *************** *** 471,476 **** --- 476,488 ---- memcpy(&pageno, XLogRecGetData(record), sizeof(int)); + /* avoid repeatedly zeroing the same page */ + if (InRecovery && pageno == last_pageno) + return; + + /* save state */ + last_pageno = pageno; + LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); slotno = ZeroCLOGPage(pageno, false);
create language plpgsql; \i /path/to/share/contrib/dblink.sql CREATE OR REPLACE FUNCTION test_clog(howmany int) RETURNS int AS $_$ DECLARE i int; arr text[]; dbname text; BEGIN dbname := current_database(); arr := dblink_get_connections(); IF arr IS NOT NULL THEN PERFORM dblink_disconnect('conn'); END IF; EXECUTE $$SELECT dblink_connect('conn','dbname=$$ || dbname || $$')$$; PERFORM dblink_exec('conn', 'DROP TABLE IF EXISTS clogtest'); PERFORM dblink_exec('conn', 'CREATE TABLE clogtest(id int primary key)'); PERFORM dblink_exec('conn', 'SET debug_pretty_print TO ON'); FOR i IN 1..howmany LOOP PERFORM dblink_exec('conn', 'insert into clogtest values(' || i || ')', false); IF (i % 5000) = 0 THEN RAISE WARNING '% iterations performed', i; RAISE WARNING 'Last ERROR: %', dblink_error_message('conn'); END IF; END LOOP; PERFORM dblink_disconnect('conn'); RETURN howmany; END; $_$ LANGUAGE plpgsql SET client_min_messages TO WARNING; SELECT test_clog(1000000);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers