Re: [HACKERS] WIP: long transactions on hot standby feedback replica / proof of concept
and aggressive autovacuum:

autovacuum = on
autovacuum_max_workers = 1
autovacuum_naptime = 1s
autovacuum_vacuum_threshold = 1
autovacuum_vacuum_cost_delay = 0

Test:
Here we start the replica and begin a repeatable read transaction on a table,
then we stop the replica's postmaster to prevent it from starting the
walreceiver worker (on master startup) and sending the master its transaction
xmin in the hot_standby_feedback message.

MASTER:  start
MASTER:  CREATE TABLE test AS (SELECT id, 1 AS value FROM generate_series(1,1000) id);
MASTER:  stop
REPLICA: start
REPLICA: BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
REPLICA: SELECT * FROM test;
REPLICA: stop postmaster with gdb
MASTER:  start
MASTER:  DELETE FROM test WHERE id > 0;
MASTER:  wait till autovacuum removes the deleted rows and xmin changes
REPLICA: release postmaster with gdb
---
Result on replica:
FATAL: terminating connection due to conflict with recovery
DETAIL: User query might have needed to see row versions that must be removed.

There is one feature of standby behavior that lets us allow autovacuum to cut
off pages at the end of the relation that no one else needs (because they
contain only dead, already removed tuples). So if a standby SEQSCAN or another
*SCAN reads (via mdread) a page that is damaged or has been truncated away, it
receives a zeroed page instead of aborting the query with an ERROR.
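For reference, the behavior relied on here is the existing short-read handling
in mdread(); the following is only a simplified paraphrase of that branch in
src/backend/storage/smgr/md.c (function name and error message are shortened),
not code from the attached patch:

#include "postgres.h"

#include "access/xlog.h"		/* InRecovery */
#include "storage/bufmgr.h"		/* zero_damaged_pages GUC */

/*
 * Paraphrased sketch of mdread()'s short-read branch: a read past the
 * current end of the relation returns an all-zeroes page during recovery
 * instead of failing, which is what keeps a standby scan alive across a
 * truncation it has not taken any lock for.
 */
static void
zero_or_error_on_short_read(char *buffer, int nbytes, BlockNumber blocknum)
{
	if (nbytes == BLCKSZ)
		return;					/* a full block was read, nothing to do */

	if (zero_damaged_pages || InRecovery)
		MemSet(buffer, 0, BLCKSZ);	/* pretend the block is there, but empty */
	else
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("could not read block %u: read only %d of %d bytes",
						blocknum, nbytes, BLCKSZ)));
}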
I would appreciate your thoughts on these patches.

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index cff49ba..8e6c525 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -27,8 +27,10 @@
 #include "catalog/catalog.h"
 #include "catalog/storage.h"
 #include "catalog/storage_xlog.h"
+#include "postmaster/autovacuum.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
+#include "storage/lock.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -269,6 +271,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 		xlrec.blkno = nblocks;
 		xlrec.rnode = rel->rd_node;
 		xlrec.flags = SMGR_TRUNCATE_ALL;
+		xlrec.isautovacuum = IsAutoVacuumWorkerProcess();
 
 		XLogBeginInsert();
 		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
@@ -495,6 +498,16 @@ smgr_redo(XLogReaderState *record)
 		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
 		SMgrRelation reln;
 		Relation	rel;
+		bool		isautovacuum = false;
+
+		/*
+		 * Check if the truncation was made by autovacuum; if so, take an
+		 * ExclusiveLock, because the AccessExclusiveLock from the master was
+		 * blocked earlier to let long transactions run on the replica.
+		 * NB: do it only InHotStandby
+		 */
+		if (InHotStandby)
+			isautovacuum = xlrec->isautovacuum;
 
 		reln = smgropen(xlrec->rnode, InvalidBackendId);
 
@@ -525,10 +538,29 @@ smgr_redo(XLogReaderState *record)
 
 		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
 		{
+			LOCKTAG		locktag;
+
+			/*
+			 * If isautovacuum is true, then we assume that the truncate WAL
+			 * record was written by autovacuum, and we ourselves have to
+			 * take an ExclusiveLock on the relation, because we did not
+			 * apply the AccessExclusiveLock from the master, to let long
+			 * transactions work on the replica.
+			 */
+			if (isautovacuum)
+			{
+				/* Behave like LockRelationForExtension */
+				SET_LOCKTAG_RELATION_EXTEND(locktag, xlrec->rnode.dbNode, xlrec->rnode.relNode);
+				(void) LockAcquire(&locktag, ExclusiveLock, false, false);
+			}
+
 			smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
 
 			/* Also tell xlogutils.c about it */
 			XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
+
+			if (isautovacuum)
+				LockRelease(&locktag, ExclusiveLock, true);
 		}
 
 		/* Truncate FSM and VM too */
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 44ed209..34fbd30 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -23,6 +23,7 @@
 #include "access/xloginsert.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "postmaster/autovacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/proc.h"
@@ -37,6 +38,7 @@
 int			vacuum_defer_cleanup_age;
 int			max_standby_archive_delay = 30 * 1000;
 int			max_standby_streaming_delay = 30 * 1000;
+extern bool hot_standby_feedback;
 
 static List *RecoveryLockList;
 
@@ -805,10 +807,17 @@ standby_redo(XLogReaderState *record)
 		xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
 		int			i;
 
-		for (i = 0; i < xlrec->nlocks; i++)
-			StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
-											  xlrec->locks[i].dbOid,
-											  xlrec->locks[i].relOid);
+		/*
+		 * If this xlog standby lock was formed by autovacuum, then ignore it
+		 * because this can cause
Re: [HACKERS] make async slave to wait for lsn to be replayed
Andres Freund wrote 2018-03-02 03:47:
> On 2018-02-02 19:41:37 +, Simon Riggs wrote:
>> On 2 February 2018 at 18:46, Robert Haas wrote:
>>> On Fri, Feb 2, 2018 at 3:46 AM, Simon Riggs wrote:
>>>> In PG11, I propose the following command, sticking mostly to Ants'
>>>> syntax, and allowing to wait for multiple events before it returns. It
>>>> doesn't hold snapshot and will not get cancelled by Hot Standby.
>>>>
>>>> WAIT FOR event [, event ...] options
>>>>
>>>> event is
>>>>   LSN value
>>>>   TIMESTAMP value
>>>>
>>>> options
>>>>   TIMEOUT delay
>>>>   UNTIL TIMESTAMP timestamp
>>>> (we have both, so people don't need to do math, they can use whichever
>>>> they have)
>>>
>>> WAIT FOR TIMEOUT sounds a lot like SELECT pg_sleep_for(), and WAIT
>>> UNTIL TIMESTAMP sounds a lot like SELECT pg_sleep_until().
>>
>> Yes, it sounds very similar. It's the behavior that differs; I read and
>> agreed with yours and Thomas' earlier comments on that point.
>>
>> As pointed out upthread, the key difference is whether it gets cancelled
>> on Hot Standby and whether you can call it in a non-READ COMMITTED
>> transaction.
>
> Given that nobody has updated the patch or even discussed doing so, I
> assume this CF issue should now appropriately be classified as returned
> with feedback?

Hello, I am now preparing the patch based on the syntax Simon proposed, and I
will update the patch in a few days.

Thank you for your interest in this thread.

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
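Whatever syntax is finally chosen, on the standby side the command boils down
to blocking until replay reaches a target LSN. Below is a minimal sketch of
such a wait loop; the function name and the fixed 100 ms polling interval are
assumptions for illustration only, not code from the upcoming patch:

#include "postgres.h"

#include "access/xlog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/latch.h"

/*
 * Hypothetical sketch: wait until the startup process has replayed WAL up to
 * (at least) target_lsn, or until roughly timeout_ms milliseconds have
 * passed.  Returns true if the LSN was reached, false on timeout.
 */
static bool
WaitForLSNReplay(XLogRecPtr target_lsn, long timeout_ms)
{
	long		waited_ms = 0;

	while (GetXLogReplayRecPtr(NULL) < target_lsn)
	{
		if (timeout_ms > 0 && waited_ms >= timeout_ms)
			return false;		/* gave up before the LSN was replayed */

		/* Sleep briefly; a real patch would arrange an explicit wakeup. */
		(void) WaitLatch(MyLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						 100L /* ms */ ,
						 PG_WAIT_EXTENSION);
		ResetLatch(MyLatch);
		CHECK_FOR_INTERRUPTS();

		waited_ms += 100;		/* coarse accounting is fine for a sketch */
	}

	return true;
}

The interesting part is not the loop itself but, as discussed above, how the
command interacts with snapshots and Hot Standby cancellation, which is what
distinguishes it from pg_sleep_until().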
Re: [HACKERS] WIP: long transactions on hot standby feedback replica / proof of concept
Hello, thank you for your review.

Alexander Korotkov wrote 2018-06-20 20:54:
> Thinking about that more I found that adding vacuum mark as an extra
> argument to LockAcquireExtended is also wrong.  It would still be hard to
> determine if we should log the lock in LogStandbySnapshot().  We'll have
> to add that flag to the shared memory table of locks.  And that looks
> like a kludge.
>
> Therefore, I'd like to propose another approach: introduce a new lock
> level.  So, AccessExclusiveLock will be split into two:
> AccessExclusiveLocalLock and AccessExclusiveLock.  Unlike
> AccessExclusiveLock, AccessExclusiveLocalLock will not be logged to the
> standby, and will be used for heap truncation.
>
> I expect some resistance to my proposal, because fixing this "small bug"
> doesn't deserve a new lock level.  But the current behavior of
> hot_standby_feedback is buggy.  From the user's perspective, the
> hot_standby_feedback option simply doesn't work: it causes the master to
> bloat without protecting standby queries from cancellation.  We need to
> fix that, but I don't see another proper way to do that except adding a
> new lock level...

Your proposal is very interesting, and it made the patch smaller and more
elegant. So I updated the patch and made the changes in accordance with the
proposed concept of a special AccessExclusiveLocalLock.
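To make the idea concrete, here is a minimal sketch of the call-site change it
implies for vacuum's heap truncation; AccessExclusiveLocalLock below is only a
placeholder macro standing in for the proposed level, and this is not the
attached patch:

#include "postgres.h"

#include "storage/lmgr.h"
#include "utils/rel.h"

/*
 * Placeholder for the proposed lock level: it would conflict like
 * AccessExclusiveLock, but never be WAL-logged to the standby.
 */
#define AccessExclusiveLocalLock	AccessExclusiveLock

/*
 * Heap truncation would acquire the relation lock the same way
 * lazy_truncate_heap() does today, only at the local-only level, so no
 * AccessExclusiveLock record reaches the standby and queries running there
 * are not cancelled.
 */
static bool
try_truncation_lock(Relation onerel)
{
	return ConditionalLockRelation(onerel, AccessExclusiveLocalLock);
}

The real work is, of course, defining the level itself so that it conflicts
like AccessExclusiveLock but is never logged to the standby, as described in
the quoted proposal.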
> I don't yet understand what is the problem here and why this patch should
> solve that.  As I get the idea of this patch, GetOldestXmin() will
> initialize the xmin of the walsender with the lowest xmin of a replication
> slot.  But the xmin of replication slots will be taken into account anyway
> by GetSnapshotData() called by vacuum.
>
>> How to test:
>> Make async replica, turn on feedback, reduce max_standby_streaming_delay
>> and aggressive autovacuum.
>
> You forgot to mention that one should set up the replication using a
> replication slot.  Naturally, if the replication slot doesn't exist, then
> the master shouldn't keep dead tuples for a disconnected standby, because
> the master doesn't know whether the standby will reconnect or is gone
> forever.

I tried to solve hot_standby_feedback without replication slots, and I found
a small window in which vacuum on the master can truncate heap pages that the
standby still needs for a just-started transaction.

How to test:
Make an async replica, turn on feedback and reduce max_standby_streaming_delay.
Make autovacuum more aggressive.

autovacuum = on
autovacuum_max_workers = 1
autovacuum_naptime = 1s
autovacuum_vacuum_threshold = 1
autovacuum_vacuum_cost_delay = 0

Test1:
Here we put load on the master and simulate a long transaction with repeated
1-second SEQSCANs on the replica (by calling pg_sleep with 1-second duration
every 6 seconds).

REPLICA settings:
  hot_standby = on
  max_standby_streaming_delay = 1s
  hot_standby_feedback = on

MASTER:  start
MASTER:  CREATE TABLE test AS (SELECT id, 1 AS value FROM generate_series(1,1) id);
MASTER:  pgbench -T600 -P2 -n --file=master.sql postgres
         (master.sql: update test set value = value;)
REPLICA: start
REPLICA: BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;
REPLICA: SELECT pg_sleep(value) FROM test;
REPLICA: \watch 6
--- autovacuum truncates pages at the end of the relation

Result on replica:
FATAL: terminating connection due to conflict with recovery
DETAIL: User was holding a relation lock for too long.

On the patched version, lazy_vacuum_truncation passes without fatal errors.
Only sometimes an error occurs, because this test is too synthetic:
ERROR: canceling statement due to conflict with recovery
DETAIL: User was holding shared buffer pin for too long.
It is caused by ResolveRecoveryConflictWithSnapshot being raised while redoing
some visibility flags; to avoid this conflict we can run Test2 instead, or
increase max_standby_streaming_delay.

Test2:
Here we put load on the master and simulate a long transaction on the replica
(by taking a LOCK on the table).

REPLICA settings:
  hot_standby = on
  max_standby_streaming_delay = 1s
  hot_standby_feedback = on

MASTER:  start
MASTER:  CREATE TABLE test AS (SELECT id, 1 AS value FROM generate_series(1,1) id);
MASTER:  pgbench -T600 -P2 -n --file=master.sql postgres
         (master.sql: update test set value = value;)
REPLICA: start
REPLICA: BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;
REPLICA: LOCK TABLE test IN ACCESS SHARE MODE;
REPLICA: select * from test;
REPLICA: \watch 6
--- autovacuum truncates pages at the end of the relation

Result on replica:
FATAL: terminating connection due to conflict with recovery
DETAIL: User was holding a relation lock for too long.

On the patched version, lazy_vacuum_truncation passes without fatal errors.

I would like to hear your opinion on this implementation.

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

diff --git a/doc/src/sgml/mvcc.sgml b/doc/src/sgml/mvcc.sgml
index 73934e5..73975cc 100644
--- a/doc/src/sgml/mvcc.sgml
+++ b/doc/src/sgml/mvcc.sgml
@@ -854,8 +854,8 @@ ERROR: could not serialize access due to read/write dependencies among transact
 
-        Conflicts with the ACCESS EXCLUSIVE lock
-        mode only.
[Patch] Checksums for SLRU files
Hello, I'd like to show my implementation of SLRU file protection with
checksums. It only takes effect if data checksums are enabled for the
database.

Detection of a checksum failure during a read normally causes PostgreSQL to
report an error. Setting ignore_slru_checksum_failure to on causes the system
to ignore the failure (but still report a warning) and continue processing.
It is similar to ignore_checksum_failure, but for SLRU files. The default
setting is true, and it can only be changed by a database administrator.
To change it, use ALTER SYSTEM and reload the configuration:

ALTER SYSTEM SET ignore_slru_checksum_failure = off;
SELECT pg_reload_conf();

Implementation:
1) The checksum is stored in the last 2 bytes of every page.
2) The checksum is calculated and written into the page when the page is
   written to disk (the same way it is done for relations).
3) The checksum is verified the same way as for relations, on
   Read/PhysicalRead, when the GUC ignore_slru_checksum_failure = false
   (default = true).
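To illustrate the page layout this implies, here is a small self-contained
sketch: the last two bytes of each SLRU page are reserved for a checksum that
is set just before the page is written and verified after a physical read.
The checksum function below is only a stand-in for illustration and is not
the algorithm the patch uses:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define BLCKSZ	 8192			/* PostgreSQL block size */
#define CHKSUMSZ 2				/* trailing bytes reserved on every SLRU page */

/* Stand-in 16-bit checksum (folded FNV-1a); not PostgreSQL's algorithm. */
static uint16_t
slru_page_checksum(const char *page)
{
	uint32_t	h = 2166136261u;

	for (int i = 0; i < BLCKSZ - CHKSUMSZ; i++)
	{
		h ^= (unsigned char) page[i];
		h *= 16777619u;
	}
	return (uint16_t) (h ^ (h >> 16));
}

/* Called just before the page buffer is written out to disk. */
static void
slru_set_checksum(char *page)
{
	uint16_t	sum = slru_page_checksum(page);

	memcpy(page + BLCKSZ - CHKSUMSZ, &sum, CHKSUMSZ);
}

/* Called after a physical read; false means the page is corrupted. */
static bool
slru_verify_checksum(const char *page)
{
	uint16_t	stored;

	memcpy(&stored, page + BLCKSZ - CHKSUMSZ, CHKSUMSZ);
	return stored == slru_page_checksum(page);
}

Reserving the trailing bytes is also why the patch shrinks CLOG_XACTS_PER_PAGE
and MULTIXACT_OFFSETS_PER_PAGE to values based on (BLCKSZ - CHKSUMSZ), so the
page payload never overlaps the checksum.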
I would like to hear your thoughts on my patch.

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index aeda826..8501dd2 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8394,6 +8394,29 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1)
+
+     ignore_slru_checksum_failure (boolean)
+
+      ignore_slru_checksum_failure configuration parameter
+
+
+
+
+       Only has effect if data checksums are enabled.
+
+       Detection of a checksum failure during a read normally causes
+       PostgreSQL to report an error. Setting ignore_slru_checksum_failure
+       to on causes the system to ignore the failure (but still report a warning), and
+       continue processing. Similar to ignore_checksum_failure, but for
+       non-relation files. The default setting is true, and it can only be
+       changed by a superuser.
+       For changes, use ALTER SYSTEM and reload configuration:
+ALTER SYSTEM SET ignore_slru_checksum_failure = off;
+SELECT pg_reload_conf();
+
+
+
+
      zero_damaged_pages (boolean)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index a3e2b12..87ed09c 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -58,7 +58,7 @@
 /* We need two bits per xact, so four xacts fit in a byte */
 #define CLOG_BITS_PER_XACT	2
 #define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE ((BLCKSZ - CHKSUMSZ) * CLOG_XACTS_PER_BYTE)
 #define CLOG_XACT_BITMASK	((1 << CLOG_BITS_PER_XACT) - 1)
 
 #define TransactionIdToPage(xid)	((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 7142ece..35c317e 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -106,7 +106,7 @@
  */
 
 /* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / sizeof(MultiXactOffset))
 
 #define MultiXactIdToOffsetPage(xid) \
 	((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -2039,7 +2039,6 @@ TrimMultiXact(void)
 	{
 		int			slotno;
 		MultiXactOffset *offptr;
-
 		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
 		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 		offptr += entryno;
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 9dd7719..b7a154c 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -58,7 +58,18 @@
 #include "storage/fd.h"
 #include "storage/shmem.h"
 #include "miscadmin.h"
+#include "utils/guc.h"
+#include "storage/checksum.h"
+#include "utils/memutils.h"
+
+/*
+ * GUC variable
+ * Set from backend:
+ *		alter system set ignore_slru_checksum_failure = on/off;
+ *		select pg_reload_conf();
+ */
+bool		ignore_slru_checksum_failure = true;
 
 #define SlruFileName(ctl, path, seg) \
 	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
@@ -376,6 +387,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 				  TransactionId xid)
 {
 	SlruShared	shared = ctl->shared;
+	int			checksum;
+	static char *pageCopy = NULL;
 
 	/* Outer loop handles restart if we must wait for someone else's I/O */
 	for (;;)
@@ -426,6 +439,22 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 		/* Do the read */
 		ok = SlruPhysicalReadPage(ctl, pageno, slotno);
 
+		if (pageCopy == NULL)
+			pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
+
+		memcpy(pageCopy, shared->page_buffer[slot