Re: [HACKERS] WIP: long transactions on hot standby feedback replica / proof of concept

2018-02-28 Thread Ivan Kartyshov
and aggressive autovacuum.

autovacuum = on
autovacuum_max_workers = 1
autovacuum_naptime = 1s
autovacuum_vacuum_threshold = 1
autovacuum_vacuum_cost_delay = 0

Test:
Here we will start the replica and begin a repeatable read transaction on 
a table, then we stop the replica's postmaster to prevent it from starting 
the walreceiver worker (on master startup) and sending the master its 
transaction xmin over the hot_standby_feedback message.

MASTERREPLICA
start
CREATE TABLE test AS (SELECT id, 1 AS value FROM 
generate_series(1,1000) id);

stop
start
BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
SELECT * FROM test;
stop postmaster with gdb
start
DELETE FROM test WHERE id > 0;
wait till autovacuum removes the dead tuples and the xmin has changed
release postmaster with gdb
--- Result on replica
FATAL: terminating connection due to conflict with recovery
DETAIL: User query might have needed to see row versions that must be 
removed.


There is one feature of the standby's behavior that lets us allow 
the autovacuum to cut off pages of the table (at the end of the relation) 
that no one else needs (because they contain only dead and removed tuples). 
So if the standby's SEQSCAN or another *SCAN mdread()s a page that is 
damaged or has been deleted, it will receive a zero page, and will not 
abort the request with an ERROR.


Could you give me your thoughts on these patches?

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index cff49ba..8e6c525 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -27,8 +27,10 @@
 #include "catalog/catalog.h"
 #include "catalog/storage.h"
 #include "catalog/storage_xlog.h"
+#include "postmaster/autovacuum.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
+#include "storage/lock.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -269,6 +271,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 		xlrec.blkno = nblocks;
 		xlrec.rnode = rel->rd_node;
 		xlrec.flags = SMGR_TRUNCATE_ALL;
+		xlrec.isautovacuum = IsAutoVacuumWorkerProcess();
 
 		XLogBeginInsert();
 		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
@@ -495,6 +498,16 @@ smgr_redo(XLogReaderState *record)
 		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
 		SMgrRelation reln;
 		Relation	rel;
+		bool		isautovacuum = false;
+
+		/*
+		 * Check if the truncation was made by autovacuum; if so, take an
+		 * Exclusive lock, because the AccessExclusive lock from the master
+		 * was previously blocked to let long transactions run on the replica.
+		 * NB: do it only InHotStandby
+		 */
+		if (InHotStandby)
+			isautovacuum = xlrec->isautovacuum;
 
 		reln = smgropen(xlrec->rnode, InvalidBackendId);
 
@@ -525,10 +538,29 @@ smgr_redo(XLogReaderState *record)
 
 		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
 		{
+			LOCKTAG		locktag;
+
+			/*
+			 * If isautovacuum is true, then we assume that the truncate
+			 * WAL record was formed by the autovacuum, and we have to take
+			 * an ExclusiveLock on the relation ourselves, because we didn't
+			 * apply the AccessExclusiveLock from the master, to let long
+			 * transactions work on the replica.
+			 */
+			if (isautovacuum)
+			{
+/* Behave like LockRelationForExtension */
+SET_LOCKTAG_RELATION_EXTEND(locktag, xlrec->rnode.dbNode, xlrec->rnode.relNode);
+(void) LockAcquire(&locktag, ExclusiveLock, false, false);
+			}
+
 			smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
 
 			/* Also tell xlogutils.c about it */
 			XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
+
+			if (isautovacuum)
+LockRelease(&locktag, ExclusiveLock, true);
 		}
 
 		/* Truncate FSM and VM too */
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 44ed209..34fbd30 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -23,6 +23,7 @@
 #include "access/xloginsert.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "postmaster/autovacuum.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
 #include "storage/proc.h"
@@ -37,6 +38,7 @@
 int			vacuum_defer_cleanup_age;
 int			max_standby_archive_delay = 30 * 1000;
 int			max_standby_streaming_delay = 30 * 1000;
+extern bool hot_standby_feedback;
 
 static List *RecoveryLockList;
 
@@ -805,10 +807,17 @@ standby_redo(XLogReaderState *record)
 		xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
 		int			i;
 
-		for (i = 0; i < xlrec->nlocks; i++)
-			StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
-			  xlrec->locks[i].dbOid,
-			  xlrec->locks[i].relOid);
+		/*
+		 * If this xlog standby lock was formed by autovacuum, then ignore it
+		 * because this can cause 

Re: [HACKERS] make async slave to wait for lsn to be replayed

2018-03-06 Thread Ivan Kartyshov

Andres Freund писал 2018-03-02 03:47:

On 2018-02-02 19:41:37 +, Simon Riggs wrote:
On 2 February 2018 at 18:46, Robert Haas  
wrote:

> On Fri, Feb 2, 2018 at 3:46 AM, Simon Riggs  wrote:
>> In PG11, I propose the following command, sticking mostly to Ants'
>> syntax, and allowing to wait for multiple events before it returns. It
>> doesn't hold snapshot and will not get cancelled by Hot Standby.
>>
>> WAIT FOR event [, event ...] options
>>
>> event is
>> LSN value
>> TIMESTAMP value
>>
>> options
>> TIMEOUT delay
>> UNTIL TIMESTAMP timestamp
>> (we have both, so people don't need to do math, they can use whichever
>> they have)
>
> WAIT FOR TIMEOUT sounds a lot like SELECT pg_sleep_for(), and WAIT
> UNTIL TIMESTAMP sounds a lot like SELECT pg_sleep_until().

Yes, it sounds very similar. It's the behavior that differs; I read
and agreed with yours and Thomas' earlier comments on that point.

As pointed out upthread, the key difference is whether it gets
cancelled on Hot Standby and whether you can call it in a non-READ
COMMITTED transaction.


Given that nobody has updated the patch or even discussed doing so, I
assume this would CF issue should now appropriately be classified as
returned with feedback?


Hello, I am now preparing the patch with the syntax that Simon offered, 
and in a few days I will update the patch.

Thank you for your interest in this thread.

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company



Re: [HACKERS] WIP: long transactions on hot standby feedback replica / proof of concept

2018-08-09 Thread Ivan Kartyshov

Hello, thank you for your review.

Alexander Korotkov писал 2018-06-20 20:54:

Thinking about that more I found that adding vacuum mark as an extra
argument to LockAcquireExtended is also wrong.  It would be still hard
to determine if we should log the lock in LogStandbySnapshot().  We'll
have to add that flag to shared memory table of locks.  And that looks
like a kludge.

Therefore, I'd like to propose another approach: introduce new lock
level.  So, AccessExclusiveLock will be split into two
AccessExclusiveLocalLock and AccessExclusiveLock.  In spite of
AccessExclusiveLock, AccessExclusiveLocalLock will be not logged to
standby, and used for heap truncation.

I expect some resistance to my proposal, because fixing this "small
bug" doesn't deserve new lock level.  But current behavior of
hot_standby_feedback is buggy.  From user prospective,
hot_standby_feedback option is just non-worker, which causes master to
bloat without protection for standby queries from cancel.  We need to
fix that, but I don't see other proper way to do that except adding
new lock level...


Your proposal is very interesting; it made the patch smaller and more 
beautiful.
So I updated the patch and made changes in accordance with the proposed 
concept of the special AccessExclusiveLocalLock.


I don't yet understand what is the problem here and why this patch
should solve that.  As I get idea of this patch, GetOldestXmin() will
initialize xmin of walsender with lowest xmin of replication slot.
But xmin of replication slots will be anyway taken into account by
GetSnapshotData() called by vacuum.


How to test:
Make async replica, turn on feedback, reduce 
max_standby_streaming_delay

and aggressive autovacuum.


You forgot to mention, that one should setup the replication using
replication slot.  Naturally, if replication slot isn't exist, then
master shouldn't keep dead tuples for disconnected standby.  Because
master doesn't know if standby will reconnect or is it gone forever.


I tried to make hot_standby_feedback work without replication slots,
and I found a small window in which vacuum on the master can
truncate heap pages that the standby needs for a just-started
transaction.

How to test:
Make async replica, turn on feedback and reduce
max_standby_streaming_delay.
Make autovacuum more aggressive.
autovacuum = on
autovacuum_max_workers = 1
autovacuum_naptime = 1s
autovacuum_vacuum_threshold = 1
autovacuum_vacuum_cost_delay = 0



Test1:
Here we will do a load on the master and simulation of  a long
transaction with repeated 1 second SEQSCANS on the replica (by  calling
pg_sleep 1 second duration every 6 seconds).
MASTERREPLICA
 hot_standby = on
 max_standby_streaming_delay = 1s
 hot_standby_feedback = on
start
CREATE TABLE test AS (SELECT id, 1 AS value
FROM generate_series(1,1) id);
pgbench -T600 -P2 -n --file=master.sql postgres
(update test set value = value;)
 start
 BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;
 SELECT pg_sleep(value) FROM test;
 \watch 6



---Autovacuum truncate pages at the end
Result on replica:
FATAL: terminating connection due to conflict with recovery
DETAIL: User was holding a relation lock for too long.



On Patched version lazy_vacuum_truncation passed without fatal errors.



Only occasionally an error occurs, because this test is too synthetic:
ERROR: canceling statement due to conflict with recovery
DETAIL: User was holding shared buffer pin for too long.
It is caused by ResolveRecoveryConflictWithSnapshot being raised while
redoing some visibility flags; to avoid this conflict we can run test2 or
increase max_standby_streaming_delay.



Test2:
Here we will do a load on the master and simulation of  a long
transaction on the replica (by  taking LOCK on table)
MASTERREPLICA
 hot_standby = on
 max_standby_streaming_delay = 1s
 hot_standby_feedback = on
start
CREATE TABLE test AS (SELECT id, 1 AS value FROM generate_series(1,1)
id);
pgbench -T600 -P2 -n --file=master.sql postgres
(update test set value = value;)
 start
 BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;
 LOCK TABLE test IN ACCESS SHARE MODE;
 select * from test;
 \watch 6



---Autovacuum truncate pages at the end
Result on replica:
FATAL: terminating connection due to conflict with recovery
DETAIL: User was holding a relation lock for too long.

On Patched version lazy_vacuum_truncation passed without fatal errors.

I would like to hear your opinion on this implementation.

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/doc/src/sgml/mvcc.sgml b/doc/src/sgml/mvcc.sgml
index 73934e5..73975cc 100644
--- a/doc/src/sgml/mvcc.sgml
+++ b/doc/src/sgml/mvcc.sgml
@@ -854,8 +854,8 @@ ERROR:  could not serialize access due to read/write dependencies among transact


 
- Conflicts with the ACCESS EXCLUSIVE lock
- mode only.

[Patch] Checksums for SLRU files

2017-12-31 Thread Ivan Kartyshov
Hello, I'd like to show my implementation of SLRU file protection with 
checksums.


It only has an effect if checksums on the database are enabled.
Detection of a checksum failure during a read normally causes PostgreSQL 
to report an error. Setting ignore_slru_checksum_failure to on causes 
the system to ignore the failure (but still report a warning) and 
continue processing. It is similar to ignore_checksum_failure but for 
SLRU files. The default setting is true, and it can only be changed 
by a database admin. To change it, use ALTER SYSTEM and reload the 
configuration:

ALTER SYSTEM SET ignore_slru_checksum_failure = off;
SELECT pg_reload_conf();

Implementation:
1) The checksum is written in the last 2 bytes of every page.
2) The checksum is calculated and written to the page when the page is 
written to disk (the same way relations do it).
3) The checksum is checked the same way relations do it, on 
Read/PhysicalRead, when the GUC ignore_slru_checksum_failure = false 

(default = true).

I would like to hear your thoughts on my patch.

--
Ivan Kartyshov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Companydiff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index aeda826..8501dd2 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8394,6 +8394,29 @@ LOG:  CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1)
   
  
 
+
+  ignore_slru_checksum_failure (boolean)
+  
+   ignore_slru_checksum_failure configuration parameter
+  
+  
+  
+   
+Only has effect if  are enabled.
+   
+   
+Detection of a checksum failure during a read normally causes
+PostgreSQL to report an error. Setting ignore_slru_checksum_failure
+to on causes the system to ignore the failure (but still report a warning), and
+continue processing. Similar to ignore_checksum_failure but for
+non-relation files. The default setting is true, and it can only be
+changed by a superuser. For changes, use ALTER SYSTEM and reload configuration:
+ALTER SYSTEM SET ignore_slru_checksum_failure = off; 
+SELECT pg_reload_conf(); 
+   
+  
+ 
+
 
   zero_damaged_pages (boolean)
   
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index a3e2b12..87ed09c 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -58,7 +58,7 @@
 /* We need two bits per xact, so four xacts fit in a byte */
 #define CLOG_BITS_PER_XACT	2
 #define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE ((BLCKSZ - CHKSUMSZ) * CLOG_XACTS_PER_BYTE)
 #define CLOG_XACT_BITMASK	((1 << CLOG_BITS_PER_XACT) - 1)
 
 #define TransactionIdToPage(xid)	((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 7142ece..35c317e 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -106,7 +106,7 @@
  */
 
 /* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / sizeof(MultiXactOffset))
 
 #define MultiXactIdToOffsetPage(xid) \
 	((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -2039,7 +2039,6 @@ TrimMultiXact(void)
 	{
 		int			slotno;
 		MultiXactOffset *offptr;
-
 		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
 		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 		offptr += entryno;
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 9dd7719..b7a154c 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -58,7 +58,18 @@
 #include "storage/fd.h"
 #include "storage/shmem.h"
 #include "miscadmin.h"
+#include "utils/guc.h"
 
+#include "storage/checksum.h"
+#include "utils/memutils.h"
+
+/*
+ * GUC variable
+ * Set from backend:
+ * alter system set ignore_slru_checksum_failure = on/off;
+ * select pg_reload_conf();
+ */
+bool		ignore_slru_checksum_failure = true;
 
 #define SlruFileName(ctl, path, seg) \
 	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
@@ -376,6 +387,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
   TransactionId xid)
 {
 	SlruShared	shared = ctl->shared;
+	int			checksum;
+	static char *pageCopy = NULL;
 
 	/* Outer loop handles restart if we must wait for someone else's I/O */
 	for (;;)
@@ -426,6 +439,22 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 		/* Do the read */
 		ok = SlruPhysicalReadPage(ctl, pageno, slotno);
 
+		if (pageCopy == NULL)
+			pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ);
+
+		memcpy(pageCopy, shared->page_buffer[slot