One case is conflicting transactions T1 and T2 with different domain id, in
optimistic parallel replication in non-GTID mode. Then T2 will
wait_for_prior_commit on T1; and if T1 got a row lock wait on T2 it would
hang, as different domains caused the deadlock kill to be skipped in
thd_rpl_deadlock_check().

More generally, if we have transactions T1 and T2 in one domain/master
connection, and independently transactions U1 and U2 in another, then we can
still deadlock like this:

  T1 row low wait on U2
  U2 wait_for_prior_commit on U1
  U1 row lock wait on T2
  T2 wait_for_prior_commit on T1

This commit enforces the deadlock kill in these cases. If the waited-for
transaction is speculatively applied, then it will be deadlock killed in
case of a conflict, even if the two transactions are in different domains
or master connections.

Signed-off-by: Kristian Nielsen <kniel...@knielsen-hq.org>
---
 mysql-test/suite/rpl/r/rpl_mdev33798.result | 143 +++++++++++++++
 mysql-test/suite/rpl/t/rpl_mdev33798.cnf    |  17 ++
 mysql-test/suite/rpl/t/rpl_mdev33798.test   | 182 ++++++++++++++++++++
 sql/sql_class.cc                            |  42 ++++-
 4 files changed, 376 insertions(+), 8 deletions(-)
 create mode 100644 mysql-test/suite/rpl/r/rpl_mdev33798.result
 create mode 100644 mysql-test/suite/rpl/t/rpl_mdev33798.cnf
 create mode 100644 mysql-test/suite/rpl/t/rpl_mdev33798.test

diff --git a/mysql-test/suite/rpl/r/rpl_mdev33798.result 
b/mysql-test/suite/rpl/r/rpl_mdev33798.result
new file mode 100644
index 00000000000..8796e948546
--- /dev/null
+++ b/mysql-test/suite/rpl/r/rpl_mdev33798.result
@@ -0,0 +1,143 @@
+include/rpl_init.inc [topology=1->2,1->3]
+connect  server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2;
+connection server_2;
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
+SET @old_timeout= @@GLOBAL.lock_wait_timeout;
+SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=5;
+set global slave_parallel_mode= aggressive;
+SET GLOBAL lock_wait_timeout= 86400;
+SET GLOBAL innodb_lock_wait_timeout= 86400;
+SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+include/start_slave.inc
+connection server_1;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), 
(8, 0);
+connection server_2;
+include/stop_slave.inc
+connection server_2b;
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+a      b
+1      0
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+a      b
+5      0
+connection server_1;
+SET SESSION gtid_domain_id= 1;
+BEGIN;
+UPDATE t1 SET b=1 WHERE a=1;
+UPDATE t1 SET b=1 WHERE a=7;
+COMMIT;
+UPDATE t1 SET b=2 WHERE a=3;
+SET SESSION gtid_domain_id=2;
+BEGIN;
+UPDATE t1 SET b=3 WHERE a=5;
+UPDATE t1 SET b=3 WHERE a=3;
+COMMIT;
+UPDATE t1 SET b=4 WHERE a=7;
+SET SESSION gtid_domain_id= 0;
+include/save_master_gtid.inc
+connection server_2;
+include/start_slave.inc
+connection server_2b;
+ROLLBACK;
+connection server_2;
+include/sync_with_master_gtid.inc
+SELECT a, (
+(a=1 AND b=1) OR
+(a=3 AND (b=2 OR b=3)) OR
+(a=5 AND b=3) OR
+(a=7 AND (b=1 OR b=4)) OR
+((a MOD 2)=0 AND b=0)) AS `ok`
+  FROM t1
+ORDER BY a;
+a      ok
+1      1
+2      1
+3      1
+4      1
+5      1
+6      1
+7      1
+8      1
+connection server_3;
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+connection server_2;
+include/stop_slave.inc
+CHANGE MASTER 'm2' to master_port=MYPORT_3 , master_host='127.0.0.1', 
master_user='root', master_use_gtid=slave_pos;
+connection server_1;
+SET SESSION gtid_domain_id= 1;
+BEGIN;
+UPDATE t1 SET b=11 WHERE a=1;
+UPDATE t1 SET b=11 WHERE a=7;
+COMMIT;
+UPDATE t1 SET b=12 WHERE a=3;
+SET SESSION gtid_domain_id= 1;
+connection server_3;
+SET SESSION gtid_domain_id=3;
+BEGIN;
+UPDATE t1 SET b=13 WHERE a=5;
+UPDATE t1 SET b=13 WHERE a=3;
+COMMIT;
+UPDATE t1 SET b=14 WHERE a=7;
+include/save_master_gtid.inc
+connection server_2b;
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+a      b
+1      1
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+a      b
+5      3
+START ALL SLAVES;
+Warnings:
+Note   1937    SLAVE 'm2' started
+Note   1937    SLAVE '' started
+connection server_2b;
+ROLLBACK;
+connection server_1;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+connection server_3;
+include/save_master_gtid.inc
+connection server_2;
+include/sync_with_master_gtid.inc
+SELECT a, (
+(a=1 AND b=11) OR
+(a=3 AND (b=12 OR b=13)) OR
+(a=5 AND b=13) OR
+(a=7 AND (b=11 OR b=14)) OR
+((a MOD 2)=0 AND b=0)) AS `ok`
+  FROM t1
+ORDER BY a;
+a      ok
+1      1
+2      1
+3      1
+4      1
+5      1
+6      1
+7      1
+8      1
+SET default_master_connection = 'm2';
+include/stop_slave.inc
+RESET SLAVE 'm2' ALL;
+SET default_master_connection = '';
+connection server_3;
+include/start_slave.inc
+disconnect server_2b;
+connection server_1;
+DROP TABLE t1;
+connection server_2;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+set global slave_parallel_mode= @old_parallel_mode;
+SET GLOBAL lock_wait_timeout= @old_timeout;
+SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout;
+include/start_slave.inc
+include/rpl_end.inc
diff --git a/mysql-test/suite/rpl/t/rpl_mdev33798.cnf 
b/mysql-test/suite/rpl/t/rpl_mdev33798.cnf
new file mode 100644
index 00000000000..8e5125ea6ca
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_mdev33798.cnf
@@ -0,0 +1,17 @@
+!include suite/rpl/my.cnf
+
+[mysqld.1]
+log-slave-updates
+loose-innodb
+
+[mysqld.2]
+log-slave-updates
+loose-innodb
+
+[mysqld.3]
+log-slave-updates
+loose-innodb
+
+[ENV]
+SERVER_MYPORT_3=                @mysqld.3.port
+SERVER_MYSOCK_3=                @mysqld.3.socket
diff --git a/mysql-test/suite/rpl/t/rpl_mdev33798.test 
b/mysql-test/suite/rpl/t/rpl_mdev33798.test
new file mode 100644
index 00000000000..1448ed91133
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_mdev33798.test
@@ -0,0 +1,182 @@
+--source include/have_innodb.inc
+--source include/have_log_bin.inc
+--let $rpl_topology=1->2,1->3
+--source include/rpl_init.inc
+--connect (server_2b,127.0.0.1,root,,,$SERVER_MYPORT_2)
+
+--connection server_2
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode= @@GLOBAL.slave_parallel_mode;
+SET @old_timeout= @@GLOBAL.lock_wait_timeout;
+SET @old_innodb_timeout= @@GLOBAL.innodb_lock_wait_timeout;
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=5;
+set global slave_parallel_mode= aggressive;
+# High timeout so we get replication sync error and test failure if the
+# conflict handling is insufficient and lock wait timeout occurs.
+SET GLOBAL lock_wait_timeout= 86400;
+SET GLOBAL innodb_lock_wait_timeout= 86400;
+SET STATEMENT sql_log_bin=0 FOR ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+--source include/start_slave.inc
+
+--connection server_1
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), 
(8, 0);
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+
+# Test the following scenario:
+#
+# Transactions T1, T2 in domain 1, U1, U2 in domain 2.
+# Wait cycle T1->U2->U1->T2->T1 as follows:
+#   T1 row lock wait on U2
+#   U2 wait_for_prior_commit on U1
+#   U1 row lock wait on T2
+#   T2 wait_for_prior_commit on T1
+#
+# Test that the wait cycle is broken correctly with deadlock kill.
+
+--connection server_2b
+# Temporarily block T1 and U1.
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+
+--connection server_1
+
+SET SESSION gtid_domain_id= 1;
+# T1 in domain 1
+BEGIN;
+UPDATE t1 SET b=1 WHERE a=1;
+UPDATE t1 SET b=1 WHERE a=7;
+COMMIT;
+# T2 in domain 1
+UPDATE t1 SET b=2 WHERE a=3;
+
+SET SESSION gtid_domain_id=2;
+# U1 in domain 2
+BEGIN;
+UPDATE t1 SET b=3 WHERE a=5;
+UPDATE t1 SET b=3 WHERE a=3;
+COMMIT;
+# U2 in domain 2
+UPDATE t1 SET b=4 WHERE a=7;
+SET SESSION gtid_domain_id= 0;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--source include/start_slave.inc
+# Wait until T2, U2 are holding the row locks.
+--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST 
WHERE state LIKE '%Waiting for prior transaction to commit%'
+--source include/wait_condition.inc
+
+# Then let T1, U1 continue to conflict on the row locks, and check that
+# replication correctly handles the conflict.
+--connection server_2b
+ROLLBACK;
+
+--connection server_2
+--source include/sync_with_master_gtid.inc
+
+# Allow either domain to "win" on the conflicting updates.
+SELECT a, (
+  (a=1 AND b=1) OR
+  (a=3 AND (b=2 OR b=3)) OR
+  (a=5 AND b=3) OR
+  (a=7 AND (b=1 OR b=4)) OR
+  ((a MOD 2)=0 AND b=0)) AS `ok`
+  FROM t1
+ ORDER BY a;
+
+# Now try the same thing with multi-source replication.
+
+# Make server_3 a second master
+--connection server_3
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+
+--connection server_2
+--source include/stop_slave.inc
+--replace_result $SERVER_MYPORT_3 MYPORT_3
+eval CHANGE MASTER 'm2' to master_port=$SERVER_MYPORT_3 , 
master_host='127.0.0.1', master_user='root', master_use_gtid=slave_pos;
+
+--connection server_1
+
+SET SESSION gtid_domain_id= 1;
+# T1 in domain 1
+BEGIN;
+UPDATE t1 SET b=11 WHERE a=1;
+UPDATE t1 SET b=11 WHERE a=7;
+COMMIT;
+# T2 in domain 1
+UPDATE t1 SET b=12 WHERE a=3;
+SET SESSION gtid_domain_id= 1;
+
+--connection server_3
+SET SESSION gtid_domain_id=3;
+# U1 in domain 3
+BEGIN;
+UPDATE t1 SET b=13 WHERE a=5;
+UPDATE t1 SET b=13 WHERE a=3;
+COMMIT;
+# U2 in domain 3
+UPDATE t1 SET b=14 WHERE a=7;
+--source include/save_master_gtid.inc
+
+--connection server_2b
+# Temporarily block T1 and U1.
+BEGIN;
+SELECT * FROM t1 WHERE a=1 FOR UPDATE;
+SELECT * FROM t1 WHERE a=5 FOR UPDATE;
+
+START ALL SLAVES;
+# Wait until T2, U2 are holding the row locks.
+--let $wait_condition= SELECT COUNT(*)=2 FROM INFORMATION_SCHEMA.PROCESSLIST 
WHERE state LIKE '%Waiting for prior transaction to commit%'
+--source include/wait_condition.inc
+
+--connection server_2b
+ROLLBACK;
+
+--connection server_1
+--source include/save_master_gtid.inc
+--connection server_2
+--source include/sync_with_master_gtid.inc
+--connection server_3
+--source include/save_master_gtid.inc
+--connection server_2
+--source include/sync_with_master_gtid.inc
+
+SELECT a, (
+  (a=1 AND b=11) OR
+  (a=3 AND (b=12 OR b=13)) OR
+  (a=5 AND b=13) OR
+  (a=7 AND (b=11 OR b=14)) OR
+  ((a MOD 2)=0 AND b=0)) AS `ok`
+  FROM t1
+ ORDER BY a;
+
+SET default_master_connection = 'm2';
+--source include/stop_slave.inc
+RESET SLAVE 'm2' ALL;
+SET default_master_connection = '';
+
+--connection server_3
+--source include/start_slave.inc
+
+# Cleanup
+
+--disconnect server_2b
+--connection server_1
+DROP TABLE t1;
+--connection server_2
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+set global slave_parallel_mode= @old_parallel_mode;
+SET GLOBAL lock_wait_timeout= @old_timeout;
+SET GLOBAL innodb_lock_wait_timeout= @old_innodb_timeout;
+--source include/start_slave.inc
+
+--source include/rpl_end.inc
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 0d39f359a09..6026e1e13bb 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -5384,14 +5384,40 @@ thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD 
other_thd)
     return 0;
   if (!rgi->is_parallel_exec)
     return 0;
-  if (rgi->rli != other_rgi->rli)
-    return 0;
-  if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
-    return 0;
-  if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id)
-    return 0;
-  if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
-    return 0;
+  if (rgi->rli == other_rgi->rli)
+  {
+    /*
+      Within the same master connection, we can compare transaction order on
+      the GTID sub_id, and rollback the later transaction to allow the earlier
+      transaction to commit first.
+    */
+    if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id)
+      return 0;
+    if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id &&
+        other_rgi->speculation != rpl_group_info::SPECULATE_OPTIMISTIC)
+      return 0;
+    if (rgi->gtid_sub_id > other_rgi->gtid_sub_id)
+      return 0;
+  }
+  else
+  {
+    /*
+      Lock conflicts between different master connection should usually not
+      occur, but could still happen if user is running some special setup that
+      tolerates conflicting updates (or in case of user error). We do not have 
a
+      pre-defined ordering of transactions in this case, but we still need to
+      handle conflicts in _some_ way to avoid undetected deadlocks and hangs.
+
+      We do this by rolling back and retrying any transaction that is being
+      _optimistically_ applied. This can be overly conservative in some cases,
+      but should be fine as conflicts between different master connections are
+      not expected to be common. And it ensures that we won't end up in a
+      deadlock and hang due to a transaction doing wait_for_prior_commit while
+      holding locks that block something in another master connection.
+    */
+    if (other_rgi->speculation != rpl_group_info::SPECULATE_OPTIMISTIC)
+      return 0;
+  }
   if (rgi->finish_event_group_called || other_rgi->finish_event_group_called)
   {
     /*
-- 
2.30.2

_______________________________________________
commits mailing list -- commits@lists.mariadb.org
To unsubscribe send an email to commits-le...@lists.mariadb.org

Reply via email to