Hi hackers,

I found a performance problem in logical replication.
It seems to have the same cause as these previously reported problems:

  Creating many tables gets logical replication stuck
  
https://www.postgresql.org/message-id/flat/20f3de7675f83176253f607b5e199b228406c21c.camel%40cybertec.at

  Logical decoding CPU-bound w/ large number of tables
  
https://www.postgresql.org/message-id/flat/CAHoiPjzea6N0zuCi%3D%2Bf9v_j94nfsy6y8SU7-%3Dbp4%3D7qw6_i%3DRg%40mail.gmail.com

# problem

* logical replication is enabled
* the walsender process has a populated RelfilenodeMap cache (2000 relations in this case)
* many tables are TRUNCATEd, DROPped, or CREATEd in the same transaction

Under these conditions the walsender process keeps using 100% of one CPU core.

# environment

PostgreSQL 12.4 (rpm)
CentOS 7.6.1810

# test case

sh logical_replication_truncate.sh  # script attached at the end of this mail

# perf report walsender process

Samples: 25K of event 'cpu-clock:uhH', Event count (approx.): 6315250000
Overhead  Command   Shared Object       Symbol
  87.34%  postgres  postgres            [.] hash_seq_search
   2.80%  postgres  postgres            [.] hash_search_with_hash_value
   1.57%  postgres  postgres            [.] LocalExecuteInvalidationMessage
   1.50%  postgres  postgres            [.] hash_search
   1.31%  postgres  postgres            [.] RelfilenodeMapInvalidateCallback
   0.83%  postgres  postgres            [.] uint32_hash
   0.79%  postgres  pgoutput.so         [.] rel_sync_cache_relation_cb
   0.63%  postgres  postgres            [.] hash_uint32
   0.59%  postgres  postgres            [.] PlanCacheRelCallback
   0.48%  postgres  postgres            [.] CatCacheInvalidate
   0.43%  postgres  postgres            [.] ReorderBufferCommit
   0.36%  postgres  libc-2.17.so        [.] __memcmp_sse4_1
   0.34%  postgres  postgres            [.] deregister_seq_scan
   0.27%  postgres  postgres            [.] hash_seq_init
   0.27%  postgres  postgres            [.] CallSyscacheCallbacks
   0.23%  postgres  postgres            [.] SysCacheInvalidate
   0.10%  postgres  postgres            [.] memcmp@plt
   0.08%  postgres  postgres            [.] RelationCacheInvalidateEntry
   0.05%  postgres  postgres            [.] InvalidateCatalogSnapshot
   0.03%  postgres  postgres            [.] GetCurrentTransactionNestLevel
   0.01%  postgres  pgoutput.so         [.] hash_search@plt
   0.00%  postgres  libpthread-2.17.so  [.] __pread_nocancel

# backtrace walsender process

#0  0x0000000000889cb4 in hash_seq_search
(status=status@entry=0x7ffd5ae38310) at dynahash.c:1442
#1  0x0000000000877df8 in RelfilenodeMapInvalidateCallback
(arg=<optimized out>, relid=17284) at relfilenodemap.c:64
#2  0x000000000086999a in LocalExecuteInvalidationMessage
(msg=0x2f2ef18) at inval.c:595
#3  0x000000000070b81e in ReorderBufferExecuteInvalidations
(rb=<optimized out>, txn=<optimized out>, txn=<optimized out>) at
reorderbuffer.c:2149
#4  ReorderBufferCommit (rb=0x2cbda90, xid=xid@entry=490,
commit_lsn=94490944, end_lsn=<optimized out>,
    commit_time=commit_time@entry=653705313747147,
origin_id=origin_id@entry=0, origin_lsn=origin_lsn@entry=0) at
reorderbuffer.c:1770
#5  0x0000000000701248 in DecodeCommit (xid=490,
parsed=0x7ffd5ae38600, buf=0x7ffd5ae387c0, ctx=0x2c2f1e0) at
decode.c:640
#6  DecodeXactOp (ctx=0x2c2f1e0, buf=buf@entry=0x7ffd5ae387c0) at decode.c:248
#7  0x00000000007015fa in LogicalDecodingProcessRecord (ctx=0x2c2f1e0,
record=0x2c2f458) at decode.c:117
#8  0x0000000000712cd1 in XLogSendLogical () at walsender.c:2848
#9  0x0000000000714e92 in WalSndLoop
(send_data=send_data@entry=0x712c70 <XLogSendLogical>) at
walsender.c:2199
#10 0x0000000000715b51 in StartLogicalReplication (cmd=0x2c8c5c0) at
walsender.c:1129
#11 exec_replication_command (
    cmd_string=cmd_string@entry=0x2c0af60 "START_REPLICATION SLOT
\"subdb_test\" LOGICAL 0/0 (proto_version '1', publication_names
'\"pubdb_test\"')") at walsender.c:1545
#12 0x0000000000760ff1 in PostgresMain (argc=<optimized out>,
argv=argv@entry=0x2c353d0, dbname=0x2c35288 "postgres",
username=<optimized out>)
    at postgres.c:4243
#13 0x0000000000484172 in BackendRun (port=<optimized out>,
port=<optimized out>) at postmaster.c:4448
#14 BackendStartup (port=0x2c2d200) at postmaster.c:4139
#15 ServerLoop () at postmaster.c:1704
#16 0x00000000006f10b3 in PostmasterMain (argc=argc@entry=3,
argv=argv@entry=0x2c05b20) at postmaster.c:1377
#17 0x0000000000485073 in main (argc=3, argv=0x2c05b20) at main.c:228

# probable cause

RelfilenodeMapInvalidateCallback is called a huge number of times.

Every time the CommandId is incremented, all invalidations accumulated
for the transaction are executed again (see the reorderbuffer.c excerpt
below). So when many tables are TRUNCATEd in one transaction,
ReorderBufferExecuteInvalidations replays the invalidation messages for
all of the truncated objects once per truncated object, and each
relcache invalidation message in turn makes
RelfilenodeMapInvalidateCallback scan the whole RelfilenodeMapHash.

The hash_seq_search in RelfilenodeMapInvalidateCallback is where the
time goes, but I have no idea how to solve this problem...
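
For reference, RelfilenodeMapInvalidateCallback walks the entire
RelfilenodeMapHash for every single relcache invalidation message.
A simplified sketch (paraphrased from
src/backend/utils/cache/relfilenodemap.c, the file named in the
backtrace; not an exact copy):

static void
RelfilenodeMapInvalidateCallback(Datum arg, Oid relid)
{
    HASH_SEQ_STATUS status;
    RelfilenodeMapEntry *entry;

    /* one full sequential scan of the hash per invalidation message */
    hash_seq_init(&status, RelfilenodeMapHash);
    while ((entry = hash_seq_search(&status)) != NULL)
    {
        /*
         * Remove everything on a complete reset, otherwise only the
         * entry of the invalidated relation (and negative entries).
         */
        if (relid == InvalidOid ||
            entry->relid == InvalidOid ||
            entry->relid == relid)
            hash_search(RelfilenodeMapHash, (void *) &entry->key,
                        HASH_REMOVE, NULL);
    }
}

With about 2000 entries in the hash, every message costs about 2000
hash_seq_search steps, even though at most one entry is actually removed.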

./src/backend/replication/logical/reorderbuffer.c
1746         case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
1747           Assert(change->data.command_id != InvalidCommandId);
1748
1749           if (command_id < change->data.command_id)
1750           {
1751             command_id = change->data.command_id;
1752
1753             if (!snapshot_now->copied)
1754             {
1755               /* we don't use the global one anymore */
1756               snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
1757                                  txn, command_id);
1758             }
1759
1760             snapshot_now->curcid = command_id;
1761
1762             TeardownHistoricSnapshot(false);
1763             SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
1764
1765             /*
1766              * Every time the CommandId is incremented, we could
1767              * see new catalog contents, so execute all
1768              * invalidations.
1769              */
1770             ReorderBufferExecuteInvalidations(rb, txn);
1771           }
1772
1773           break;

Do you have any solutions?

-- 
Keisuke Kuroda
NTT Software Innovation Center
keisuke.kuroda.3...@gmail.com
#!/bin/bash

PGHOME="/usr/pgsql-12"
CLUSTER_PUB="/home/postgres/relfilenode/data_pub"
CLUSTER_SUB="/home/postgres/relfilenode/data_sub"

${PGHOME}/bin/pg_ctl -D  ${CLUSTER_PUB} stop
${PGHOME}/bin/pg_ctl -D  ${CLUSTER_SUB} stop
rm -rf ${CLUSTER_PUB}
rm -rf ${CLUSTER_SUB}
${PGHOME}/bin/initdb -D ${CLUSTER_PUB} --no-locale -E UTF8 -k

## Configuration
echo "shared_buffers = 1GB"  >> ${CLUSTER_PUB}/postgresql.conf
echo "max_locks_per_transaction = 10000" >> ${CLUSTER_PUB}/postgresql.conf
echo "synchronous_commit = off"  >> ${CLUSTER_PUB}/postgresql.conf
echo "archive_mode = off"  >> ${CLUSTER_PUB}/postgresql.conf
echo "fsync = off" >> ${CLUSTER_PUB}/postgresql.conf
echo "full_page_writes = off"  >> ${CLUSTER_PUB}/postgresql.conf
echo "autovacuum = off" >> ${CLUSTER_PUB}/postgresql.conf
echo "checkpoint_timeout = 120min" >> ${CLUSTER_PUB}/postgresql.conf
echo "max_wal_size = 20GB" >> ${CLUSTER_PUB}/postgresql.conf

echo "wal_level = logical" >> ${CLUSTER_PUB}/postgresql.conf
echo "max_wal_senders = 10" >> ${CLUSTER_PUB}/postgresql.conf
echo "wal_sender_timeout = 0" >> ${CLUSTER_PUB}/postgresql.conf
echo "wal_receiver_timeout = 0" >> ${CLUSTER_PUB}/postgresql.conf

echo "port = 5432" >> ${CLUSTER_PUB}/postgresql.conf

cp -p -r ${CLUSTER_PUB} ${CLUSTER_SUB}
echo "port = 6432" >> ${CLUSTER_SUB}/postgresql.conf

${PGHOME}/bin/pg_ctl -D ${CLUSTER_PUB} start

## create partition
${PGHOME}/bin/psql -p 5432 << "EOF"
DO $$
DECLARE
    i INT;
    j INT;
    schema TEXT;
    tablename TEXT;
    partname TEXT;
BEGIN
    FOR i IN 1 .. 2 LOOP
        schema := 'nsp_' || to_char(i, 'FM000');
        tablename := 'tbl_' || to_char(i, 'FM000');
        EXECUTE 'CREATE SCHEMA ' || schema;
        EXECUTE 'CREATE TABLE ' || schema || '.' || tablename || ' (
                big01 BIGINT,
                big02 BIGINT,
                big03 BIGINT,
                big04 BIGINT,
                big05 BIGINT,
                big06 BIGINT,
                big07 BIGINT,
                big08 BIGINT,
                big09 BIGINT,
                big10 BIGINT,
                big11 BIGINT,
                big12 BIGINT,
                byt01 BYTEA,
                byt02 BYTEA
        ) PARTITION BY RANGE (big02)';
        FOR j IN 1 .. 1000 LOOP
            partname := 'part_' || to_char(j, 'FM0000');
            EXECUTE 'CREATE TABLE ' || schema || '.' || partname ||
                    ' PARTITION OF ' || schema || '.' || tablename ||
                    ' FOR VALUES FROM (' || (j-1)*10 || ') TO (' || (j)*10 || ')';
            EXECUTE 'ALTER TABLE ' || schema || '.' || partname || ' ADD PRIMARY KEY (big01)';
        END LOOP;
        EXECUTE 'CREATE INDEX ON ' || schema || '.' || tablename || ' (big03)';
    END LOOP;
END;
$$ LANGUAGE plpgsql;
EOF

${PGHOME}/bin/psql -p 5432 << "EOF"
CREATE TABLE test(id int primary key);
CREATE PUBLICATION pubdb_test FOR TABLE public.test;
EOF

${PGHOME}/bin/pg_ctl -D ${CLUSTER_SUB} start

${PGHOME}/bin/psql -p 6432 << "EOF"
CREATE TABLE test(id int primary key);
CREATE SUBSCRIPTION subdb_test CONNECTION 'host=localhost dbname=postgres' PUBLICATION pubdb_test WITH (copy_data = false);
EOF

# insert data (populates RelfilenodeMapHash in the walsender)
${PGHOME}/bin/psql -p 5432 << "EOF"
DO $$
DECLARE
    i INT;
    j INT;
    schema TEXT;
    tablename TEXT;
BEGIN
    FOR i IN 1 .. 2 LOOP
        schema := 'nsp_' || to_char(i, 'FM000');
        tablename := 'tbl_' || to_char(i, 'FM000');
        EXECUTE 'INSERT INTO ' || schema || '.' || tablename || '
         SELECT i,i,i,0,0,0,0,0,0,0,0,0,''a''::bytea,''a''::bytea FROM generate_series(0,9999,1) AS i';
    END LOOP;
END;
$$ LANGUAGE plpgsql;
EOF

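## TRUNCATE one partitioned table: this truncates its 1000 partitions in a
## single transaction and makes the walsender spin in hash_seq_search
## (see the perf report above)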
${PGHOME}/bin/psql -p 5432 << "EOF"
TRUNCATE TABLE nsp_001.tbl_001;
EOF

echo "---walsender process info---"

ps -aux | grep walsender | grep -v grep
