Hi, Problem was on disk io starvation, storage migration to more iops resolve the problem. Regards BS
pon., 17 lip 2023 o 07:40 Bogdan Siara <bogdan.si...@gmail.com> napisaĆ(a): > Hi, I have a problem with my slave postgresql instance. During normal > operations databaseis locked and in syslog I see stacktrace: > > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.289110] INFO: task > postgres:1172 blocked for more than 120 seconds. > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.292137] Not > tainted 5.19.0-1028-aws #29~22.04.1-Ubuntu > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.295239] "echo 0 > > /proc/sys/kernel/hung_task_timeout_secs" disables this message. > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299360] task:postgres > state:D stack: 0 pid: 1172 ppid: 1133 flags:0x00000004 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299366] Call Trace: > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299369] <TASK> > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299373] > __schedule+0x254/0x5a0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299381] > schedule+0x5d/0x100 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299385] > io_schedule+0x46/0x80 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299389] > blk_mq_get_tag+0x117/0x300 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299394] ? > destroy_sched_domains_rcu+0x40/0x40 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299399] > __blk_mq_alloc_requests+0xc4/0x1e0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299404] > blk_mq_get_new_requests+0xcc/0x190 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299408] > blk_mq_submit_bio+0x1eb/0x450 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299411] > __submit_bio+0xf6/0x190 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299414] ? > kmem_cache_alloc+0x1a6/0x2f0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299419] > __submit_bio_noacct+0x81/0x1f0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299422] > submit_bio_noacct_nocheck+0x104/0x1c0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299425] ? > ext4_inode_block_valid+0x1d/0x40 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299430] > submit_bio_noacct+0x1b9/0x600 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299433] > submit_bio+0x40/0xf0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299437] > ext4_mpage_readpages+0x254/0xc20 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299441] ? > xas_load+0x1f/0x100 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299447] ? > __filemap_add_folio+0x1ca/0x540 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299452] > ext4_readahead+0x3f/0x50 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299455] > read_pages+0x7b/0x2e0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299459] > page_cache_ra_unbounded+0x12d/0x180 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299462] > force_page_cache_ra+0xc5/0x100 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299466] > generic_fadvise+0x19d/0x280 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299469] > ksys_fadvise64_64+0x9f/0xb0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299471] ? > syscall_trace_enter.constprop.0+0xb5/0x1a0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299475] > __x64_sys_fadvise64+0x1c/0x30 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299478] > do_syscall_64+0x5c/0x90 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299482] ? > exc_page_fault+0x92/0x190 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299485] > entry_SYSCALL_64_after_hwframe+0x63/0xcd > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299489] RIP: > 0033:0x7f7358c993c2 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299493] RSP: > 002b:00007ffd448a8a88 EFLAGS: 00000206 ORIG_RAX: 00000000000000dd > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299498] RAX: > ffffffffffffffda RBX: 0000000000000383 RCX: 00007f7358c993c2 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299501] RDX: > 0000000000002000 RSI: 00000000099a0000 RDI: 0000000000000389 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299503] RBP: > 0000000000002000 R08: 0000000000000002 R09: 0000000000000000 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299506] R10: > 0000000000000003 R11: 0000000000000206 R12: 00000000099a0000 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299508] R13: > 000000000a000010 R14: 0000000001504cd0 R15: 00007ffd448a8ad0 > Jul 16 22:34:12 ec1-prd-postgresan-2 kernel: [ 1331.299514] </TASK> > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.119882] INFO: task > postgres:1172 blocked for more than 241 seconds. > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.122832] Not > tainted 5.19.0-1028-aws #29~22.04.1-Ubuntu > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.128443] "echo 0 > > /proc/sys/kernel/hung_task_timeout_secs" disables this message. > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132098] task:postgres > state:D stack: 0 pid: 1172 ppid: 1133 flags:0x00000004 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132106] Call Trace: > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132109] <TASK> > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132113] > __schedule+0x254/0x5a0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132121] > schedule+0x5d/0x100 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132125] > io_schedule+0x46/0x80 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132129] > blk_mq_get_tag+0x117/0x300 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132134] ? > destroy_sched_domains_rcu+0x40/0x40 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132143] > __blk_mq_alloc_requests+0xc4/0x1e0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132148] > blk_mq_get_new_requests+0xcc/0x190 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132153] > blk_mq_submit_bio+0x1eb/0x450 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132156] > __submit_bio+0xf6/0x190 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132159] ? > kmem_cache_alloc+0x1a6/0x2f0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132163] > __submit_bio_noacct+0x81/0x1f0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132166] > submit_bio_noacct_nocheck+0x104/0x1c0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132169] ? > ext4_inode_block_valid+0x1d/0x40 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132174] > submit_bio_noacct+0x1b9/0x600 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132176] > submit_bio+0x40/0xf0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132179] > ext4_mpage_readpages+0x254/0xc20 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132183] ? > xas_load+0x1f/0x100 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132188] ? > __filemap_add_folio+0x1ca/0x540 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132193] > ext4_readahead+0x3f/0x50 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132198] > read_pages+0x7b/0x2e0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132203] > page_cache_ra_unbounded+0x12d/0x180 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132209] > force_page_cache_ra+0xc5/0x100 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132213] > generic_fadvise+0x19d/0x280 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132217] > ksys_fadvise64_64+0x9f/0xb0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132220] ? > syscall_trace_enter.constprop.0+0xb5/0x1a0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132224] > __x64_sys_fadvise64+0x1c/0x30 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132227] > do_syscall_64+0x5c/0x90 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132232] ? > exc_page_fault+0x92/0x190 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132237] > entry_SYSCALL_64_after_hwframe+0x63/0xcd > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132241] RIP: > 0033:0x7f7358c993c2 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132245] RSP: > 002b:00007ffd448a8a88 EFLAGS: 00000206 ORIG_RAX: 00000000000000dd > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132251] RAX: > ffffffffffffffda RBX: 0000000000000383 RCX: 00007f7358c993c2 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132254] RDX: > 0000000000002000 RSI: 00000000099a0000 RDI: 0000000000000389 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132256] RBP: > 0000000000002000 R08: 0000000000000002 R09: 0000000000000000 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132258] R10: > 0000000000000003 R11: 0000000000000206 R12: 00000000099a0000 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132261] R13: > 000000000a000010 R14: 0000000001504cd0 R15: 00007ffd448a8ad0 > Jul 16 22:36:13 ec1-prd-postgresan-2 kernel: [ 1452.132267] </TASK> > > My postgres 15.3 is compiled as docker image in alpine 3.17 with flags: > ./configure --prefix=${PG_DIR} --exec-prefix=${PG_DIR} > --enable-integer-datetimes --enable-thread-safety --disable-rpath > --with-uuid=e2fs --with-gnu-ld --with-pgport=5432 > --with-system-tzdata=/usr/share/zoneinfo --with-llvm --with-gssapi > --with-ldap --with-icu --with-tcl --with-perl --with-python --with-pam > --with-openssl --with-libxml --with-libxslt > --with-includes=${PG_DIR}/include --with-libs=${PG_DIR}/lib > > this is my postgresql.conf: > listen_addresses = '*' > port = 5432 > max_connections = 1000 > unix_socket_directories = '/opt/pg/data' > superuser_reserved_connections = 3 > shared_buffers = 7GB > temp_buffers = 32MB > max_prepared_transactions = 100 > work_mem = 1146kB > maintenance_work_mem = 1792MB > max_stack_depth = 4MB > dynamic_shared_memory_type = posix > hot_standby = on > hot_standby_feedback = on > primary_conninfo='host=192.168.1.16 port=5432 user=user' > primary_slot_name='slot1' > promote_trigger_file='/opt/pg/data/MasterNow' > effective_cache_size = 21GB > log_destination = 'stderr' > logging_collector = on > log_directory = 'log' > log_filename = 'postgresql-%a.log' > log_truncate_on_rotation = on > log_rotation_age = 1440 > log_rotation_size = 0 > log_min_duration_statement = 60000 > log_checkpoints = on > log_autovacuum_min_duration = 3000 > log_line_prefix = '%m [%p] - [%a - %u - %d] [%h] : %e' > stats_fetch_consistency = 'cache' > datestyle = 'iso, dmy' > default_text_search_config = 'pg_catalog.simple' > jit = on > jit_above_cost = 100000 > jit_debugging_support = off > jit_dump_bitcode = off > jit_expressions = on > jit_inline_above_cost = 500000 > jit_optimize_above_cost = 500000 > jit_profiling_support = off > jit_provider = llvmjit > jit_tuple_deforming = on > max_worker_processes = 8 > max_parallel_workers_per_gather = 4 > max_parallel_workers = 4 > max_parallel_maintenance_workers = 2 > default_statistics_target = 100 > synchronous_commit = off > random_page_cost = 1.1 > effective_io_concurrency = 200 > shared_preload_libraries = 'auto_explain,pg_stat_statements' > # Increase the max size of the query strings Postgres records > track_activity_query_size = 10000 > # Track statements generated by stored procedures as well > pg_stat_statements.track = all > > Pleas give ma a information where is the problem? > > Regards > Bogdan >