Hi,

Here's a reproducer which enabled me to reach this stuck state:

  pid  |  wait_event   |                                    query
-------+---------------+------------------------------------------------------------------------------
 64617 |               | select pid, wait_event, query from pg_stat_activity where state = 'active';
 64619 | BufferPin     | VACUUM jobs
 64620 | ExecuteGather | SELECT COUNT(*) FROM jobs
 64621 | ExecuteGather | SELECT COUNT(*) FROM jobs
 64622 | ExecuteGather | SELECT COUNT(*) FROM jobs
 64623 | ExecuteGather | SELECT COUNT(*) FROM jobs
 84167 | BtreePage     | SELECT COUNT(*) FROM jobs
 84168 | BtreePage     | SELECT COUNT(*) FROM jobs
 96440 |               | SELECT COUNT(*) FROM jobs
 96438 |               | SELECT COUNT(*) FROM jobs
 96439 |               | SELECT COUNT(*) FROM jobs
(11 rows)

The main thread deletes rows in the middle of the key range (not sure
if this is important) and vacuums in a loop, while 4 threads (probably
not important, might as well be 1) run Parallel Index Only Scans over
the whole range, in the hope of hitting the interesting case.  In the
locked-up case I just saw, opaque->btpo_flags had the BTP_DELETED bit
set, not BTP_HALF_DEAD (I could tell because I added logging).
Clearly pages are periodically being marked half-dead, but I haven't
yet managed to get an index scan to hit one of those.
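
In case it's useful, logging along these lines (a sketch only: the
exact placement in the nbtree code and the locals "page" and "buf"
are illustrative, though the P_ISDELETED/P_ISHALFDEAD macros are the
real ones from nbtree.h) is enough to see which flag the scan trips
over:

    /* at the point where the scan lands on a btree page */
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    if (P_ISDELETED(opaque) || P_ISHALFDEAD(opaque))
        elog(LOG, "scan arrived at %s btree page %u (btpo_flags = 0x%x)",
             P_ISDELETED(opaque) ? "deleted" : "half-dead",
             BufferGetBlockNumber(buf),
             opaque->btpo_flags);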

-- 
Thomas Munro
http://www.enterprisedb.com
/*
   Set up the schema like this:

    drop table jobs;
    create table jobs (id int, blah text, status text);
    create index on jobs(status);
    insert into jobs select generate_series(1, 100000), 'hello world';
    update jobs set status = 'A' where id % 100 = 0;
    analyze jobs;
    alter table jobs set (parallel_workers = 2);

  You should get a Parallel Index Only Scan like this:

    set enable_seqscan = off;
    set max_parallel_workers_per_gather = 2;
    set parallel_leader_participation = off;
    set min_parallel_index_scan_size = 0;
    set parallel_setup_cost = 0;
    set parallel_tuple_cost = 0;
    EXPLAIN SELECT COUNT(*) FROM jobs;
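
  With costs omitted, the plan shape should be roughly like this
  (index name assumed to be the auto-generated jobs_status_idx):

    Finalize Aggregate
      ->  Gather
            Workers Planned: 2
            ->  Partial Aggregate
                  ->  Parallel Index Only Scan using jobs_status_idx on jobs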

 */

#include <libpq-fe.h>

#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define CONNINFO "dbname=postgres"
#define NTHREADS 4

static void *
thread_main(void *data)
{
	int			i;
	PGconn	   *conn;
	PGresult   *res;

	conn = PQconnectdb(CONNINFO);
	assert(PQstatus(conn) == CONNECTION_OK);

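	/*
	 * Planner settings matching the SQL comment above: force a Parallel
	 * Index Only Scan with two workers and no leader participation.
	 */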
	res = PQexec(conn, "SET enable_seqscan = off");
	assert(PQresultStatus(res) == PGRES_COMMAND_OK);
	PQclear(res);
	res = PQexec(conn, "SET max_parallel_workers_per_gather = 2");
	assert(PQresultStatus(res) == PGRES_COMMAND_OK);
	PQclear(res);
	res = PQexec(conn, "SET parallel_leader_participation = off");
	assert(PQresultStatus(res) == PGRES_COMMAND_OK);
	PQclear(res);
	res = PQexec(conn, "SET min_parallel_index_scan_size = 0");
	assert(PQresultStatus(res) == PGRES_COMMAND_OK);
	PQclear(res);
	res = PQexec(conn, "SET parallel_setup_cost = 0");
	assert(PQresultStatus(res) == PGRES_COMMAND_OK);
	PQclear(res);
	res = PQexec(conn, "SET parallel_tuple_cost = 0");
	assert(PQresultStatus(res) == PGRES_COMMAND_OK);
	PQclear(res);

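	/*
	 * Run parallel index-only scans repeatedly, hoping one lands on a
	 * page that VACUUM concurrently marks half-dead or deleted.
	 */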
	for (i = 0; i < 1000000; ++i)
	{
		//res = PQexec(conn, "SELECT COUNT(*) FROM jobs WHERE status = 'B'");
		res = PQexec(conn, "SELECT COUNT(*) FROM jobs");

		assert(PQresultStatus(res) == PGRES_TUPLES_OK);
		assert(PQnfields(res) == 1);
		assert(PQntuples(res) == 1);
		PQclear(res);
	}
	PQfinish(conn);
	return 0;
}

int
main(int argc, char **argv)
{
	pthread_t threads[NTHREADS];
	pthread_attr_t attr;
	int i;
	void *r;
	PGconn	   *conn;
	PGresult   *res;
	conn = PQconnectdb(CONNINFO);
	assert(PQstatus(conn) == CONNECTION_OK);

	pthread_attr_init(&attr);

	for (i = 0; i < NTHREADS; ++i)
	{
		if (pthread_create(&threads[i], &attr, &thread_main, NULL) != 0)
		{
			fprintf(stderr, "failed to create thread\n");
			exit(1);
		}
	}

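	/*
	 * Churn the middle of the key range: delete one row, insert a
	 * replacement 50000 ids higher, then VACUUM so that emptied index
	 * pages can be marked half-dead and eventually deleted.
	 */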
	for (i = 50000; i < 100000; ++i)
	{
		char buf[1024];

		printf("updating %d\n", i);

		res = PQexec(conn, "BEGIN");
		assert(PQresultStatus(res) == PGRES_COMMAND_OK);
		PQclear(res);

		snprintf(buf, sizeof(buf), "DELETE FROM jobs WHERE id = %d", i);
		res = PQexec(conn, buf);
		assert(PQresultStatus(res) == PGRES_COMMAND_OK);
		PQclear(res);

		snprintf(buf, sizeof(buf),
				 "INSERT INTO jobs VALUES (%d, 'xxx', %s)",
				 i + 50000,
				 (i % 100 == 0) ? "'A'" :
				 (i % 100 == 1) ? "'B'" :
				 (i % 100 == 2) ? "'C'" : "NULL");
		res = PQexec(conn, buf);
		assert(PQresultStatus(res) == PGRES_COMMAND_OK);
		PQclear(res);

		res = PQexec(conn, "COMMIT");
		assert(PQresultStatus(res) == PGRES_COMMAND_OK);
		PQclear(res);

		res = PQexec(conn, "VACUUM jobs");
		assert(PQresultStatus(res) == PGRES_COMMAND_OK);
		PQclear(res);
	}
	PQfinish(conn);

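	/* The scan threads run far longer than the loop above; cancel them. */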
	for (i = 0; i < NTHREADS; ++i)
		pthread_cancel(threads[i]);

	for (i = 0; i < NTHREADS; ++i)
		pthread_join(threads[i], &r);

	return 0;
}
