On 12/09/2013 11:56 PM, Claudio Freire wrote:
On Mon, Dec 9, 2013 at 6:47 PM, Heikki Linnakangas
<hlinnakan...@vmware.com> wrote:
On 12/09/2013 11:35 PM, Jim Nasby wrote:

On 12/8/13 1:49 PM, Heikki Linnakangas wrote:

On 12/08/2013 08:14 PM, Greg Stark wrote:

The whole accounts table is 1.2GB and contains 10 million rows. As
expected, with rows_per_block set to 1 it reads 240MB of that,
containing nearly 2 million rows (and takes nearly 20s -- a full
table scan for select count(*) only takes about 5s):

One simple thing we could do, either instead of or in addition to
changing the algorithm, is to issue posix_fadvise() calls for the
blocks we're going to read. It should at least be possible to match
the speed of a plain sequential scan that way.
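
For illustration only, here is a minimal sketch (not backend code, and
not part of any patch) of what issuing that advice could look like,
assuming a plain file descriptor for the relation segment, the default
8 kB block size, and an ascending list of sampled block numbers:

#define _XOPEN_SOURCE 600	/* for posix_fadvise() */
#include <fcntl.h>

#define BLCKSZ 8192			/* PostgreSQL default block size */

/*
 * Sketch: tell the kernel which sampled blocks we are about to read,
 * so it can start fetching them in the background.  "fd" is an open
 * descriptor for the relation segment, "blocks" the ascending list of
 * sampled block numbers.
 */
static void
advise_sample_blocks(int fd, const unsigned int *blocks, int nblocks)
{
	int			i;

	for (i = 0; i < nblocks; i++)
		(void) posix_fadvise(fd,
							 (off_t) blocks[i] * BLCKSZ,
							 BLCKSZ,
							 POSIX_FADV_WILLNEED);
}

In the backend this would go through the buffer manager's
PrefetchBuffer() wrapper rather than calling posix_fadvise() directly.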

Hrm... maybe it wouldn't be very hard to use async IO here either? I'm
thinking the stage 2 work could be done in the callback routine...

Yeah, other than the fact we have no infrastructure to do asynchronous I/O
anywhere in the backend. If we had that, then we could easily use it here. I
doubt it would be much better than posix_fadvising the blocks, though.

Without patches to the kernel, it is much better.

posix_fadvise interferes with read-ahead, so with posix_fadvise,
bitmap heap scans (or similarly sorted analyze block samples) run at
1 IO per block, i.e. horribly, whereas aio lets the kernel coalesce
reads and do read-ahead when it thinks that will be profitable,
significantly increasing IOPS. I've seen everything from a 2x to a
10x difference.
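
As a purely hypothetical sketch of the aio side (assuming Linux-native
AIO via libaio, which the backend does not use), batching the sampled
blocks into a single submission is what lets the kernel see all
outstanding requests at once and merge adjacent blocks into larger
reads; note that on buffered files Linux kernel AIO largely degrades
to synchronous reads, so a real implementation would need O_DIRECT and
aligned buffers:

#include <libaio.h>		/* io_prep_pread(), io_submit(); link with -laio */
#include <errno.h>
#include <stdlib.h>

#define BLCKSZ 8192		/* PostgreSQL default block size */

/*
 * Sketch: queue reads for all sampled blocks in one io_submit() call.
 * "ctx" must already have been initialized with io_setup(), "bufs"
 * points to nblocks buffers of BLCKSZ bytes each, and "cbs" must stay
 * alive until io_getevents() has reported the completions.
 */
static int
submit_sample_reads(io_context_t ctx, int fd,
					const unsigned int *blocks, int nblocks,
					void **bufs, struct iocb *cbs)
{
	struct iocb **ptrs = malloc(nblocks * sizeof(struct iocb *));
	int			i;
	int			rc;

	if (ptrs == NULL)
		return -ENOMEM;

	for (i = 0; i < nblocks; i++)
	{
		io_prep_pread(&cbs[i], fd, bufs[i], BLCKSZ,
					  (long long) blocks[i] * BLCKSZ);
		ptrs[i] = &cbs[i];
	}

	rc = io_submit(ctx, nblocks, ptrs);	/* one syscall for the whole batch */
	free(ptrs);
	return rc;		/* number of requests accepted, or -errno */
}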

How did you test that, given that we don't actually have an asynchronous I/O implementation? I don't recall any recent patches floating around to do that, either. When Greg Stark investigated this back in 2007-2008 and implemented posix_fadvise() for bitmap heap scans, posix_fadvise certainly gave a significant speedup on the test data he used. What kind of data distribution gives a slowdown like that?

I took a stab at using posix_fadvise() in ANALYZE. It turned out to be very easy; patch attached. Your mileage may vary, but I'm seeing a nice gain from this on my laptop. Taking a 30000-page sample of a table with 717717 pages (i.e. slightly larger than RAM), ANALYZE takes about 6 seconds without the patch, and less than a second with the patch, with effective_io_concurrency=10. If anyone with a good test data set loaded would like to test this and post some numbers, that would be great.

- Heikki
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 9845b0b..410d487 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -58,10 +58,18 @@
 /* Data structure for Algorithm S from Knuth 3.4.2 */
 typedef struct
 {
+	Relation	rel;
+	ForkNumber	forknum;
+
 	BlockNumber N;				/* number of blocks, known in advance */
 	int			n;				/* desired sample size */
 	BlockNumber t;				/* current block number */
 	int			m;				/* blocks selected so far */
+
+	int			prefetch_target;
+	int			start;
+	int			end;
+	BlockNumber *prefetched;
 } BlockSamplerData;
 
 typedef BlockSamplerData *BlockSampler;
@@ -87,10 +95,11 @@ static BufferAccessStrategy vac_strategy;
 static void do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
 			   AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
 			   bool inh, int elevel);
-static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
+static void BlockSampler_Init(BlockSampler bs, Relation rel, ForkNumber forknum, BlockNumber nblocks,
 				  int samplesize);
 static bool BlockSampler_HasMore(BlockSampler bs);
 static BlockNumber BlockSampler_Next(BlockSampler bs);
+static BlockNumber BlockSampler_Next_internal(BlockSampler bs);
 static void compute_index_stats(Relation onerel, double totalrows,
 					AnlIndexData *indexdata, int nindexes,
 					HeapTuple *rows, int numrows,
@@ -954,8 +963,12 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
  * algorithm.
  */
 static void
-BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
+BlockSampler_Init(BlockSampler bs, Relation rel, ForkNumber forknum,
+				  BlockNumber nblocks, int samplesize)
 {
+	bs->rel = rel;
+	bs->forknum = forknum;
+
 	bs->N = nblocks;			/* measured table size */
 
 	/*
@@ -965,17 +978,61 @@ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
 	bs->n = samplesize;
 	bs->t = 0;					/* blocks scanned so far */
 	bs->m = 0;					/* blocks selected so far */
+
+	bs->prefetch_target = target_prefetch_pages;
+	if (target_prefetch_pages > 0)
+		bs->prefetched = palloc((bs->prefetch_target + 1) * sizeof(BlockNumber));
+	else
+		bs->prefetched = NULL;
+	bs->start = bs->end = 0;
 }
 
 static bool
 BlockSampler_HasMore(BlockSampler bs)
 {
+	if (bs->end != bs->start)
+		return true;
 	return (bs->t < bs->N) && (bs->m < bs->n);
 }
 
+static void
+BlockSampler_Prefetch(BlockSampler bs)
+{
+	int		next;
+
+	next = (bs->end + 1) % (bs->prefetch_target + 1);
+	while (next != bs->start && (bs->t < bs->N) && (bs->m < bs->n))
+	{
+		BlockNumber blk = BlockSampler_Next_internal(bs);
+
+		PrefetchBuffer(bs->rel, bs->forknum, blk);
+
+		bs->prefetched[bs->end] = blk;
+		bs->end = next;
+		next = (bs->end + 1) % (bs->prefetch_target + 1);
+	}
+}
+
 static BlockNumber
 BlockSampler_Next(BlockSampler bs)
 {
+	BlockNumber result;
+
+	if (bs->prefetch_target == 0)
+		return BlockSampler_Next_internal(bs);
+
+	BlockSampler_Prefetch(bs);
+
+	Assert(bs->end != bs->start);
+
+	result = bs->prefetched[bs->start];
+	bs->start = (bs->start + 1) % (bs->prefetch_target + 1);
+	return result;
+}
+
+static BlockNumber
+BlockSampler_Next_internal(BlockSampler bs)
+{
 	BlockNumber K = bs->N - bs->t;		/* remaining blocks */
 	int			k = bs->n - bs->m;		/* blocks still to sample */
 	double		p;				/* probability to skip block */
@@ -1084,7 +1141,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 	OldestXmin = GetOldestXmin(onerel->rd_rel->relisshared, true);
 
 	/* Prepare for sampling block numbers */
-	BlockSampler_Init(&bs, totalblocks, targrows);
+	BlockSampler_Init(&bs, onerel, MAIN_FORKNUM, totalblocks, targrows);
 	/* Prepare for sampling rows */
 	rstate = anl_init_selection_state(targrows);
 
