On Tue, 2020-05-26 at 17:40 -0700, Jeff Davis wrote:
> On Tue, 2020-05-26 at 21:15 +0200, Tomas Vondra wrote:
> > Yeah. I agree prefetching is definitely out of v13 scope. It might
> > be
> > interesting to try how useful would it be, if you're willing to
> > spend
> > some time on a prototype.
> 
> I think a POC would be pretty quick; I'll see if I can hack something
> together.

Attached (intended for v14).

I changed the list from a simple array to a circular buffer so that we
can keep enough preallocated block numbers in it to do prefetching.

On SSD I didn't see any improvement, but perhaps it will do better on
magnetic storage.

Regards,
        Jeff Davis

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index d7f99d9944c..89edbc4c579 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3941,6 +3941,9 @@ pgstat_get_wait_io(WaitEventIO w)
 		case WAIT_EVENT_BUFFILE_WRITE:
 			event_name = "BufFileWrite";
 			break;
+		case WAIT_EVENT_BUFFILE_PREFETCH:
+			event_name = "BufFilePrefetch";
+			break;
 		case WAIT_EVENT_CONTROL_FILE_READ:
 			event_name = "ControlFileRead";
 			break;
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 35e8f12e62d..4e2d4f059a7 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -618,6 +618,20 @@ BufFileWrite(BufFile *file, void *ptr, size_t size)
 	return nwritten;
 }
 
+/*
+ * BufFilePrefetch
+ */
+void
+BufFilePrefetchBlock(BufFile *buffile, long blknum)
+{
+	int		filenum = (blknum / BUFFILE_SEG_SIZE);
+	off_t	offset	= (blknum % BUFFILE_SEG_SIZE) * BLCKSZ;
+	File	file	= buffile->files[filenum];
+
+	(void) FilePrefetch(file, offset, BLCKSZ,
+						WAIT_EVENT_BUFFILE_PREFETCH);
+}
+
 /*
  * BufFileFlush
  *
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 666a7c0e81c..014ec67b8ff 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -97,6 +97,7 @@ typedef struct TapeBlockTrailer
 								 * block */
 	long		next;			/* next block on this tape, or # of valid
 								 * bytes on last block (if < 0) */
+	long		prefetch;		/* block to prefetch when reading this one */
 } TapeBlockTrailer;
 
 #define TapeBlockPayloadSize  (BLCKSZ - sizeof(TapeBlockTrailer))
@@ -123,6 +124,11 @@ typedef struct TapeBlockTrailer
 #define TAPE_WRITE_PREALLOC_MIN 8
 #define TAPE_WRITE_PREALLOC_MAX 128
 
+#define TAPE_PREFETCH_DISTANCE_MAX 64
+
+StaticAssertDecl(TAPE_PREFETCH_DISTANCE_MAX * 2 <= TAPE_WRITE_PREALLOC_MAX,
+				 "Tape prefetch distance too large.");
+
 /*
  * This data structure represents a single "logical tape" within the set
  * of logical tapes stored in the same file.
@@ -165,13 +171,13 @@ typedef struct LogicalTape
 	int			nbytes;			/* total # of valid bytes in buffer */
 
 	/*
-	 * Preallocated block numbers are held in an array sorted in descending
-	 * order; blocks are consumed from the end of the array (lowest block
-	 * numbers first).
+	 * Preallocated block numbers are held in a sorted circular array.
 	 */
 	long	   *prealloc;
-	int		 	nprealloc;		/* number of elements in list */
-	int			prealloc_size;	/* number of elements list can hold */
+	int		 	prealloc_next;	/* next element in circular array */
+	int			prealloc_nelem;	/* number of elements in circular array */
+	int			prealloc_size;	/* total size of circular array */
+	int			prefetch_dist;	/* distance to prefetch */
 } LogicalTape;
 
 /*
@@ -219,7 +225,8 @@ struct LogicalTapeSet
 static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static long ltsGetFreeBlock(LogicalTapeSet *lts);
-static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
+static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt,
+								long *prefetch_block);
 static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
 static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
 								 SharedFileSet *fileset);
@@ -329,6 +336,10 @@ ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt)
 		else
 			lt->nextBlockNumber = TapeBlockGetTrailer(thisbuf)->next;
 
+		if (TapeBlockGetTrailer(thisbuf)->prefetch >= 0)
+			BufFilePrefetchBlock(lts->pfile,
+								 TapeBlockGetTrailer(thisbuf)->prefetch);
+
 		/* Advance to next block, if we have buffer space left */
 	} while (lt->buffer_size - lt->nbytes > BLCKSZ);
 
@@ -424,38 +435,79 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
  * Refill the preallocation list if necessary.
  */
 static long
-ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt)
+ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt, long *prefetch)
 {
-	/* sorted in descending order, so return the last element */
-	if (lt->nprealloc > 0)
-		return lt->prealloc[--lt->nprealloc];
+	long	block;
+	int		prefetch_idx;
+
+	if (lt->prealloc_nelem > lt->prefetch_dist)
+	{
+		prefetch_idx = (lt->prealloc_next + lt->prefetch_dist) % lt->prealloc_size;
+		*prefetch = lt->prealloc[prefetch_idx];
+		block = lt->prealloc[lt->prealloc_next];
+		lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size;
+		lt->prealloc_nelem--;
+		return block;
+	}
 
+	/* get a preallocated block number, or allocate a new block number */
+	if (lt->prealloc_nelem > 0)
+	{
+		block = lt->prealloc[lt->prealloc_next];
+		lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size;
+		lt->prealloc_nelem--;
+	}
+	else
+		block = ltsGetFreeBlock(lts);
+
+	/* create or grow the circular buffer, as needed */
 	if (lt->prealloc == NULL)
 	{
 		lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN;
+		lt->prefetch_dist = lt->prealloc_size / 2;
 		lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size);
 	}
 	else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX)
 	{
+		int oldsize = lt->prealloc_size;
+		int n;
+
 		/* when the preallocation list runs out, double the size */
 		lt->prealloc_size *= 2;
-		if (lt->prealloc_size > TAPE_WRITE_PREALLOC_MAX)
-			lt->prealloc_size = TAPE_WRITE_PREALLOC_MAX;
-		lt->prealloc = (long *) repalloc(lt->prealloc,
-										 sizeof(long) * lt->prealloc_size);
+		Assert(lt->prealloc_size <= TAPE_WRITE_PREALLOC_MAX);
+
+		/*
+		 * The prefetch distance is always half the prealloc size, until the
+		 * maximum is reached.
+		 */
+		lt->prefetch_dist *= 2;
+		if (lt->prefetch_dist > TAPE_PREFETCH_DISTANCE_MAX)
+			lt->prefetch_dist = TAPE_PREFETCH_DISTANCE_MAX;
+
+		lt->prealloc = (long *) repalloc(
+			lt->prealloc, sizeof(long) * lt->prealloc_size);
+
+		/* now that the buffer is larger, move elements into the new space */
+		n = lt->prealloc_nelem - (oldsize - lt->prealloc_next);
+		if (n > 0)
+			memcpy(lt->prealloc + oldsize, lt->prealloc, sizeof(long) * n);
 	}
 
-	/* refill preallocation list */
-	lt->nprealloc = lt->prealloc_size;
-	for (int i = lt->nprealloc; i > 0; i--)
+	/* preallocate more block numbers */
+	while (lt->prealloc_nelem < lt->prealloc_size)
 	{
-		lt->prealloc[i - 1] = ltsGetFreeBlock(lts);
+		int end;
 
-		/* verify descending order */
-		Assert(i == lt->nprealloc || lt->prealloc[i - 1] > lt->prealloc[i]);
+		end = (lt->prealloc_next + lt->prealloc_nelem) % lt->prealloc_size;
+		lt->prealloc[end] = ltsGetFreeBlock(lts);
+		lt->prealloc_nelem++;
 	}
 
-	return lt->prealloc[--lt->nprealloc];
+	Assert(lt->prealloc_nelem >= lt->prefetch_dist);
+
+	prefetch_idx = (lt->prealloc_next + lt->prefetch_dist) % lt->prealloc_size;
+	*prefetch = lt->prealloc[prefetch_idx];
+	return block;
 }
 
 /*
@@ -619,8 +671,10 @@ ltsInitTape(LogicalTape *lt)
 	lt->pos = 0;
 	lt->nbytes = 0;
 	lt->prealloc = NULL;
-	lt->nprealloc = 0;
+	lt->prealloc_next = 0;
+	lt->prealloc_nelem = 0;
 	lt->prealloc_size = 0;
+	lt->prefetch_dist = 0;
 }
 
 /*
@@ -770,13 +824,16 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 	}
 	if (lt->curBlockNumber == -1)
 	{
+		long prefetch;
+
 		Assert(lt->firstBlockNumber == -1);
 		Assert(lt->pos == 0);
 
-		lt->curBlockNumber = ltsGetPreallocBlock(lts, lt);
+		lt->curBlockNumber = ltsGetPreallocBlock(lts, lt, &prefetch);
 		lt->firstBlockNumber = lt->curBlockNumber;
 
 		TapeBlockGetTrailer(lt->buffer)->prev = -1L;
+		TapeBlockGetTrailer(lt->buffer)->prefetch = prefetch;
 	}
 
 	Assert(lt->buffer_size == BLCKSZ);
@@ -786,6 +843,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 		{
 			/* Buffer full, dump it out */
 			long		nextBlockNumber;
+			long		prefetch;
 
 			if (!lt->dirty)
 			{
@@ -797,7 +855,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 			 * First allocate the next block, so that we can store it in the
 			 * 'next' pointer of this block.
 			 */
-			nextBlockNumber = ltsGetPreallocBlock(lts, lt);
+			nextBlockNumber = ltsGetPreallocBlock(lts, lt, &prefetch);
 
 			/* set the next-pointer and dump the current block. */
 			TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
@@ -805,6 +863,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 
 			/* initialize the prev-pointer of the next block */
 			TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
+			TapeBlockGetTrailer(lt->buffer)->prefetch = prefetch;
 			lt->curBlockNumber = nextBlockNumber;
 			lt->pos = 0;
 			lt->nbytes = 0;
@@ -909,12 +968,20 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
 	/* free the preallocation list, and return unused block numbers */
 	if (lt->prealloc != NULL)
 	{
-		for (int i = lt->nprealloc; i > 0; i--)
-			ltsReleaseBlock(lts, lt->prealloc[i - 1]);
+		while (lt->prealloc_nelem > 0)
+		{
+			long block = lt->prealloc[lt->prealloc_next];
+			lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size;
+			lt->prealloc_nelem--;
+			ltsReleaseBlock(lts, block);
+		}
+
 		pfree(lt->prealloc);
 		lt->prealloc = NULL;
-		lt->nprealloc = 0;
+		lt->prealloc_next = 0;
+		lt->prealloc_nelem = 0;
 		lt->prealloc_size = 0;
+		lt->prefetch_dist = 0;
 	}
 }
 
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index c55dc1481ca..c95fbf67ef9 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -915,6 +915,7 @@ typedef enum
 {
 	WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO,
 	WAIT_EVENT_BUFFILE_WRITE,
+	WAIT_EVENT_BUFFILE_PREFETCH,
 	WAIT_EVENT_CONTROL_FILE_READ,
 	WAIT_EVENT_CONTROL_FILE_SYNC,
 	WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index 60433f35b45..016971ef29d 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -40,6 +40,7 @@ extern BufFile *BufFileCreateTemp(bool interXact);
 extern void BufFileClose(BufFile *file);
 extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
 extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
+extern void BufFilePrefetchBlock(BufFile *file, long blknum);
 extern int	BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
 extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
 extern int	BufFileSeekBlock(BufFile *file, long blknum);

Reply via email to