On Thu, 2020-05-21 at 21:13 +0200, Tomas Vondra wrote:
> 2) We could make it self-tuning, by increasing the number of blocks
> we pre-allocate. So every time we exhaust the range, we double the
> number of blocks (with a reasonable maximum, like 1024 or so). Or we
> might just increment it by 32, or something.

Attached a new version that uses the doubling behavior, and cleans it
up a bit. It also returns the unused prealloc blocks back to lts-
>freeBlocks when the tape is rewound for reading.

> IIUC the danger of pre-allocating blocks is that we might not fill
> them,
> resulting in temp file much larger than necessary. It might be
> harmless
> on some (most?) current filesystems that don't actually allocate
> space
> for blocks that are never written, but it also confuses our
> accounting
> of temporary file sizes. So we should try to limit that, and growing
> the
> number of pre-allocated blocks over time seems reasonable.

There's another danger here: it doesn't matter how well the filesystem
deals with sparse writes, because ltsWriteBlock fills in the holes with
zeros anyway. That's potentially a significant amount of wasted IO
effort if we aren't careful.

Regards,
        Jeff Davis

diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index bc8d56807e2..84f9d6d2358 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -110,6 +110,8 @@ typedef struct TapeBlockTrailer
 #define TapeBlockSetNBytes(buf, nbytes) \
 	(TapeBlockGetTrailer(buf)->next = -(nbytes))
 
+#define TAPE_WRITE_PREALLOC_MIN 8
+#define TAPE_WRITE_PREALLOC_MAX 128
 
 /*
  * This data structure represents a single "logical tape" within the set
@@ -151,6 +153,21 @@ typedef struct LogicalTape
 	int			max_size;		/* highest useful, safe buffer_size */
 	int			pos;			/* next read/write position in buffer */
 	int			nbytes;			/* total # of valid bytes in buffer */
+
+	/*
+	 * When multiple tapes are being written to concurrently (as in HashAgg),
+	 * avoid excessive fragmentation by preallocating block numbers to
+	 * individual tapes. The preallocated block numbers are held in an array
+	 * sorted in descending order; blocks are consumed from the end of the
+	 * array (lowest block numbers first).
+	 *
+	 * No filesystem operations are performed for preallocation; only the
+	 * block numbers are reserved. This may lead to sparse writes, which will
+	 * cause ltsWriteBlock() to fill in holes with zeros.
+	 */
+	long	   *prealloc;
+	int		 	nprealloc; /* number of elements in list */
+	int			prealloc_size; /* number of elements list can hold */
 } LogicalTape;
 
 /*
@@ -198,6 +215,7 @@ struct LogicalTapeSet
 static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
 static long ltsGetFreeBlock(LogicalTapeSet *lts);
+static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
 static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
 static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
 								 SharedFileSet *fileset);
@@ -397,6 +415,51 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
 	return blocknum;
 }
 
+/*
+ * Return the lowest free block number from the tape's preallocation list.
+ * Refill the preallocation list if necessary.
+ */
+static long
+ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt)
+{
+	/* sorted in descending order, so return the last element */
+	if (lt->nprealloc > 0)
+		return lt->prealloc[--lt->nprealloc];
+
+	if (lt->prealloc == NULL)
+	{
+		lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN;
+		lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size);
+	}
+	else
+	{
+		/* when the preallocation list runs out, double the size */
+		if (lt->prealloc_size * 2 <= TAPE_WRITE_PREALLOC_MAX)
+			lt->prealloc_size *= 2;
+		lt->prealloc = (long *) repalloc(lt->prealloc,
+										 sizeof(long) * lt->prealloc_size);
+	}
+
+	/* refill preallocation list */
+	lt->nprealloc = lt->prealloc_size;
+	for (int i = lt->nprealloc; i > 0; i--)
+		lt->prealloc[i - 1] = ltsGetFreeBlock(lts);
+
+#ifdef USE_ASSERT_CHECKING
+	{
+		/* verify that list is in reverse sorted order */
+		long last_block_number = -1;
+		for (int i = lt->nprealloc; i > 0; i--)
+		{
+			Assert(lt->prealloc[i - 1] > last_block_number);
+			last_block_number = lt->prealloc[i - 1];
+		}
+	}
+#endif
+
+	return lt->prealloc[--lt->nprealloc];
+}
+
 /*
  * Return a block# to the freelist.
  */
@@ -557,6 +620,9 @@ ltsInitTape(LogicalTape *lt)
 	lt->max_size = MaxAllocSize;
 	lt->pos = 0;
 	lt->nbytes = 0;
+	lt->prealloc = NULL;
+	lt->nprealloc = 0;
+	lt->prealloc_size = 0;
 }
 
 /*
@@ -709,7 +775,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 		Assert(lt->firstBlockNumber == -1);
 		Assert(lt->pos == 0);
 
-		lt->curBlockNumber = ltsGetFreeBlock(lts);
+		lt->curBlockNumber = ltsGetPreallocBlock(lts, lt);
 		lt->firstBlockNumber = lt->curBlockNumber;
 
 		TapeBlockGetTrailer(lt->buffer)->prev = -1L;
@@ -733,7 +799,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
 			 * First allocate the next block, so that we can store it in the
 			 * 'next' pointer of this block.
 			 */
-			nextBlockNumber = ltsGetFreeBlock(lts);
+			nextBlockNumber = ltsGetPreallocBlock(lts, lt);
 
 			/* set the next-pointer and dump the current block. */
 			TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
@@ -835,13 +901,23 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
 		Assert(lt->frozen);
 	}
 
-	/* Allocate a read buffer (unless the tape is empty) */
 	if (lt->buffer)
 		pfree(lt->buffer);
 
 	/* the buffer is lazily allocated, but set the size here */
 	lt->buffer = NULL;
 	lt->buffer_size = buffer_size;
+
+	/* free the preallocation list, and return unused block numbers */
+	if (lt->prealloc != NULL)
+	{
+		for (int i = lt->nprealloc; i > 0; i--)
+			ltsReleaseBlock(lts, lt->prealloc[i - 1]);
+		pfree(lt->prealloc);
+		lt->prealloc = NULL;
+		lt->nprealloc = 0;
+		lt->prealloc_size = 0;
+	}
 }
 
 /*

Reply via email to