On Thu, 2020-05-21 at 21:13 +0200, Tomas Vondra wrote: > 2) We could make it self-tuning, by increasing the number of blocks > we pre-allocate. So every time we exhaust the range, we double the > number of blocks (with a reasonable maximum, like 1024 or so). Or we > might just increment it by 32, or something.
Attached a new version that uses the doubling behavior, and cleans it up a bit. It also returns the unused prealloc blocks back to lts- >freeBlocks when the tape is rewound for reading. > IIUC the danger of pre-allocating blocks is that we might not fill > them, > resulting in temp file much larger than necessary. It might be > harmless > on some (most?) current filesystems that don't actually allocate > space > for blocks that are never written, but it also confuses our > accounting > of temporary file sizes. So we should try to limit that, and growing > the > number of pre-allocated blocks over time seems reasonable. There's another danger here: it doesn't matter how well the filesystem deals with sparse writes, because ltsWriteBlock fills in the holes with zeros anyway. That's potentially a significant amount of wasted IO effort if we aren't careful. Regards, Jeff Davis
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index bc8d56807e2..84f9d6d2358 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -110,6 +110,8 @@ typedef struct TapeBlockTrailer #define TapeBlockSetNBytes(buf, nbytes) \ (TapeBlockGetTrailer(buf)->next = -(nbytes)) +#define TAPE_WRITE_PREALLOC_MIN 8 +#define TAPE_WRITE_PREALLOC_MAX 128 /* * This data structure represents a single "logical tape" within the set @@ -151,6 +153,21 @@ typedef struct LogicalTape int max_size; /* highest useful, safe buffer_size */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */ + + /* + * When multiple tapes are being written to concurrently (as in HashAgg), + * avoid excessive fragmentation by preallocating block numbers to + * individual tapes. The preallocated block numbers are held in an array + * sorted in descending order; blocks are consumed from the end of the + * array (lowest block numbers first). + * + * No filesystem operations are performed for preallocation; only the + * block numbers are reserved. This may lead to sparse writes, which will + * cause ltsWriteBlock() to fill in holes with zeros. + */ + long *prealloc; + int nprealloc; /* number of elements in list */ + int prealloc_size; /* number of elements list can hold */ } LogicalTape; /* @@ -198,6 +215,7 @@ struct LogicalTapeSet static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer); static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer); static long ltsGetFreeBlock(LogicalTapeSet *lts); +static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt); static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum); static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared, SharedFileSet *fileset); @@ -397,6 +415,51 @@ ltsGetFreeBlock(LogicalTapeSet *lts) return blocknum; } +/* + * Return the lowest free block number from the tape's preallocation list. + * Refill the preallocation list if necessary. + */ +static long +ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt) +{ + /* sorted in descending order, so return the last element */ + if (lt->nprealloc > 0) + return lt->prealloc[--lt->nprealloc]; + + if (lt->prealloc == NULL) + { + lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN; + lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size); + } + else + { + /* when the preallocation list runs out, double the size */ + if (lt->prealloc_size * 2 <= TAPE_WRITE_PREALLOC_MAX) + lt->prealloc_size *= 2; + lt->prealloc = (long *) repalloc(lt->prealloc, + sizeof(long) * lt->prealloc_size); + } + + /* refill preallocation list */ + lt->nprealloc = lt->prealloc_size; + for (int i = lt->nprealloc; i > 0; i--) + lt->prealloc[i - 1] = ltsGetFreeBlock(lts); + +#ifdef USE_ASSERT_CHECKING + { + /* verify that list is in reverse sorted order */ + long last_block_number = -1; + for (int i = lt->nprealloc; i > 0; i--) + { + Assert(lt->prealloc[i - 1] > last_block_number); + last_block_number = lt->prealloc[i - 1]; + } + } +#endif + + return lt->prealloc[--lt->nprealloc]; +} + /* * Return a block# to the freelist. */ @@ -557,6 +620,9 @@ ltsInitTape(LogicalTape *lt) lt->max_size = MaxAllocSize; lt->pos = 0; lt->nbytes = 0; + lt->prealloc = NULL; + lt->nprealloc = 0; + lt->prealloc_size = 0; } /* @@ -709,7 +775,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, Assert(lt->firstBlockNumber == -1); Assert(lt->pos == 0); - lt->curBlockNumber = ltsGetFreeBlock(lts); + lt->curBlockNumber = ltsGetPreallocBlock(lts, lt); lt->firstBlockNumber = lt->curBlockNumber; TapeBlockGetTrailer(lt->buffer)->prev = -1L; @@ -733,7 +799,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, * First allocate the next block, so that we can store it in the * 'next' pointer of this block. */ - nextBlockNumber = ltsGetFreeBlock(lts); + nextBlockNumber = ltsGetPreallocBlock(lts, lt); /* set the next-pointer and dump the current block. */ TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber; @@ -835,13 +901,23 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size) Assert(lt->frozen); } - /* Allocate a read buffer (unless the tape is empty) */ if (lt->buffer) pfree(lt->buffer); /* the buffer is lazily allocated, but set the size here */ lt->buffer = NULL; lt->buffer_size = buffer_size; + + /* free the preallocation list, and return unused block numbers */ + if (lt->prealloc != NULL) + { + for (int i = lt->nprealloc; i > 0; i--) + ltsReleaseBlock(lts, lt->prealloc[i - 1]); + pfree(lt->prealloc); + lt->prealloc = NULL; + lt->nprealloc = 0; + lt->prealloc_size = 0; + } } /*