On Tue, 2020-05-26 at 17:40 -0700, Jeff Davis wrote: > On Tue, 2020-05-26 at 21:15 +0200, Tomas Vondra wrote: > > Yeah. I agree prefetching is definitely out of v13 scope. It might > > be > > interesting to try how useful would it be, if you're willing to > > spend > > some time on a prototype. > > I think a POC would be pretty quick; I'll see if I can hack something > together.
Attached (intended for v14). I changed the list from a simple array to a circular buffer so that we can keep enough preallocated block numbers in it to do prefetching. On SSD I didn't see any improvement, but perhaps it will do better on magnetic storage. Regards, Jeff Davis
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index d7f99d9944c..89edbc4c579 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3941,6 +3941,9 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_BUFFILE_WRITE: event_name = "BufFileWrite"; break; + case WAIT_EVENT_BUFFILE_PREFETCH: + event_name = "BufFilePrefetch"; + break; case WAIT_EVENT_CONTROL_FILE_READ: event_name = "ControlFileRead"; break; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 35e8f12e62d..4e2d4f059a7 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -618,6 +618,20 @@ BufFileWrite(BufFile *file, void *ptr, size_t size) return nwritten; } +/* + * BufFilePrefetch + */ +void +BufFilePrefetchBlock(BufFile *buffile, long blknum) +{ + int filenum = (blknum / BUFFILE_SEG_SIZE); + off_t offset = (blknum % BUFFILE_SEG_SIZE) * BLCKSZ; + File file = buffile->files[filenum]; + + (void) FilePrefetch(file, offset, BLCKSZ, + WAIT_EVENT_BUFFILE_PREFETCH); +} + /* * BufFileFlush * diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index 666a7c0e81c..014ec67b8ff 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -97,6 +97,7 @@ typedef struct TapeBlockTrailer * block */ long next; /* next block on this tape, or # of valid * bytes on last block (if < 0) */ + long prefetch; /* block to prefetch when reading this one */ } TapeBlockTrailer; #define TapeBlockPayloadSize (BLCKSZ - sizeof(TapeBlockTrailer)) @@ -123,6 +124,11 @@ typedef struct TapeBlockTrailer #define TAPE_WRITE_PREALLOC_MIN 8 #define TAPE_WRITE_PREALLOC_MAX 128 +#define TAPE_PREFETCH_DISTANCE_MAX 64 + +StaticAssertDecl(TAPE_PREFETCH_DISTANCE_MAX * 2 <= TAPE_WRITE_PREALLOC_MAX, + "Tape prefetch distance too large."); + /* * This data structure represents a single "logical tape" within the set * of logical tapes stored in the same file. @@ -165,13 +171,13 @@ typedef struct LogicalTape int nbytes; /* total # of valid bytes in buffer */ /* - * Preallocated block numbers are held in an array sorted in descending - * order; blocks are consumed from the end of the array (lowest block - * numbers first). + * Preallocated block numbers are held in a sorted circular array. */ long *prealloc; - int nprealloc; /* number of elements in list */ - int prealloc_size; /* number of elements list can hold */ + int prealloc_next; /* next element in circular array */ + int prealloc_nelem; /* number of elements in circular array */ + int prealloc_size; /* total size of circular array */ + int prefetch_dist; /* distance to prefetch */ } LogicalTape; /* @@ -219,7 +225,8 @@ struct LogicalTapeSet static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer); static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer); static long ltsGetFreeBlock(LogicalTapeSet *lts); -static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt); +static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt, + long *prefetch_block); static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum); static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared, SharedFileSet *fileset); @@ -329,6 +336,10 @@ ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt) else lt->nextBlockNumber = TapeBlockGetTrailer(thisbuf)->next; + if (TapeBlockGetTrailer(thisbuf)->prefetch >= 0) + BufFilePrefetchBlock(lts->pfile, + TapeBlockGetTrailer(thisbuf)->prefetch); + /* Advance to next block, if we have buffer space left */ } while (lt->buffer_size - lt->nbytes > BLCKSZ); @@ -424,38 +435,79 @@ ltsGetFreeBlock(LogicalTapeSet *lts) * Refill the preallocation list if necessary. */ static long -ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt) +ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt, long *prefetch) { - /* sorted in descending order, so return the last element */ - if (lt->nprealloc > 0) - return lt->prealloc[--lt->nprealloc]; + long block; + int prefetch_idx; + + if (lt->prealloc_nelem > lt->prefetch_dist) + { + prefetch_idx = (lt->prealloc_next + lt->prefetch_dist) % lt->prealloc_size; + *prefetch = lt->prealloc[prefetch_idx]; + block = lt->prealloc[lt->prealloc_next]; + lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size; + lt->prealloc_nelem--; + return block; + } + /* get a preallocated block number, or allocate a new block number */ + if (lt->prealloc_nelem > 0) + { + block = lt->prealloc[lt->prealloc_next]; + lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size; + lt->prealloc_nelem--; + } + else + block = ltsGetFreeBlock(lts); + + /* create or grow the circular buffer, as needed */ if (lt->prealloc == NULL) { lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN; + lt->prefetch_dist = lt->prealloc_size / 2; lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size); } else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX) { + int oldsize = lt->prealloc_size; + int n; + /* when the preallocation list runs out, double the size */ lt->prealloc_size *= 2; - if (lt->prealloc_size > TAPE_WRITE_PREALLOC_MAX) - lt->prealloc_size = TAPE_WRITE_PREALLOC_MAX; - lt->prealloc = (long *) repalloc(lt->prealloc, - sizeof(long) * lt->prealloc_size); + Assert(lt->prealloc_size <= TAPE_WRITE_PREALLOC_MAX); + + /* + * The prefetch distance is always half the prealloc size, until the + * maximum is reached. + */ + lt->prefetch_dist *= 2; + if (lt->prefetch_dist > TAPE_PREFETCH_DISTANCE_MAX) + lt->prefetch_dist = TAPE_PREFETCH_DISTANCE_MAX; + + lt->prealloc = (long *) repalloc( + lt->prealloc, sizeof(long) * lt->prealloc_size); + + /* now that the buffer is larger, move elements into the new space */ + n = lt->prealloc_nelem - (oldsize - lt->prealloc_next); + if (n > 0) + memcpy(lt->prealloc + oldsize, lt->prealloc, sizeof(long) * n); } - /* refill preallocation list */ - lt->nprealloc = lt->prealloc_size; - for (int i = lt->nprealloc; i > 0; i--) + /* preallocate more block numbers */ + while (lt->prealloc_nelem < lt->prealloc_size) { - lt->prealloc[i - 1] = ltsGetFreeBlock(lts); + int end; - /* verify descending order */ - Assert(i == lt->nprealloc || lt->prealloc[i - 1] > lt->prealloc[i]); + end = (lt->prealloc_next + lt->prealloc_nelem) % lt->prealloc_size; + lt->prealloc[end] = ltsGetFreeBlock(lts); + lt->prealloc_nelem++; } - return lt->prealloc[--lt->nprealloc]; + Assert(lt->prealloc_nelem >= lt->prefetch_dist); + + prefetch_idx = (lt->prealloc_next + lt->prefetch_dist) % lt->prealloc_size; + *prefetch = lt->prealloc[prefetch_idx]; + return block; } /* @@ -619,8 +671,10 @@ ltsInitTape(LogicalTape *lt) lt->pos = 0; lt->nbytes = 0; lt->prealloc = NULL; - lt->nprealloc = 0; + lt->prealloc_next = 0; + lt->prealloc_nelem = 0; lt->prealloc_size = 0; + lt->prefetch_dist = 0; } /* @@ -770,13 +824,16 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, } if (lt->curBlockNumber == -1) { + long prefetch; + Assert(lt->firstBlockNumber == -1); Assert(lt->pos == 0); - lt->curBlockNumber = ltsGetPreallocBlock(lts, lt); + lt->curBlockNumber = ltsGetPreallocBlock(lts, lt, &prefetch); lt->firstBlockNumber = lt->curBlockNumber; TapeBlockGetTrailer(lt->buffer)->prev = -1L; + TapeBlockGetTrailer(lt->buffer)->prefetch = prefetch; } Assert(lt->buffer_size == BLCKSZ); @@ -786,6 +843,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, { /* Buffer full, dump it out */ long nextBlockNumber; + long prefetch; if (!lt->dirty) { @@ -797,7 +855,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, * First allocate the next block, so that we can store it in the * 'next' pointer of this block. */ - nextBlockNumber = ltsGetPreallocBlock(lts, lt); + nextBlockNumber = ltsGetPreallocBlock(lts, lt, &prefetch); /* set the next-pointer and dump the current block. */ TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber; @@ -805,6 +863,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, /* initialize the prev-pointer of the next block */ TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber; + TapeBlockGetTrailer(lt->buffer)->prefetch = prefetch; lt->curBlockNumber = nextBlockNumber; lt->pos = 0; lt->nbytes = 0; @@ -909,12 +968,20 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size) /* free the preallocation list, and return unused block numbers */ if (lt->prealloc != NULL) { - for (int i = lt->nprealloc; i > 0; i--) - ltsReleaseBlock(lts, lt->prealloc[i - 1]); + while (lt->prealloc_nelem > 0) + { + long block = lt->prealloc[lt->prealloc_next]; + lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size; + lt->prealloc_nelem--; + ltsReleaseBlock(lts, block); + } + pfree(lt->prealloc); lt->prealloc = NULL; - lt->nprealloc = 0; + lt->prealloc_next = 0; + lt->prealloc_nelem = 0; lt->prealloc_size = 0; + lt->prefetch_dist = 0; } } diff --git a/src/include/pgstat.h b/src/include/pgstat.h index c55dc1481ca..c95fbf67ef9 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -915,6 +915,7 @@ typedef enum { WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO, WAIT_EVENT_BUFFILE_WRITE, + WAIT_EVENT_BUFFILE_PREFETCH, WAIT_EVENT_CONTROL_FILE_READ, WAIT_EVENT_CONTROL_FILE_SYNC, WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index 60433f35b45..016971ef29d 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -40,6 +40,7 @@ extern BufFile *BufFileCreateTemp(bool interXact); extern void BufFileClose(BufFile *file); extern size_t BufFileRead(BufFile *file, void *ptr, size_t size); extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size); +extern void BufFilePrefetchBlock(BufFile *file, long blknum); extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence); extern void BufFileTell(BufFile *file, int *fileno, off_t *offset); extern int BufFileSeekBlock(BufFile *file, long blknum);