On 10.05.23 20:10, Andres Freund wrote:
So if you want to understand what smgrzeroextend() does, you need to
mentally combine the documentation of three different functions. Could we
write documentation for each function that stands on its own? And document
the function arguments, like what does blocknum and nblocks mean?
I guess it couldn't hurt. But if we go down that route, we basically need to
rewrite all the function headers in smgr.c, I think.
I took a stab at this, going through the function comments in smgr.c and
md.c and try to make some things easier to follow.
- Took at out the weird leading tabs in the older comments.
- Rephrased some comments so that smgr.c is more like an API
documentation and md.c documents what that particular instance of that
API does.
- Move the *extend and *zeroextend functions to a more sensible place
among all the functions. Especially since *write and *extend are very
similar, it makes sense to have them close together. This way, it's
also easier to follow "this function is like that function" comments.
- Also moved mdwriteback(), which was in some completely odd place.
- Added comments for function arguments that were previously not documented.
- Reworded the comments for *extend and *zeroextend to make more sense
relative to each other.
- I left this for smgrzeroextend():
FIXME: why both blocknum and nblocks
Like, what happens when blocknum is not the block right after the last
existing block? Do you get an implicit POSIX hole between the end of
the file and the specified block and then an explicit hole for the next
nblocks? We should be clear here what the designer of this API intended.
The name smgrzeroextend is not great. The "zero" isn't what
distinguishes it from smgrextend.
From fadd72afcf78a55a2cfd32217b317f17a9147962 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Tue, 16 May 2023 16:10:48 +0200
Subject: [PATCH] WIP: Improve smgr source code comments
---
src/backend/storage/smgr/md.c | 501 ++++++++++++++++----------------
src/backend/storage/smgr/smgr.c | 251 ++++++++--------
src/include/storage/md.h | 8 +-
src/include/storage/smgr.h | 8 +-
4 files changed, 382 insertions(+), 386 deletions(-)
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e982a8dd7f..4115a24b3f 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -154,7 +154,7 @@ _mdfd_open_flags(void)
}
/*
- * mdinit() -- Initialize private state for magnetic disk storage manager.
+ * mdinit() -- Initialize private state for magnetic disk storage manager.
*/
void
mdinit(void)
@@ -165,7 +165,7 @@ mdinit(void)
}
/*
- * mdexists() -- Does the physical file exist?
+ * mdexists() -- Does the physical file exist?
*
* Note: this will return true for lingering files, with pending deletions
*/
@@ -184,7 +184,7 @@ mdexists(SMgrRelation reln, ForkNumber forknum)
}
/*
- * mdcreate() -- Create a new relation on magnetic disk.
+ * mdcreate() -- Create a new relation on magnetic disk.
*
* If isRedo is true, it's okay for the relation to exist already.
*/
@@ -242,7 +242,7 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
}
/*
- * mdunlink() -- Unlink a relation.
+ * mdunlink() -- Unlink a relation.
*
* Note that we're passed a RelFileLocatorBackend --- by the time this is
called,
* there won't be an SMgrRelation hashtable entry anymore.
@@ -447,183 +447,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber
forknum, bool isRedo)
}
/*
- * mdextend() -- Add a block to the specified relation.
- *
- * The semantics are nearly the same as mdwrite(): write at the
- * specified position. However, this is to be used for the case of
- * extending a relation (i.e., blocknum is at or beyond the current
- * EOF). Note that we assume writing a block beyond current EOF
- * causes intervening file space to become filled with zeroes.
- */
-void
-mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- const void *buffer, bool skipFsync)
-{
- off_t seekpos;
- int nbytes;
- MdfdVec *v;
-
- /* If this build supports direct I/O, the buffer must be I/O aligned. */
- if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
- Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE,
buffer));
-
- /* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
- Assert(blocknum >= mdnblocks(reln, forknum));
-#endif
-
- /*
- * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
any
- * more --- we mustn't create a block whose number actually is
- * InvalidBlockNumber. (Note that this failure should be unreachable
- * because of upstream checks in bufmgr.c.)
- */
- if (blocknum == InvalidBlockNumber)
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("cannot extend file \"%s\" beyond %u
blocks",
- relpath(reln->smgr_rlocator,
forknum),
- InvalidBlockNumber)));
-
- v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
-
- seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
-
- Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
-
- if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos,
WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
- {
- if (nbytes < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not extend file \"%s\":
%m",
-
FilePathName(v->mdfd_vfd)),
- errhint("Check free disk space.")));
- /* short write: complain appropriately */
- ereport(ERROR,
- (errcode(ERRCODE_DISK_FULL),
- errmsg("could not extend file \"%s\": wrote
only %d of %d bytes at block %u",
- FilePathName(v->mdfd_vfd),
- nbytes, BLCKSZ, blocknum),
- errhint("Check free disk space.")));
- }
-
- if (!skipFsync && !SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, v);
-
- Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
-}
-
-/*
- * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
- *
- * Similar to mdextend(), except the relation can be extended by
multiple
- * blocks at once and the added blocks will be filled with zeroes.
- */
-void
-mdzeroextend(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, int nblocks, bool skipFsync)
-{
- MdfdVec *v;
- BlockNumber curblocknum = blocknum;
- int remblocks = nblocks;
-
- Assert(nblocks > 0);
-
- /* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
- Assert(blocknum >= mdnblocks(reln, forknum));
-#endif
-
- /*
- * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
any
- * more --- we mustn't create a block whose number actually is
- * InvalidBlockNumber or larger.
- */
- if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("cannot extend file \"%s\" beyond %u
blocks",
- relpath(reln->smgr_rlocator,
forknum),
- InvalidBlockNumber)));
-
- while (remblocks > 0)
- {
- BlockNumber segstartblock = curblocknum % ((BlockNumber)
RELSEG_SIZE);
- off_t seekpos = (off_t) BLCKSZ * segstartblock;
- int numblocks;
-
- if (segstartblock + remblocks > RELSEG_SIZE)
- numblocks = RELSEG_SIZE - segstartblock;
- else
- numblocks = remblocks;
-
- v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync,
EXTENSION_CREATE);
-
- Assert(segstartblock < RELSEG_SIZE);
- Assert(segstartblock + numblocks <= RELSEG_SIZE);
-
- /*
- * If available and useful, use posix_fallocate() (via
FileAllocate())
- * to extend the relation. That's often more efficient than
using
- * write(), as it commonly won't cause the kernel to allocate
page
- * cache space for the extended pages.
- *
- * However, we don't use FileAllocate() for small extensions,
as it
- * defeats delayed allocation on some filesystems. Not clear
where
- * that decision should be made though? For now just use a
cutoff of
- * 8, anything between 4 and 8 worked OK in some local testing.
- */
- if (numblocks > 8)
- {
- int ret;
-
- ret = FileFallocate(v->mdfd_vfd,
- seekpos,
(off_t) BLCKSZ * numblocks,
-
WAIT_EVENT_DATA_FILE_EXTEND);
- if (ret != 0)
- {
- ereport(ERROR,
- errcode_for_file_access(),
- errmsg("could not extend file
\"%s\" with FileFallocate(): %m",
-
FilePathName(v->mdfd_vfd)),
- errhint("Check free disk
space."));
- }
- }
- else
- {
- int ret;
-
- /*
- * Even if we don't want to use fallocate, we can still
extend a
- * bit more efficiently than writing each 8kB block
individually.
- * pg_pwrite_zeros() (via FileZero()) uses
- * pg_pwritev_with_retry() to avoid multiple writes or
needing a
- * zeroed buffer for the whole length of the extension.
- */
- ret = FileZero(v->mdfd_vfd,
- seekpos, (off_t) BLCKSZ *
numblocks,
- WAIT_EVENT_DATA_FILE_EXTEND);
- if (ret < 0)
- ereport(ERROR,
- errcode_for_file_access(),
- errmsg("could not extend file
\"%s\": %m",
-
FilePathName(v->mdfd_vfd)),
- errhint("Check free disk
space."));
- }
-
- if (!skipFsync && !SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, v);
-
- Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)
RELSEG_SIZE));
-
- remblocks -= numblocks;
- curblocknum += numblocks;
- }
-}
-
-/*
- * mdopenfork() -- Open one fork of the specified relation.
+ * mdopenfork() -- Open one fork of the specified relation.
*
* Note we only open the first segment, when there are multiple segments.
*
@@ -673,7 +497,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int
behavior)
}
/*
- * mdopen() -- Initialize newly-opened relation.
+ * mdopen() -- Initialize newly-opened relation.
*/
void
mdopen(SMgrRelation reln)
@@ -684,7 +508,7 @@ mdopen(SMgrRelation reln)
}
/*
- * mdclose() -- Close the specified relation, if it isn't closed already.
+ * mdclose() -- Close the specified relation, if it isn't closed already.
*/
void
mdclose(SMgrRelation reln, ForkNumber forknum)
@@ -707,7 +531,7 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
}
/*
- * mdprefetch() -- Initiate asynchronous read of the specified block of a
relation
+ * mdprefetch() -- Initiate asynchronous read of the specified block of a
relation
*/
bool
mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
@@ -734,64 +558,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum)
}
/*
- * mdwriteback() -- Tell the kernel to write pages back to storage.
- *
- * This accepts a range of blocks because flushing several pages at once is
- * considerably more efficient than doing so individually.
- */
-void
-mdwriteback(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, BlockNumber nblocks)
-{
- Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
-
- /*
- * Issue flush requests in as few requests as possible; have to split at
- * segment boundaries though, since those are actually separate files.
- */
- while (nblocks > 0)
- {
- BlockNumber nflush = nblocks;
- off_t seekpos;
- MdfdVec *v;
- int segnum_start,
- segnum_end;
-
- v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
- EXTENSION_DONT_OPEN);
-
- /*
- * We might be flushing buffers of already removed relations,
that's
- * ok, just ignore that case. If the segment file wasn't open
already
- * (ie from a recent mdwrite()), then we don't want to re-open
it, to
- * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might
leave
- * us with a descriptor to a file that is about to be unlinked.
- */
- if (!v)
- return;
-
- /* compute offset inside the current segment */
- segnum_start = blocknum / RELSEG_SIZE;
-
- /* compute number of desired writes within the current segment
*/
- segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
- if (segnum_start != segnum_end)
- nflush = RELSEG_SIZE - (blocknum % ((BlockNumber)
RELSEG_SIZE));
-
- Assert(nflush >= 1);
- Assert(nflush <= nblocks);
-
- seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber)
RELSEG_SIZE));
-
- FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush,
WAIT_EVENT_DATA_FILE_FLUSH);
-
- nblocks -= nflush;
- blocknum += nflush;
- }
-}
-
-/*
- * mdread() -- Read the specified block from a relation.
+ * mdread() -- Read the specified block from a relation.
*/
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -856,11 +623,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber
blocknum,
}
/*
- * mdwrite() -- Write the supplied block at the appropriate location.
- *
- * This is to be used only for updating already-existing blocks of
a
- * relation (ie, those before the current EOF). To extend a
relation,
- * use mdextend().
+ * mdwrite() -- Write the supplied block at the appropriate location.
*/
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -924,12 +687,242 @@ mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
}
/*
- * mdnblocks() -- Get the number of blocks stored in a relation.
+ * mdextend() -- Add a block to the specified relation.
+ *
+ * This is to be used for the case of extending a relation (i.e., blocknum is
+ * at or beyond the current EOF). Note that writing a block beyond current
+ * EOF must cause the intervening file space to become filled with zeroes.
+ * The POSIX file system APIs do that automatically, so we don't need to do
+ * anything about that.
+ */
+void
+mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
+{
+ off_t seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ /* If this build supports direct I/O, the buffer must be I/O aligned. */
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE,
buffer));
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
any
+ * more --- we mustn't create a block whose number actually is
+ * InvalidBlockNumber. (Note that this failure should be unreachable
+ * because of upstream checks in bufmgr.c.)
+ */
+ if (blocknum == InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend file \"%s\" beyond %u
blocks",
+ relpath(reln->smgr_rlocator,
forknum),
+ InvalidBlockNumber)));
+
+ v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos,
WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend file \"%s\":
%m",
+
FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space.")));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not extend file \"%s\": wrote
only %d of %d bytes at block %u",
+ FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ, blocknum),
+ errhint("Check free disk space.")));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+}
+
+/*
+ * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
+ */
+void
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+ MdfdVec *v;
+ BlockNumber curblocknum = blocknum;
+ int remblocks = nblocks;
+
+ Assert(nblocks > 0);
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
any
+ * more --- we mustn't create a block whose number actually is
+ * InvalidBlockNumber or larger.
+ */
+ if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend file \"%s\" beyond %u
blocks",
+ relpath(reln->smgr_rlocator,
forknum),
+ InvalidBlockNumber)));
+
+ while (remblocks > 0)
+ {
+ BlockNumber segstartblock = curblocknum % ((BlockNumber)
RELSEG_SIZE);
+ off_t seekpos = (off_t) BLCKSZ * segstartblock;
+ int numblocks;
+
+ if (segstartblock + remblocks > RELSEG_SIZE)
+ numblocks = RELSEG_SIZE - segstartblock;
+ else
+ numblocks = remblocks;
+
+ v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync,
EXTENSION_CREATE);
+
+ Assert(segstartblock < RELSEG_SIZE);
+ Assert(segstartblock + numblocks <= RELSEG_SIZE);
+
+ /*
+ * If available and useful, use posix_fallocate() (via
FileAllocate())
+ * to extend the relation. That's often more efficient than
using
+ * write(), as it commonly won't cause the kernel to allocate
page
+ * cache space for the extended pages.
+ *
+ * However, we don't use FileAllocate() for small extensions,
as it
+ * defeats delayed allocation on some filesystems. Not clear
where
+ * that decision should be made though? For now just use a
cutoff of
+ * 8, anything between 4 and 8 worked OK in some local testing.
+ */
+ if (numblocks > 8)
+ {
+ int ret;
+
+ ret = FileFallocate(v->mdfd_vfd,
+ seekpos,
(off_t) BLCKSZ * numblocks,
+
WAIT_EVENT_DATA_FILE_EXTEND);
+ if (ret != 0)
+ {
+ ereport(ERROR,
+ errcode_for_file_access(),
+ errmsg("could not extend file
\"%s\" with FileFallocate(): %m",
+
FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk
space."));
+ }
+ }
+ else
+ {
+ int ret;
+
+ /*
+ * Even if we don't want to use fallocate, we can still
extend a
+ * bit more efficiently than writing each 8kB block
individually.
+ * pg_pwrite_zeros() (via FileZero()) uses
+ * pg_pwritev_with_retry() to avoid multiple writes or
needing a
+ * zeroed buffer for the whole length of the extension.
+ */
+ ret = FileZero(v->mdfd_vfd,
+ seekpos, (off_t) BLCKSZ *
numblocks,
+ WAIT_EVENT_DATA_FILE_EXTEND);
+ if (ret < 0)
+ ereport(ERROR,
+ errcode_for_file_access(),
+ errmsg("could not extend file
\"%s\": %m",
+
FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk
space."));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)
RELSEG_SIZE));
+
+ remblocks -= numblocks;
+ curblocknum += numblocks;
+ }
+}
+
+/*
+ * mdwriteback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ */
+void
+mdwriteback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks)
+{
+ Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
+ /*
+ * Issue flush requests in as few requests as possible; have to split at
+ * segment boundaries though, since those are actually separate files.
+ */
+ while (nblocks > 0)
+ {
+ BlockNumber nflush = nblocks;
+ off_t seekpos;
+ MdfdVec *v;
+ int segnum_start,
+ segnum_end;
+
+ v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
+ EXTENSION_DONT_OPEN);
+
+ /*
+ * We might be flushing buffers of already removed relations,
that's
+ * ok, just ignore that case. If the segment file wasn't open
already
+ * (ie from a recent mdwrite()), then we don't want to re-open
it, to
+ * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might
leave
+ * us with a descriptor to a file that is about to be unlinked.
+ */
+ if (!v)
+ return;
+
+ /* compute offset inside the current segment */
+ segnum_start = blocknum / RELSEG_SIZE;
+
+ /* compute number of desired writes within the current segment
*/
+ segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
+ if (segnum_start != segnum_end)
+ nflush = RELSEG_SIZE - (blocknum % ((BlockNumber)
RELSEG_SIZE));
+
+ Assert(nflush >= 1);
+ Assert(nflush <= nblocks);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber)
RELSEG_SIZE));
+
+ FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush,
WAIT_EVENT_DATA_FILE_FLUSH);
+
+ nblocks -= nflush;
+ blocknum += nflush;
+ }
+}
+
+/*
+ * mdnblocks() -- Get the number of blocks stored in a relation.
*
- * Important side effect: all active segments of the relation are
opened
- * and added to the md_seg_fds array. If this routine has not been
- * called, then only segments up to the last one actually touched
- * are present in the array.
+ * Important side effect: all active segments of the relation are opened
+ * and added to the md_seg_fds array. If this routine has not been
+ * called, then only segments up to the last one actually touched
+ * are present in the array.
*/
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
@@ -986,7 +979,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
}
/*
- * mdtruncate() -- Truncate relation to specified number of blocks.
+ * mdtruncate() -- Truncate relation to specified number of blocks.
*/
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
@@ -1080,7 +1073,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks)
}
/*
- * mdimmedsync() -- Immediately sync a relation to stable storage.
+ * mdimmedsync() -- Immediately sync a relation to stable storage.
*
* Note that only writes already issued are synced; this routine knows
* nothing of dirty buffers that may exist inside the buffer manager. We
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 70d0d570b1..d983a30475 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -30,7 +30,9 @@
/*
* This struct of function pointers defines the API between smgr.c and
- * any individual storage manager module. Note that smgr subfunctions are
+ * any individual storage manager module.
+ *
+ * Note that smgr subfunctions are
* generally expected to report problems via elog(ERROR). An exception is
* that smgr_unlink should use elog(WARNING), rather than erroring out,
* because we normally unlink relations during post-commit/abort cleanup,
@@ -49,16 +51,16 @@ typedef struct f_smgr
bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_unlink) (RelFileLocatorBackend rlocator,
ForkNumber forknum,
bool isRedo);
- void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber
blocknum, const void *buffer, bool skipFsync);
- void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber
forknum,
-
BlockNumber blocknum, int nblocks, bool skipFsync);
bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
BlockNumber
blocknum);
void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
void *buffer);
void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
BlockNumber
blocknum, const void *buffer, bool skipFsync);
+ void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber
blocknum, const void *buffer, bool skipFsync);
+ void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber
forknum,
+
BlockNumber blocknum, int nblocks, bool skipFsync);
void (*smgr_writeback) (SMgrRelation reln, ForkNumber
forknum,
BlockNumber
blocknum, BlockNumber nblocks);
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
@@ -104,8 +106,7 @@ static void smgrshutdown(int code, Datum arg);
/*
- * smgrinit(), smgrshutdown() -- Initialize or shut down storage
- * managers.
+ * smgrinit() -- Initialize storage managers.
*
* Note: smgrinit is called during backend startup (normal or standalone
* case), *not* during postmaster start. Therefore, any resources created
@@ -142,9 +143,11 @@ smgrshutdown(int code, Datum arg)
}
/*
- * smgropen() -- Return an SMgrRelation object, creating it if need be.
+ * smgropen() -- Return an SMgrRelation object, creating it if need be.
+ *
+ * "backend" is for temporary tables, otherwise InvalidBackendId.
*
- * This does not attempt to actually open the underlying file.
+ * This does not attempt to actually open the underlying file.
*/
SMgrRelation
smgropen(RelFileLocator rlocator, BackendId backend)
@@ -245,7 +248,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln)
}
/*
- * smgrexists() -- Does the underlying file for a fork exist?
+ * smgrexists() -- Does the underlying file for a fork exist?
*/
bool
smgrexists(SMgrRelation reln, ForkNumber forknum)
@@ -254,7 +257,7 @@ smgrexists(SMgrRelation reln, ForkNumber forknum)
}
/*
- * smgrclose() -- Close and delete an SMgrRelation object.
+ * smgrclose() -- Close and delete an SMgrRelation object.
*/
void
smgrclose(SMgrRelation reln)
@@ -284,9 +287,9 @@ smgrclose(SMgrRelation reln)
}
/*
- * smgrrelease() -- Release all resources used by this object.
+ * smgrrelease() -- Release all resources used by this object.
*
- * The object remains valid.
+ * The object remains valid.
*/
void
smgrrelease(SMgrRelation reln)
@@ -299,9 +302,9 @@ smgrrelease(SMgrRelation reln)
}
/*
- * smgrreleaseall() -- Release resources used by all objects.
+ * smgrreleaseall() -- Release resources used by all objects.
*
- * This is called for PROCSIGNAL_BARRIER_SMGRRELEASE.
+ * This is called for PROCSIGNAL_BARRIER_SMGRRELEASE.
*/
void
smgrreleaseall(void)
@@ -320,7 +323,7 @@ smgrreleaseall(void)
}
/*
- * smgrcloseall() -- Close all existing SMgrRelation objects.
+ * smgrcloseall() -- Close all existing SMgrRelation objects.
*/
void
smgrcloseall(void)
@@ -339,8 +342,8 @@ smgrcloseall(void)
}
/*
- * smgrcloserellocator() -- Close SMgrRelation object for given
RelFileLocator,
- * if one exists.
+ * smgrcloserellocator() -- Close SMgrRelation object for given
+ * RelFileLocator, if one
exists.
*
* This has the same effects as smgrclose(smgropen(rlocator)), but it avoids
* uselessly creating a hashtable entry only to drop it again when no
@@ -363,11 +366,13 @@ smgrcloserellocator(RelFileLocatorBackend rlocator)
}
/*
- * smgrcreate() -- Create a new relation.
+ * smgrcreate() -- Create a new relation.
*
- * Given an already-created (but presumably unused) SMgrRelation,
- * cause the underlying disk file or other storage for the fork
- * to be created.
+ * Given an already-created (but presumably unused) SMgrRelation, cause the
+ * underlying disk file or other storage for the fork to be created.
+ *
+ * isRedo is true during recovery. In that case, the underlying storage may
+ * already exist.
*/
void
smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
@@ -376,13 +381,13 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool
isRedo)
}
/*
- * smgrdosyncall() -- Immediately sync all forks of all given relations
+ * smgrdosyncall() -- Immediately sync all forks of all given relations
*
- * All forks of all given relations are synced out to the store.
+ * All forks of all given relations are synced out to the store.
*
- * This is equivalent to FlushRelationBuffers() for each smgr
relation,
- * then calling smgrimmedsync() for all forks of each relation,
but it's
- * significantly quicker so should be preferred when possible.
+ * This is equivalent to FlushRelationBuffers() for each smgr relation, then
+ * calling smgrimmedsync() for all forks of each relation, but it's
+ * significantly quicker so should be preferred when possible.
*/
void
smgrdosyncall(SMgrRelation *rels, int nrels)
@@ -411,14 +416,13 @@ smgrdosyncall(SMgrRelation *rels, int nrels)
}
/*
- * smgrdounlinkall() -- Immediately unlink all forks of all given relations
+ * smgrdounlinkall() -- Immediately unlink all forks of all given relations
*
- * All forks of all given relations are removed from the store.
This
- * should not be used during transactional operations, since it
can't be
- * undone.
+ * All forks of all given relations are removed from the store. This should
+ * not be used during transactional operations, since it can't be undone.
*
- * If isRedo is true, it is okay for the underlying file(s) to be
gone
- * already.
+ * If isRedo is true, it is okay for the underlying file(s) to be gone
+ * already.
*/
void
smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
@@ -483,15 +487,64 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool
isRedo)
pfree(rlocators);
}
+/*
+ * smgrprefetch() -- Initiate asynchronous read of the specified block of a
relation.
+ *
+ * In recovery only, this can return false to indicate that a file doesn't
+ * exist (presumably it has been dropped by a later WAL record).
+ */
+bool
+smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+ return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
+}
+
+/*
+ * smgrread() -- read a particular block from a relation into the supplied
+ * buffer.
+ *
+ * This routine is called from the buffer manager in order to instantiate
+ * pages in the shared buffer cache. All storage managers return pages in the
+ * format that POSTGRES expects.
+ */
+void
+smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ void *buffer)
+{
+ smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
+}
+
+/*
+ * smgrwrite() -- Write the supplied buffer out.
+ *
+ * This is to be used only for updating already-existing blocks of a relation
+ * (ie, those before the current EOF). To extend a relation, use
+ * smgrextend().
+ *
+ * This is not a synchronous write -- the block is not necessarily on disk at
+ * return, only dumped out to the kernel. However, provisions will be made to
+ * fsync the write before the next checkpoint.
+ *
+ * skipFsync indicates that the caller will make other provisions to fsync the
+ * relation, so we needn't bother. Temporary relations also do not require
+ * fsync.
+ */
+void
+smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
+{
+ smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
+
buffer, skipFsync);
+}
/*
- * smgrextend() -- Add a new block to a file.
+ * smgrextend() -- Add a new block to a file.
*
- * The semantics are nearly the same as smgrwrite(): write at the
- * specified position. However, this is to be used for the case of
- * extending a relation (i.e., blocknum is at or beyond the current
- * EOF). Note that we assume writing a block beyond current EOF
- * causes intervening file space to become filled with zeroes.
+ * The semantics are nearly the same as smgrwrite(): write at the specified
+ * position. However, this is to be used for the case of extending a relation
+ * (i.e., blocknum is at or beyond the current EOF). Writing a block beyond
+ * current EOF must cause the intervening file space to become filled with
+ * zeroes.
*/
void
smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -512,11 +565,13 @@ smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
}
/*
- * smgrzeroextend() -- Add new zeroed out blocks to a file.
+ * smgrzeroextend() -- Add new zeroed out blocks to a file.
+ *
+ * Extend the relation by the given number of blocks, which will be filled
+ * with zeroes. This is similar to smgrextend() but only extends and does not
+ * write a buffer of data.
*
- * Similar to smgrextend(), except the relation can be extended by
- * multiple blocks at once and the added blocks will be filled with
- * zeroes.
+ * FIXME: why both blocknum and nblocks
*/
void
smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -537,60 +592,8 @@ smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
}
/*
- * smgrprefetch() -- Initiate asynchronous read of the specified block of
a relation.
- *
- * In recovery only, this can return false to indicate that a file
- * doesn't exist (presumably it has been dropped by a later WAL
- * record).
- */
-bool
-smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
-{
- return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
-}
-
-/*
- * smgrread() -- read a particular block from a relation into the supplied
- * buffer.
- *
- * This routine is called from the buffer manager in order to
- * instantiate pages in the shared buffer cache. All storage
managers
- * return pages in the format that POSTGRES expects.
- */
-void
-smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- void *buffer)
-{
- smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
-}
-
-/*
- * smgrwrite() -- Write the supplied buffer out.
- *
- * This is to be used only for updating already-existing blocks of
a
- * relation (ie, those before the current EOF). To extend a
relation,
- * use smgrextend().
- *
- * This is not a synchronous write -- the block is not necessarily
- * on disk at return, only dumped out to the kernel. However,
- * provisions will be made to fsync the write before the next
checkpoint.
- *
- * skipFsync indicates that the caller will make other provisions
to
- * fsync the relation, so we needn't bother. Temporary relations
also
- * do not require fsync.
- */
-void
-smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- const void *buffer, bool skipFsync)
-{
- smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
-
buffer, skipFsync);
-}
-
-
-/*
- * smgrwriteback() -- Trigger kernel writeback for the supplied range of
- * blocks.
+ * smgrwriteback() -- Trigger kernel writeback for the supplied range of
+ * blocks.
*/
void
smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -601,8 +604,8 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
}
/*
- * smgrnblocks() -- Calculate the number of blocks in the
- * supplied relation.
+ * smgrnblocks() -- Calculate the number of blocks in the
+ * supplied relation.
*/
BlockNumber
smgrnblocks(SMgrRelation reln, ForkNumber forknum)
@@ -622,8 +625,8 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
}
/*
- * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
- * relation.
+ * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
+ * relation.
*
* Returns an InvalidBlockNumber when not in recovery and when the relation
* fork size is not cached.
@@ -642,8 +645,8 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
}
/*
- * smgrtruncate() -- Truncate the given forks of supplied relation to
- * each specified numbers of blocks
+ * smgrtruncate() -- Truncate the given forks of supplied relation to
+ * each specified numbers of blocks
*
* The truncation is done immediately, so this can't be rolled back.
*
@@ -694,27 +697,27 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int
nforks, BlockNumber *nb
}
/*
- * smgrimmedsync() -- Force the specified relation to stable storage.
- *
- * Synchronously force all previous writes to the specified
relation
- * down to disk.
- *
- * This is useful for building completely new relations (eg, new
- * indexes). Instead of incrementally WAL-logging the index build
- * steps, we can just write completed index pages to disk with
smgrwrite
- * or smgrextend, and then fsync the completed index file before
- * committing the transaction. (This is sufficient for purposes of
- * crash recovery, since it effectively duplicates forcing a
checkpoint
- * for the completed index. But it is *not* sufficient if one
wishes
- * to use the WAL log for PITR or replication purposes: in that
case
- * we have to make WAL entries as well.)
- *
- * The preceding writes should specify skipFsync = true to avoid
- * duplicative fsyncs.
- *
- * Note that you need to do FlushRelationBuffers() first if there
is
- * any possibility that there are dirty buffers for the relation;
- * otherwise the sync is not very meaningful.
+ * smgrimmedsync() -- Force the specified relation to stable storage.
+ *
+ * Synchronously force all previous writes to the specified relation
+ * down to disk.
+ *
+ * This is useful for building completely new relations (eg, new
+ * indexes). Instead of incrementally WAL-logging the index build
+ * steps, we can just write completed index pages to disk with smgrwrite
+ * or smgrextend, and then fsync the completed index file before
+ * committing the transaction. (This is sufficient for purposes of
+ * crash recovery, since it effectively duplicates forcing a checkpoint
+ * for the completed index. But it is *not* sufficient if one wishes
+ * to use the WAL log for PITR or replication purposes: in that case
+ * we have to make WAL entries as well.)
+ *
+ * The preceding writes should specify skipFsync = true to avoid
+ * duplicative fsyncs.
+ *
+ * Note that you need to do FlushRelationBuffers() first if there is
+ * any possibility that there are dirty buffers for the relation;
+ * otherwise the sync is not very meaningful.
*/
void
smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 941879ee6a..8af34e4155 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -26,16 +26,16 @@ extern void mdclose(SMgrRelation reln, ForkNumber forknum);
extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool
isRedo);
-extern void mdextend(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, const void
*buffer, bool skipFsync);
-extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, int
nblocks, bool skipFsync);
extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void *buffer);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void
*buffer, bool skipFsync);
+extern void mdextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, const void
*buffer, bool skipFsync);
+extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int
nblocks, bool skipFsync);
extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
BlockNumber nblocks);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a9a179aaba..896512f1bc 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -90,16 +90,16 @@ extern void smgrreleaseall(void);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdosyncall(SMgrRelation *rels, int nrels);
extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
-extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, const void
*buffer, bool skipFsync);
-extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, int
nblocks, bool skipFsync);
extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void smgrread(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, void *buffer);
extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void
*buffer, bool skipFsync);
+extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, const void
*buffer, bool skipFsync);
+extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int
nblocks, bool skipFsync);
extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
BlockNumber nblocks);
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
--
2.40.1