Hi! Sorry for the delay; I was a bit busy last month. Anyway, here is my proposal for making multixact offsets 64-bit. The patch set consists of three parts: 0001 - making user output of offsets 64-bit ready; 0002 - making offsets 64-bit; 0003 - providing 32-bit to 64-bit conversion in pg_upgrade.
I'm pretty sure this is just the beginning of the conversation, so any opinions and reviews are, as always, very welcome! -- Best regards, Maxim Orlov.
From 2e1f05b3b0504153e57188e968bb19cb6741c087 Mon Sep 17 00:00:00 2001 From: Maxim Orlov <m.orlov@postgrespro.ru> Date: Wed, 6 Mar 2024 11:11:33 +0300 Subject: [PATCH v1 2/3] Use 64-bit multixact offsets. Author: Maxim Orlov <orlovmg@gmail.com> --- src/backend/access/transam/multixact.c | 182 ++----------------------- src/bin/pg_resetwal/pg_resetwal.c | 2 +- src/bin/pg_resetwal/t/001_basic.pl | 2 +- src/include/access/multixact.h | 2 +- src/include/c.h | 2 +- 5 files changed, 16 insertions(+), 174 deletions(-) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 57c5148933..f2a2aa9547 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -95,14 +95,6 @@ /* * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is * used everywhere else in Postgres. - * - * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, - * MultiXact page numbering also wraps around at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need - * take no explicit notice of that fact in this module, except when comparing - * segment and page numbers in TruncateMultiXact (see - * MultiXactOffsetPagePrecedes). */ /* We need four bytes per offset */ @@ -174,7 +166,7 @@ MXOffsetToMemberPage(MultiXactOffset offset) return offset / MULTIXACT_MEMBERS_PER_PAGE; } -static inline int +static inline int64 MXOffsetToMemberSegment(MultiXactOffset offset) { return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; @@ -271,9 +263,6 @@ typedef struct MultiXactStateData MultiXactId multiStopLimit; MultiXactId multiWrapLimit; - /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* * This is used to sleep until a multixact offset is written when we want * to create the next one. 
@@ -408,8 +397,6 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, - MultiXactOffset start, uint32 distance); static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); @@ -1158,78 +1145,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) else *offset = nextOffset; - /*---------- - * Protect against overrun of the members space as well, with the - * following rules: - * - * If we're past offsetStopLimit, refuse to generate more multis. - * If we're close to offsetStopLimit, emit a warning. - * - * Arbitrarily, we start emitting warnings when we're 20 segments or less - * from offsetStopLimit. - * - * Note we haven't updated the shared state yet, so if we fail at this - * point, the multixact ID we grabbed can still be used by the next guy. - * - * Note that there is no point in forcing autovacuum runs here: the - * multixact freeze settings would have to be reduced for that to have any - * effect. 
- *---------- - */ -#define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, - nmembers)) - { - /* see comment in the corresponding offsets wraparound case */ - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("multixact \"members\" limit exceeded"), - errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", - "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", - MultiXactState->offsetStopLimit - nextOffset - 1, - nmembers, - MultiXactState->offsetStopLimit - nextOffset - 1), - errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.", - MultiXactState->oldestMultiXactDB))); - } - - /* - * Check whether we should kick autovacuum into action, to prevent members - * wraparound. NB we use a much larger window to trigger autovacuum than - * just the warning limit. The warning is just a measure of last resort - - * this is in line with GetNewTransactionId's behaviour. - */ - if (!MultiXactState->oldestOffsetKnown || - (MultiXactState->nextOffset - MultiXactState->oldestOffset - > MULTIXACT_MEMBER_SAFE_THRESHOLD)) - { - /* - * To avoid swamping the postmaster with signals, we issue the autovac - * request only when crossing a segment boundary. With default - * compilation settings that's roughly after 50k members. This still - * gives plenty of chances before we get into real trouble. 
- */ - if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != - (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - } - - if (MultiXactState->oldestOffsetKnown && - MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, - nextOffset, - nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) - ereport(WARNING, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", - "database with OID %u must be vacuumed before %d more multixact members are used", - MultiXactState->offsetStopLimit - nextOffset + nmembers, - MultiXactState->oldestMultiXactDB, - MultiXactState->offsetStopLimit - nextOffset + nmembers), - errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings."))); - ExtendMultiXactMember(nextOffset, nmembers); /* @@ -1968,7 +1883,7 @@ MultiXactShmemInit(void) "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER, LWTRANCHE_MULTIXACTOFFSET_SLRU, SYNC_HANDLER_MULTIXACT_OFFSET, - false); + true); SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE); SimpleLruInit(MultiXactMemberCtl, "multixact_member", multixact_member_buffers, 0, @@ -2713,8 +2628,6 @@ SetOffsetVacuumLimit(bool is_startup) MultiXactOffset nextOffset; bool oldestOffsetKnown = false; bool prevOldestOffsetKnown; - MultiXactOffset offsetStopLimit = 0; - MultiXactOffset prevOffsetStopLimit; /* * NB: Have to prevent concurrent truncation, we might otherwise try to @@ -2729,7 +2642,6 @@ SetOffsetVacuumLimit(bool is_startup) nextOffset = MultiXactState->nextOffset; prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; prevOldestOffset = MultiXactState->oldestOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; Assert(MultiXactState->finishedStartup); 
LWLockRelease(MultiXactGenLock); @@ -2760,11 +2672,7 @@ SetOffsetVacuumLimit(bool is_startup) oldestOffsetKnown = find_multixact_start(oldestMultiXactId, &oldestOffset); - if (oldestOffsetKnown) - ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId member is at offset %u", - oldestOffset))); - else + if (!oldestOffsetKnown) ereport(LOG, (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", oldestMultiXactId))); @@ -2777,24 +2685,7 @@ SetOffsetVacuumLimit(bool is_startup) * overrun of old data in the members SLRU area. We can only do so if the * oldest offset is known though. */ - if (oldestOffsetKnown) - { - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - if (!prevOldestOffsetKnown && !is_startup) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - - ereport(DEBUG1, - (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", - offsetStopLimit, oldestMultiXactId))); - } - else if (prevOldestOffsetKnown) + if (!oldestOffsetKnown && prevOldestOffsetKnown) { /* * If we failed to get the oldest offset this time, but we have a @@ -2804,14 +2695,12 @@ SetOffsetVacuumLimit(bool is_startup) */ oldestOffset = prevOldestOffset; oldestOffsetKnown = true; - offsetStopLimit = prevOffsetStopLimit; } /* Install the computed values */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->oldestOffset = oldestOffset; MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->offsetStopLimit = offsetStopLimit; LWLockRelease(MultiXactGenLock); /* @@ -2821,54 +2710,6 @@ SetOffsetVacuumLimit(bool is_startup) (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); } -/* - * Return whether
adding "distance" to "start" would move past "boundary". - * - * We use this to determine whether the addition is "wrapping around" the - * boundary point, hence the name. The reason we don't want to use the regular - * 2^31-modulo arithmetic here is that we want to be able to use the whole of - * the 2^32-1 space here, allowing for more multixacts than would fit - * otherwise. - */ -static bool -MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, - uint32 distance) -{ - MultiXactOffset finish; - - /* - * Note that offset number 0 is not used (see GetMultiXactIdMembers), so - * if the addition wraps around the UINT_MAX boundary, skip that value. - */ - finish = start + distance; - if (finish < start) - finish++; - - /*----------------------------------------------------------------------- - * When the boundary is numerically greater than the starting point, any - * value numerically between the two is not wrapped: - * - * <----S----B----> - * [---) = F wrapped past B (and UINT_MAX) - * [---) = F not wrapped - * [----] = F wrapped past B - * - * When the boundary is numerically less than the starting point (i.e. the - * UINT_MAX wraparound occurs somewhere in between) then all values in - * between are wrapped: - * - * <----B----S----> - * [---) = F not wrapped past B (but wrapped past UINT_MAX) - * [---) = F wrapped past B (and UINT_MAX) - * [----] = F not wrapped - *----------------------------------------------------------------------- - */ - if (start < boundary) - return finish >= boundary || finish < start; - else - return finish >= boundary && finish < start; -} - /* * Find the starting offset of the given MultiXactId. * @@ -2990,8 +2831,9 @@ MultiXactMemberFreezeThreshold(void) * we try to eliminate from the system is based on how far we are past * MULTIXACT_MEMBER_SAFE_THRESHOLD. 
*/ - fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / - (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); + fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD); + fraction /= (double) (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); + victim_multixacts = multixacts * fraction; /* fraction could be > 1.0, but lowest possible freeze age is zero */ @@ -3041,10 +2883,10 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data static void PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) { - const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); - int startsegment = MXOffsetToMemberSegment(oldestOffset); - int endsegment = MXOffsetToMemberSegment(newOldestOffset); - int segment = startsegment; + const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); + int64 startsegment = MXOffsetToMemberSegment(oldestOffset); + int64 endsegment = MXOffsetToMemberSegment(newOldestOffset); + int64 segment = startsegment; /* * Delete all the segments but the last one. 
The last segment can still @@ -3337,7 +3179,7 @@ MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) { - int32 diff = (int32) (offset1 - offset2); + int64 diff = (int64) (offset1 - offset2); return (diff < 0); } diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 985cd06802..1af2ce4b93 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -264,7 +264,7 @@ main(int argc, char *argv[]) case 'O': errno = 0; - set_mxoff = strtoul(optarg, &endptr, 0); + set_mxoff = strtou64(optarg, &endptr, 0); if (endptr == optarg || *endptr != '\0' || errno != 0) { pg_log_error("invalid argument for option %s", "-O"); diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl index 9829e48106..f8a8eef44d 100644 --- a/src/bin/pg_resetwal/t/001_basic.pl +++ b/src/bin/pg_resetwal/t/001_basic.pl @@ -206,7 +206,7 @@ push @cmd, sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1])); @files = get_slru_files('pg_multixact/offsets'); -$mult = 32 * $blcksz / 4; +$mult = 32 * $blcksz / 8; # -m argument is "new,old" push @cmd, '-m', sprintf("%d,%d", diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 7ffd256c74..90583634ec 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -27,7 +27,7 @@ #define MultiXactIdIsValid(multi) ((multi) != InvalidMultiXactId) -#define MaxMultiXactOffset ((MultiXactOffset) 0xFFFFFFFF) +#define MaxMultiXactOffset UINT64CONST(0xFFFFFFFFFFFFFFFF) /* * Possible multixact lock modes ("status"). 
The first four modes are for diff --git a/src/include/c.h b/src/include/c.h index dc1841346c..ccfb82b478 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -661,7 +661,7 @@ typedef uint32 SubTransactionId; /* MultiXactId must be equivalent to TransactionId, to fit in t_xmax */ typedef TransactionId MultiXactId; -typedef uint32 MultiXactOffset; +typedef uint64 MultiXactOffset; typedef uint32 CommandId; -- 2.45.2
From 95226756a225ca6b95e2baafff502034c355310d Mon Sep 17 00:00:00 2001 From: Maxim Orlov <orlovmg@gmail.com> Date: Wed, 7 Aug 2024 16:35:22 +0300 Subject: [PATCH v1 1/3] Use 64-bit format output for multixact offsets Author: Maxim Orlov <orlovmg@gmail.com> --- src/backend/access/rmgrdesc/mxactdesc.c | 9 ++++---- src/backend/access/rmgrdesc/xlogdesc.c | 4 ++-- src/backend/access/transam/multixact.c | 26 +++++++++++++---------- src/backend/access/transam/xlogrecovery.c | 5 +++-- src/bin/pg_controldata/pg_controldata.c | 4 ++-- src/bin/pg_resetwal/pg_resetwal.c | 8 +++---- 6 files changed, 31 insertions(+), 25 deletions(-) diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 3e8ad4d5ef..1b486de38c 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -65,8 +65,8 @@ multixact_desc(StringInfo buf, XLogReaderState *record) xl_multixact_create *xlrec = (xl_multixact_create *) rec; int i; - appendStringInfo(buf, "%u offset %u nmembers %d: ", xlrec->mid, - xlrec->moff, xlrec->nmembers); + appendStringInfo(buf, "%u offset %llu nmembers %d: ", xlrec->mid, + (unsigned long long) xlrec->moff, xlrec->nmembers); for (i = 0; i < xlrec->nmembers; i++) out_member(buf, &xlrec->members[i]); } @@ -74,9 +74,10 @@ multixact_desc(StringInfo buf, XLogReaderState *record) { xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; - appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)", + appendStringInfo(buf, "offsets [%u, %u), members [%llu, %llu)", xlrec->startTruncOff, xlrec->endTruncOff, - xlrec->startTruncMemb, xlrec->endTruncMemb); + (unsigned long long) xlrec->startTruncMemb, + (unsigned long long) xlrec->endTruncMemb); } } diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 363294d623..aaa19c81c8 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -66,7 +66,7 @@ xlog_desc(StringInfo 
buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%X; " - "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " + "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %llu; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", @@ -79,7 +79,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) XidFromFullTransactionId(checkpoint->nextXid), checkpoint->nextOid, checkpoint->nextMulti, - checkpoint->nextMultiOffset, + (unsigned long long) checkpoint->nextMultiOffset, checkpoint->oldestXid, checkpoint->oldestXidDB, checkpoint->oldestMulti, diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index c601ff98a1..57c5148933 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -1258,7 +1258,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) LWLockRelease(MultiXactGenLock); - debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); + debug_elog4(DEBUG2, "GetNew: returning %u offset %llu", result, + (unsigned long long) *offset); return result; } @@ -2285,8 +2286,9 @@ MultiXactGetCheckptMulti(bool is_shutdown, LWLockRelease(MultiXactGenLock); debug_elog6(DEBUG2, - "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", - *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); + "MultiXact: checkpoint is nextMulti %u, nextOffset %llu, oldestMulti %u in DB %u", + *nextMulti, (unsigned long long) *nextMultiOffset, *oldestMulti, + *oldestMultiDB); } /* @@ -2320,8 +2322,8 @@ void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset) { - debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", - nextMulti, nextMultiOffset); + debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %llu", + nextMulti, (unsigned 
long long) nextMultiOffset); LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->nextMXact = nextMulti; MultiXactState->nextOffset = nextMultiOffset; @@ -2511,8 +2513,8 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, } if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) { - debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", - minMultiOffset); + debug_elog3(DEBUG2, "MultiXact: setting next offset to %llu", + (unsigned long long) minMultiOffset); MultiXactState->nextOffset = minMultiOffset; } LWLockRelease(MultiXactGenLock); @@ -3203,11 +3205,12 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) elog(DEBUG1, "performing multixact truncation: " "offsets [%u, %u), offsets segments [%llx, %llx), " - "members [%u, %u), members segments [%llx, %llx)", + "members [%llu, %llu), members segments [%llx, %llx)", oldestMulti, newOldestMulti, (unsigned long long) MultiXactIdToOffsetSegment(oldestMulti), (unsigned long long) MultiXactIdToOffsetSegment(newOldestMulti), - oldestOffset, newOldestOffset, + (unsigned long long) oldestOffset, + (unsigned long long) newOldestOffset, (unsigned long long) MXOffsetToMemberSegment(oldestOffset), (unsigned long long) MXOffsetToMemberSegment(newOldestOffset)); @@ -3463,11 +3466,12 @@ multixact_redo(XLogReaderState *record) elog(DEBUG1, "replaying multixact truncation: " "offsets [%u, %u), offsets segments [%llx, %llx), " - "members [%u, %u), members segments [%llx, %llx)", + "members [%llu, %llu), members segments [%llx, %llx)", xlrec.startTruncOff, xlrec.endTruncOff, (unsigned long long) MultiXactIdToOffsetSegment(xlrec.startTruncOff), (unsigned long long) MultiXactIdToOffsetSegment(xlrec.endTruncOff), - xlrec.startTruncMemb, xlrec.endTruncMemb, + (unsigned long long) xlrec.startTruncMemb, + (unsigned long long) xlrec.endTruncMemb, (unsigned long long) MXOffsetToMemberSegment(xlrec.startTruncMemb), (unsigned long long) MXOffsetToMemberSegment(xlrec.endTruncMemb)); diff --git 
a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index ad817fbca6..388037a94b 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -877,8 +877,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, U64FromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, - (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", - checkPoint.nextMulti, checkPoint.nextMultiOffset))); + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %llu", + checkPoint.nextMulti, + (unsigned long long) checkPoint.nextMultiOffset))); ereport(DEBUG1, (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", checkPoint.oldestXid, checkPoint.oldestXidDB))); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 93a05d80ca..43b6727570 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -253,8 +253,8 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile->checkPointCopy.nextMulti); - printf(_("Latest checkpoint's NextMultiOffset: %u\n"), - ControlFile->checkPointCopy.nextMultiOffset); + printf(_("Latest checkpoint's NextMultiOffset: %llu\n"), + (unsigned long long) ControlFile->checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile->checkPointCopy.oldestXid); printf(_("Latest checkpoint's oldestXID's DB: %u\n"), diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index e9dcb5a6d8..985cd06802 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -737,8 +737,8 @@ PrintControlValues(bool guessed) ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), ControlFile.checkPointCopy.nextMulti); - printf(_("Latest checkpoint's 
NextMultiOffset: %u\n"), - ControlFile.checkPointCopy.nextMultiOffset); + printf(_("Latest checkpoint's NextMultiOffset: %llu\n"), + (unsigned long long) ControlFile.checkPointCopy.nextMultiOffset); printf(_("Latest checkpoint's oldestXID: %u\n"), ControlFile.checkPointCopy.oldestXid); printf(_("Latest checkpoint's oldestXID's DB: %u\n"), @@ -809,8 +809,8 @@ PrintNewControlValues(void) if (set_mxoff != -1) { - printf(_("NextMultiOffset: %u\n"), - ControlFile.checkPointCopy.nextMultiOffset); + printf(_("NextMultiOffset: %llu\n"), + (unsigned long long) ControlFile.checkPointCopy.nextMultiOffset); } if (set_oid != 0) -- 2.45.2
From 063ec2662d94f7a72e3162702c4051f34cd67000 Mon Sep 17 00:00:00 2001 From: Maxim Orlov <m.orlov@postgrespro.ru> Date: Tue, 13 Aug 2024 14:44:50 +0300 Subject: [PATCH v1 3/3] Make pg_upgrade convert multixact offsets. Author: Maxim Orlov <orlovmg@gmail.com> --- src/bin/pg_upgrade/Makefile | 1 + src/bin/pg_upgrade/meson.build | 1 + src/bin/pg_upgrade/pg_upgrade.c | 29 ++- src/bin/pg_upgrade/pg_upgrade.h | 13 +- src/bin/pg_upgrade/segresize.c | 350 +++++++++++++++++++++++++++++++ src/include/catalog/catversion.h | 2 +- 6 files changed, 391 insertions(+), 5 deletions(-) create mode 100644 src/bin/pg_upgrade/segresize.c diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index bde91e2beb..030816596f 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -21,6 +21,7 @@ OBJS = \ info.o \ option.o \ parallel.o \ + segresize.o \ pg_upgrade.o \ relfilenumber.o \ server.o \ diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index 9825fa3305..2d9f7e6b65 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -10,6 +10,7 @@ pg_upgrade_sources = files( 'info.c', 'option.c', 'parallel.c', + 'segresize.c', 'pg_upgrade.c', 'relfilenumber.c', 'server.c', diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 663235816f..d9d8d0ea78 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -750,7 +750,30 @@ copy_xact_xlog_xid(void) if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) { - copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); + /* + * If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER + * it must have 32-bit multixid offsets, thus it should be converted. 
+ */ + if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER) + { + uint64 oldest_offset = convert_multixact_offsets(); + + if (oldest_offset) + { + uint64 next_offset = old_cluster.controldata.chkpnt_nxtmxoff; + + /* Handle possible wraparound. */ + if (next_offset < oldest_offset) + next_offset += ((uint64) 1 << 32) - 1; + + next_offset -= oldest_offset - 1; + old_cluster.controldata.chkpnt_nxtmxoff = next_offset; + } + } + else + copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets"); + copy_subdir_files("pg_multixact/members", "pg_multixact/members"); prep_status("Setting next multixact ID and offset for new cluster"); @@ -760,9 +783,9 @@ copy_xact_xlog_xid(void) * counters here and the oldest multi present on system. */ exec_prog(UTILITY_LOG_FILE, NULL, true, true, - "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", + "\"%s/pg_resetwal\" -O %llu -m %u,%u \"%s\"", new_cluster.bindir, - old_cluster.controldata.chkpnt_nxtmxoff, + (unsigned long long) old_cluster.controldata.chkpnt_nxtmxoff, old_cluster.controldata.chkpnt_nxtmulti, old_cluster.controldata.chkpnt_oldstMulti, new_cluster.pgdata); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index cdb6e2b759..37d173cb86 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -114,6 +114,13 @@ extern char *output_files[]; */ #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 +/* + * Swicth from 32-bit to 64-bit for multixid offsets. + * + * XXX: should be changed to the actual CATALOG_VERSION_NO on commit. 
+ */ +#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 202408123 + /* * large object chunk size added to pg_controldata, * commit 5f93c37805e7485488480916b4585e098d3cc883 @@ -230,7 +237,7 @@ typedef struct uint32 chkpnt_nxtepoch; uint32 chkpnt_nxtoid; uint32 chkpnt_nxtmulti; - uint32 chkpnt_nxtmxoff; + uint64 chkpnt_nxtmxoff; uint32 chkpnt_oldstMulti; uint32 chkpnt_oldstxid; uint32 align; @@ -494,3 +501,7 @@ void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr char *old_pgdata, char *new_pgdata, char *old_tablespace); bool reap_child(bool wait_for_child); + +/* segresize.c */ + +uint64 convert_multixact_offsets(void); diff --git a/src/bin/pg_upgrade/segresize.c b/src/bin/pg_upgrade/segresize.c new file mode 100644 index 0000000000..e47c0a2407 --- /dev/null +++ b/src/bin/pg_upgrade/segresize.c @@ -0,0 +1,350 @@ +/* + * segresize.c + * + * SLRU segment resize utility + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * src/bin/pg_upgrade/segresize.c + */ + +#include "postgres_fe.h" + +#include "pg_upgrade.h" +#include "access/multixact.h" + +/* See slru.h */ +#define SLRU_PAGES_PER_SEGMENT 32 + +/* + * Some kind of iterator associated with a particular SLRU segment. The idea is + * to specify the segment and page number and then move through the pages. + */ +typedef struct SlruSegState +{ + char *dir; + char *fn; + FILE *file; + int64 segno; + uint64 pageno; + bool leading_gap; + bool long_segment_names; +} SlruSegState; + +/* + * Get SLRU segmen file name from state. + * + * NOTE: this function should mirror SlruFileName call. 
+ */ +static inline char * +SlruFileName(SlruSegState *state) +{ + if (state->long_segment_names) + { + Assert(state->segno >= 0 && + state->segno <= INT64CONST(0xFFFFFFFFFFFFFFF)); + return psprintf("%s/%015llX", state->dir, (long long) state->segno); + } + else + { + Assert(state->segno >= 0 && + state->segno <= INT64CONST(0xFFFFFF)); + return psprintf("%s/%04X", state->dir, (unsigned int) state->segno); + } +} + +/* + * Create SLRU segment file. + */ +static void +create_segment(SlruSegState *state) +{ + Assert(state->fn == NULL); + Assert(state->file == NULL); + + state->fn = SlruFileName(state); + state->file = fopen(state->fn, "wb"); + if (!state->file) + pg_fatal("could not create file \"%s\": %m", state->fn); +} + +/* + * Open existing SLRU segment file. + */ +static void +open_segment(SlruSegState *state) +{ + Assert(state->fn == NULL); + Assert(state->file == NULL); + + state->fn = SlruFileName(state); + state->file = fopen(state->fn, "rb"); + if (!state->file) + pg_fatal("could not open file \"%s\": %m", state->fn); +} + +/* + * Close SLRU segment file. + */ +static void +close_segment(SlruSegState *state) +{ + if (state->file) + { + fclose(state->file); + state->file = NULL; + } + + if (state->fn) + { + pfree(state->fn); + state->fn = NULL; + } +} + +/* + * Read next page from the old 32-bit offset segment file. + */ +static int +read_old_segment_page(SlruSegState *state, void *buf, bool *empty) +{ + int len; + + /* Open next segment file, if needed. */ + if (!state->fn) + { + if (!state->segno) + state->leading_gap = true; + + open_segment(state); + + /* Set position to the needed page. */ + if (state->pageno > 0 && + fseek(state->file, state->pageno * BLCKSZ, SEEK_SET)) + { + close_segment(state); + } + } + + if (state->file) + { + /* Segment file do exists, read page from it. */ + state->leading_gap = false; + + len = fread(buf, sizeof(char), BLCKSZ, state->file); + + /* Are we done or was there an error? 
*/ + if (len <= 0) + { + if (ferror(state->file)) + pg_fatal("error reading file \"%s\": %m", state->fn); + + if (feof(state->file)) + { + *empty = true; + len = -1; + + close_segment(state); + } + } + else + *empty = false; + } + else if (!state->leading_gap) + { + /* We reached the last segment. */ + len = -1; + *empty = true; + } + else + { + /* Skip few first segments if they were frozen and removed. */ + len = BLCKSZ; + *empty = true; + } + + if (++state->pageno >= SLRU_PAGES_PER_SEGMENT) + { + /* Start a new segment. */ + state->segno++; + state->pageno = 0; + + close_segment(state); + } + + return len; +} + +/* + * Write next page to the new 64-bit offset segment file. + */ +static void +write_new_segment_page(SlruSegState *state, void *buf) +{ + /* + * Create a new segment file if we still didn't. Creation is + * postponed until the first non-empty page is found. This helps + * not to create completely empty segments. + */ + if (!state->file) + { + create_segment(state); + + /* Write zeroes to the previously skipped prefix. */ + if (state->pageno > 0) + { + char zerobuf[BLCKSZ] = {0}; + + for (int64 i = 0; i < state->pageno; i++) + { + if (fwrite(zerobuf, sizeof(char), BLCKSZ, state->file) != BLCKSZ) + pg_fatal("could not write file \"%s\": %m", state->fn); + } + } + } + + /* Write page to the new segment (if it was created). */ + if (state->file) + { + if (fwrite(buf, sizeof(char), BLCKSZ, state->file) != BLCKSZ) + pg_fatal("could not write file \"%s\": %m", state->fn); + } + + state->pageno++; + + /* + * Did we reach the maximum page number? Then close segment file + * and create a new one on the next iteration. + */ + if (state->pageno >= SLRU_PAGES_PER_SEGMENT) + { + state->segno++; + state->pageno = 0; + close_segment(state); + } +} + +/* + * Convert pg_multixact/offsets segments and return oldest multi offset. 
+ */ +uint64 +convert_multixact_offsets(void) +{ + /* See multixact.c */ +#define MULTIXACT_OFFSETS_PER_PAGE_OLD (BLCKSZ / sizeof(uint32)) +#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) + + SlruSegState oldseg = {0}, + newseg = {0}; + uint32 oldbuf[MULTIXACT_OFFSETS_PER_PAGE_OLD] = {0}; + MultiXactOffset newbuf[MULTIXACT_OFFSETS_PER_PAGE] = {0}; + /* + * It is much easier to deal with multi wraparound in 64 bitd format. Thus + * we use 64 bits for multi-transactions, although they remain 32 bits. + */ + uint64 oldest_multi = old_cluster.controldata.chkpnt_oldstMulti, + next_multi = old_cluster.controldata.chkpnt_nxtmulti, + multi, + old_entry, + new_entry; + bool found = false; + uint64 oldest_offset = 0; + + prep_status("Converting pg_multixact/offsets to 64-bit"); + + oldseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE_OLD; + oldseg.segno = oldseg.pageno / SLRU_PAGES_PER_SEGMENT; + oldseg.pageno %= SLRU_PAGES_PER_SEGMENT; + oldseg.dir = psprintf("%s/pg_multixact/offsets", old_cluster.pgdata); + oldseg.long_segment_names = false; /* old format XXXX */ + + newseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE; + newseg.segno = newseg.pageno / SLRU_PAGES_PER_SEGMENT; + newseg.pageno %= SLRU_PAGES_PER_SEGMENT; + newseg.dir = psprintf("%s/pg_multixact/offsets", new_cluster.pgdata); + newseg.long_segment_names = true; + + old_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE_OLD; + new_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE; + + if (next_multi < oldest_multi) + next_multi += (uint64) 1 << 32; /* wraparound */ + + for (multi = oldest_multi; multi < next_multi; old_entry = 0) + { + int oldlen; + bool empty; + + /* Handle possible segment wraparound. */ + if (oldseg.segno > MaxMultiXactId / + MULTIXACT_OFFSETS_PER_PAGE_OLD / + SLRU_PAGES_PER_SEGMENT) + oldseg.segno = 0; + + /* Read old offset segment. 
*/ + oldlen = read_old_segment_page(&oldseg, oldbuf, &empty); + if (oldlen <= 0 || empty) + pg_fatal("cannot read page %llu from file \"%s\": %m", + (unsigned long long) oldseg.pageno, oldseg.fn); + + /* Fill possible gap. */ + if (oldlen < BLCKSZ) + memset((char *) oldbuf + oldlen, 0, BLCKSZ - oldlen); + + /* Save oldest multi offset */ + if (!found) + { + oldest_offset = oldbuf[old_entry]; + found = true; + } + + /* ... skip wrapped-around invalid multi */ + if (multi == (uint64) 1 << 32) + { + Assert(oldseg.segno == 0); + Assert(oldseg.pageno == 1); + Assert(old_entry == 0); + + multi += FirstMultiXactId; + old_entry = FirstMultiXactId; + } + + /* Copy entries to the new page. */ + for (; multi < next_multi && old_entry < MULTIXACT_OFFSETS_PER_PAGE_OLD; + multi++, old_entry++) + { + MultiXactOffset offset = oldbuf[old_entry]; + + /* Handle possible offset wraparound. */ + if (offset < oldest_offset) + offset += ((uint64) 1 << 32) - 1; + + /* Subtract oldest_offset, so new offsets will start from 1. */ + newbuf[new_entry++] = offset - oldest_offset + 1; + if (new_entry >= MULTIXACT_OFFSETS_PER_PAGE) + { + /* Write a new page. */ + write_new_segment_page(&newseg, newbuf); + new_entry = 0; + } + } + } + + /* Write the last incomplete page. */ + if (new_entry > 0 || oldest_multi == next_multi) + { + memset(&newbuf[new_entry], 0, + sizeof(newbuf[0]) * (MULTIXACT_OFFSETS_PER_PAGE - new_entry)); + write_new_segment_page(&newseg, newbuf); + } + + /* Release resources. */ + close_segment(&oldseg); + close_segment(&newseg); + + pfree(oldseg.dir); + pfree(newseg.dir); + + check_ok(); + + return oldest_offset; +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 9a0ae27823..f29dc9fc92 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202408122 +#define CATALOG_VERSION_NO 202408123 #endif -- 2.45.2