Hi, First look at this patch, so I might be repeating stuff already commented on / discussed.
On 2022-03-17 13:25:35 +0530, Bharath Rupireddy wrote: > +-- > +-- pg_get_raw_wal_record() > +-- > +CREATE FUNCTION pg_get_raw_wal_record(IN in_lsn pg_lsn, > + OUT start_lsn pg_lsn, > + OUT end_lsn pg_lsn, > + OUT prev_lsn pg_lsn, > + OUT record_length int4, > + OUT record bytea > +) > +AS 'MODULE_PATHNAME', 'pg_get_raw_wal_record' > +LANGUAGE C CALLED ON NULL INPUT PARALLEL SAFE; What is raw about the function? Why "CALLED ON NULL INPUT"? It doesn't make sense to call the function with a NULL lsn, does it? Also, that's the default, why is it restated here? > +REVOKE EXECUTE ON FUNCTION pg_get_raw_wal_record(pg_lsn) FROM PUBLIC; > +GRANT EXECUTE ON FUNCTION pg_get_raw_wal_record(pg_lsn) TO > pg_read_server_files; > + > +-- > +-- pg_get_wal_record_info() > +-- > +CREATE FUNCTION pg_get_wal_record_info(IN in_lsn pg_lsn, > + OUT start_lsn pg_lsn, > + OUT end_lsn pg_lsn, > + OUT prev_lsn pg_lsn, > + OUT xid xid, > + OUT resource_manager text, > + OUT record_length int4, > + OUT fpi_length int4, > + OUT description text, > + OUT block_ref text, > + OUT data_length int4, > + OUT data bytea > +) > +AS 'MODULE_PATHNAME', 'pg_get_wal_record_info' > +LANGUAGE C CALLED ON NULL INPUT PARALLEL SAFE; > + > +REVOKE EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn) FROM PUBLIC; > +GRANT EXECUTE ON FUNCTION pg_get_wal_record_info(pg_lsn) TO pg_monitor; I don't think it's appropriate for pg_monitor to see all the data in the WAL. > +-- > +-- pg_get_wal_stats() > +-- > +CREATE FUNCTION pg_get_wal_stats(IN start_lsn pg_lsn, > + IN end_lsn pg_lsn DEFAULT NULL, > + OUT resource_manager text, > + OUT count int8, > + OUT count_percentage float4, > + OUT record_length int8, > + OUT record_length_percentage float4, > + OUT fpi_length int8, > + OUT fpi_length_percentage float4 > + ) > +RETURNS SETOF record AS $$ > +SELECT resource_manager, > + count(*) AS cnt, > + CASE WHEN count(*) > 0 THEN (count(*) * 100 / sum(count(*)) OVER > total)::numeric(5,2) ELSE 0 END AS "count_%", > + sum(record_length) AS trecl, > + CASE WHEN sum(record_length) > 0 THEN (sum(record_length) * 100 / > sum(sum(record_length)) OVER total)::numeric(5,2) ELSE 0 END AS "trecl_%", > + sum(fpi_length) AS tfpil, > + CASE WHEN sum(fpi_length) > 0 THEN (sum(fpi_length) * 100 / > sum(sum(fpi_length)) OVER total)::numeric(5,2) ELSE 0 END AS "tfpil_%" > +FROM pg_get_wal_records_info(start_lsn, end_lsn) > +GROUP BY resource_manager > +WINDOW total AS (); > +$$ LANGUAGE SQL CALLED ON NULL INPUT PARALLEL SAFE; This seems like an exceedingly expensive way to compute this. Not just because of doing the grouping, window etc, but also because it's serializing the "data" field from pg_get_wal_records_info() just to never use it. With any appreciatable amount of data the return value pg_get_wal_records_info() will be serialized into a on-disk tuplestore. This is probably close to an order of magnitude slower than pg_waldump --stats. Which imo renders this largely useless. The column names don't seem great either. "tfpil"? > +/* > + * Module load callback. > + */ > +void > +_PG_init(void) > +{ > + /* Define custom GUCs and install hooks here, if any. */ > + > + /* > + * Have EmitWarningsOnPlaceholders("pg_walinspect"); if custom GUCs are > + * defined. > + */ > +} > + > +/* > + * Module unload callback. > + */ > +void > +_PG_fini(void) > +{ > + /* Uninstall hooks, if any. */ > +} Why have this stuff if it's not used? > +/* > + * Validate given LSN and return the LSN up to which the server has WAL. > + */ > +static XLogRecPtr > +ValidateInputLSN(XLogRecPtr lsn) > +{ > + XLogRecPtr curr_lsn; > + > + /* Validate input WAL LSN. */ > + if (XLogRecPtrIsInvalid(lsn)) > + ereport(ERROR, > + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("invalid WAL LSN"))); > + > + /* > + * We determine the current LSN of the server similar to how page_read > + * callback read_local_xlog_page does. > + */ > + if (!RecoveryInProgress()) > + curr_lsn = GetFlushRecPtr(NULL); > + else > + curr_lsn = GetXLogReplayRecPtr(NULL); > + > + Assert(!XLogRecPtrIsInvalid(curr_lsn)); > + > + if (lsn >= curr_lsn) > + { > + /* > + * GetFlushRecPtr or GetXLogReplayRecPtr gives "end+1" LSN of > the last > + * record flushed or replayed respectively. But let's use the > LSN up > + * to "end" in user facing message. > + */ > + ereport(ERROR, > + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("cannot accept future input LSN"), > + errdetail("Last known WAL LSN on the database > system is %X/%X.", > + LSN_FORMAT_ARGS(curr_lsn - > 1)))); > + } > + return curr_lsn; > +} > + > +/* > + * Validate given start LSN and end LSN, return the new end LSN in case user > + * hasn't specified one. > + */ > +static XLogRecPtr > +ValidateStartAndEndLSNs(XLogRecPtr start_lsn, XLogRecPtr end_lsn) > +{ > + XLogRecPtr curr_lsn; > + > + /* Validate WAL start LSN. */ > + if (XLogRecPtrIsInvalid(start_lsn)) > + ereport(ERROR, > + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("invalid WAL start LSN"))); > + > + if (!RecoveryInProgress()) > + curr_lsn = GetFlushRecPtr(NULL); > + else > + curr_lsn = GetXLogReplayRecPtr(NULL); > + > + if (start_lsn >= curr_lsn) > + { > + /* > + * GetFlushRecPtr or GetXLogReplayRecPtr gives "end+1" LSN of > the last > + * record flushed or replayed respectively. But let's use the > LSN up > + * to "end" in user facing message. > + */ > + ereport(ERROR, > + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("cannot accept future start LSN"), > + errdetail("Last known WAL LSN on the database > system is %X/%X.", > + LSN_FORMAT_ARGS(curr_lsn - > 1)))); > + } > + /* > + * If end_lsn is specified, let's ensure that it's not a future LSN i.e. > + * something the database system doesn't know about. > + */ > + if (!XLogRecPtrIsInvalid(end_lsn) && > + (end_lsn >= curr_lsn)) > + { > + /* > + * GetFlushRecPtr or GetXLogReplayRecPtr gives "end+1" LSN of > the last > + * record flushed or replayed respectively. But let's use the > LSN up > + * to "end" in user facing message. > + */ > + ereport(ERROR, > + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("cannot accept future end LSN"), > + errdetail("Last known WAL LSN on the database > system is %X/%X.", > + LSN_FORMAT_ARGS(curr_lsn - > 1)))); > + } > + > + /* > + * When end_lsn is not specified let's read up to the last WAL position > + * known to be on the server. > + */ > + if (XLogRecPtrIsInvalid(end_lsn)) > + end_lsn = curr_lsn; > + > + Assert(!XLogRecPtrIsInvalid(end_lsn)); > + > + if (start_lsn >= end_lsn) > + ereport(ERROR, > + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), > + errmsg("WAL start LSN must be less than end > LSN"))); > + > + return end_lsn; > +} These two functions are largely redundant, that doesn't seem great. > +Datum > +pg_get_raw_wal_record(PG_FUNCTION_ARGS) > +{ > +#define PG_GET_RAW_WAL_RECORD_COLS 5 > + XLogRecPtr lsn; > + XLogRecord *record; > + XLogRecPtr first_record; > + XLogReaderState *xlogreader; > + bytea *raw_record; > + uint32 rec_len; > + char *raw_record_data; > + TupleDesc tupdesc; > + Datum result; > + HeapTuple tuple; > + Datum values[PG_GET_RAW_WAL_RECORD_COLS]; > + bool nulls[PG_GET_RAW_WAL_RECORD_COLS]; > + int i = 0; > + > + lsn = PG_GETARG_LSN(0); > + > + (void) ValidateInputLSN(lsn); > + > + /* Build a tuple descriptor for our result type. */ > + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) > + elog(ERROR, "return type must be a row type"); > + > + xlogreader = InitXLogReaderState(lsn, &first_record); > + > + Assert(xlogreader); > + > + record = ReadNextXLogRecord(xlogreader, first_record); > + > + rec_len = XLogRecGetTotalLen(xlogreader); > + > + Assert(rec_len > 0); > + Most of this has another copy in pg_get_wal_record_info(). Can more of this be deduplicated? > +/* > + * Get WAL record info. > + */ > +static void > +GetXLogRecordInfo(XLogReaderState *record, XLogRecPtr lsn, > + Datum *values, bool *nulls, uint32 ncols) > +{ > + const char *id; > + const RmgrData *desc; > + uint32 fpi_len = 0; > + RelFileNode rnode; > + ForkNumber forknum; > + BlockNumber blk; > + int block_id; > + StringInfoData rec_desc; > + StringInfoData rec_blk_ref; > + StringInfoData temp; > + bytea *data; > + char *main_data; > + uint32 main_data_len; > + int i = 0; > + > + desc = &RmgrTable[XLogRecGetRmid(record)]; > + initStringInfo(&rec_desc); > + id = desc->rm_identify(XLogRecGetInfo(record)); > + > + if (id == NULL) > + appendStringInfo(&rec_desc, "UNKNOWN (%x) ", > XLogRecGetInfo(record) & ~XLR_INFO_MASK); > + else > + appendStringInfo(&rec_desc, "%s ", id); > + > + initStringInfo(&temp); > + desc->rm_desc(&temp, record); > + appendStringInfo(&rec_desc, "%s", temp.data); > + pfree(temp.data); > + initStringInfo(&rec_blk_ref); This seems unnecessarily wasteful. You serialize into one stringinfo, just to then copy that stringinfo into another stringinfo. Just to then allocate yet another stringinfo. > + /* Block references (detailed format). */ This comment seems copied from pg_waldump, but doesn't make sense here, because there's no short format. > + for (block_id = 0; block_id <= record->max_block_id; block_id++) > + { > + if (!XLogRecHasBlockRef(record, block_id)) > + continue; > + > + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); > + > + if (forknum != MAIN_FORKNUM) > + appendStringInfo(&rec_blk_ref, > + "blkref #%u: rel > %u/%u/%u fork %s blk %u", > + block_id, > rnode.spcNode, rnode.dbNode, > + rnode.relNode, > get_forkname(forknum), blk); > + else > + appendStringInfo(&rec_blk_ref, > + "blkref #%u: rel > %u/%u/%u blk %u", > + block_id, > rnode.spcNode, rnode.dbNode, > + rnode.relNode, blk); > + > + if (XLogRecHasBlockImage(record, block_id)) > + { > + uint8 bimg_info = > record->blocks[block_id].bimg_info; > + > + /* Calculate the amount of FPI data in the record. */ > + fpi_len += record->blocks[block_id].bimg_len; > + > + if (BKPIMAGE_COMPRESSED(bimg_info)) > + { > + const char *method; > + > + if ((bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0) > + method = "pglz"; > + else if ((bimg_info & BKPIMAGE_COMPRESS_LZ4) != > 0) > + method = "lz4"; > + else > + method = "unknown"; > + > + appendStringInfo(&rec_blk_ref, " (FPW%s); hole: > offset: %u, length: %u, " > + "compression > saved: %u, method: %s", > + > XLogRecBlockImageApply(record, block_id) ? > + "" : " for WAL > verification", > + > record->blocks[block_id].hole_offset, > + > record->blocks[block_id].hole_length, > + BLCKSZ - > + > record->blocks[block_id].hole_length - > + > record->blocks[block_id].bimg_len, > + method); > + } > + else > + { > + appendStringInfo(&rec_blk_ref, " (FPW%s); hole: > offset: %u, length: %u", > + > XLogRecBlockImageApply(record, block_id) ? > + "" : " for WAL > verification", > + > record->blocks[block_id].hole_offset, > + > record->blocks[block_id].hole_length); > + } > + } > + } To me duplicating this much code from waldump seems like a bad idea from a maintainability POV. > +/* > + * Get info and data of all WAL records between start LSN and end LSN. > + */ > +static void > +GetWALRecordsInfoInternal(FunctionCallInfo fcinfo, XLogRecPtr start_lsn, > + XLogRecPtr end_lsn) > +{ > +#define PG_GET_WAL_RECORDS_INFO_COLS 11 > + XLogRecPtr first_record; > + XLogReaderState *xlogreader; > + ReturnSetInfo *rsinfo; > + TupleDesc tupdesc; > + Tuplestorestate *tupstore; > + MemoryContext per_query_ctx; > + MemoryContext oldcontext; > + Datum values[PG_GET_WAL_RECORDS_INFO_COLS]; > + bool nulls[PG_GET_WAL_RECORDS_INFO_COLS]; > + > + rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; > + > + /* Check to see if caller supports us returning a tuplestore. */ > + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) > + ereport(ERROR, > + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), > + errmsg("set-valued function called in context > that cannot accept a set"))); > + if (!(rsinfo->allowedModes & SFRM_Materialize)) > + ereport(ERROR, > + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), > + errmsg("materialize mode required, but it is > not allowed in this context"))); > + > + /* Build a tuple descriptor for our result type. */ > + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) > + elog(ERROR, "return type must be a row type"); > + > + /* Build tuplestore to hold the result rows. */ > + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; > + oldcontext = MemoryContextSwitchTo(per_query_ctx); > + tupstore = tuplestore_begin_heap(true, false, work_mem); > + rsinfo->returnMode = SFRM_Materialize; > + rsinfo->setResult = tupstore; > + rsinfo->setDesc = tupdesc; This should likely use the infrastructure introduced in 5b81703787bfc1e6072c8e37125eba0c5598b807. > + for (;;) > + { > + (void) ReadNextXLogRecord(xlogreader, first_record); > + > + /* > + * Let's not show the record info if it is spanning more than > the > + * end_lsn. EndRecPtr is "end+1" of the last read record, hence > + * use "end" here. > + */ > + if ((xlogreader->EndRecPtr - 1) <= end_lsn) > + { > + GetXLogRecordInfo(xlogreader, xlogreader->currRecPtr, > values, nulls, > + > PG_GET_WAL_RECORDS_INFO_COLS); > + > + tuplestore_putvalues(tupstore, tupdesc, values, nulls); > + } > + > + /* Exit loop if read up to end_lsn. */ > + if (xlogreader->EndRecPtr >= end_lsn) > + break; Seems weird to have both of these conditions separately. Greetings, Andres Freund