Attached is the new version of patch that addresses the comments from Andrey and Beena.
On Wed, Jul 29, 2020 at 10:07 AM Ashutosh Sharma <ashu.coe...@gmail.com> wrote: > Hi Beena, > > Thanks for the review. > > 1. We would be marking buffer dirty and writing wal even if we have >> not done any changes( ex if we pass invalid/dead tids). Maybe we can >> handle this better? >> > > Yeah, we can skip this when nothing has changed. Will take care of it in > the next version of patch. > > >> cosmetic changes >> 1. Maybe "HTupleSurgicalOption" instead of "HTupleForceOption" and >> also the variable names could use surgery instead. >> > > I think that looks fine. I would rather prefer the word "Force" just > because all the enum options contain the word "Force" in it. > > >> 2. extension comment pg_surgery.control "extension to perform surgery >> the damaged heap table" -> "extension to perform surgery on the >> damaged heap table" >> > > Okay, will fix that typo. > > -- > With Regards, > Ashutosh Sharma > EnterpriseDB:http://www.enterprisedb.com >
From e78c6fc6a99bba7f38482864a36dc812b0798464 Mon Sep 17 00:00:00 2001 From: ashu <ashutosh.sha...@enterprisedb.com> Date: Fri, 31 Jul 2020 17:24:50 +0530 Subject: [PATCH] Add contrib/pg_surgery to perform surgery on the damaged heap tables. This contrib module basically adds a couple of functions named heap_force_kill and heap_force_freeze that can be used in the scripts to perform surgery on the damaged heap tables. Ashutosh Sharma. --- contrib/Makefile | 1 + contrib/pg_surgery/Makefile | 23 ++ contrib/pg_surgery/expected/pg_surgery.out | 109 +++++++++ contrib/pg_surgery/heap_surgery_funcs.c | 378 +++++++++++++++++++++++++++++ contrib/pg_surgery/pg_surgery--1.0.sql | 14 ++ contrib/pg_surgery/pg_surgery.control | 5 + contrib/pg_surgery/sql/pg_surgery.sql | 72 ++++++ 7 files changed, 602 insertions(+) create mode 100644 contrib/pg_surgery/Makefile create mode 100644 contrib/pg_surgery/expected/pg_surgery.out create mode 100644 contrib/pg_surgery/heap_surgery_funcs.c create mode 100644 contrib/pg_surgery/pg_surgery--1.0.sql create mode 100644 contrib/pg_surgery/pg_surgery.control create mode 100644 contrib/pg_surgery/sql/pg_surgery.sql diff --git a/contrib/Makefile b/contrib/Makefile index 1846d41..07d5734 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -35,6 +35,7 @@ SUBDIRS = \ pg_standby \ pg_stat_statements \ pg_trgm \ + pg_surgery \ pgcrypto \ pgrowlocks \ pgstattuple \ diff --git a/contrib/pg_surgery/Makefile b/contrib/pg_surgery/Makefile new file mode 100644 index 0000000..baf2a88 --- /dev/null +++ b/contrib/pg_surgery/Makefile @@ -0,0 +1,23 @@ +# contrib/pg_surgery/Makefile + +MODULE_big = pg_surgery +OBJS = \ + $(WIN32RES) \ + heap_surgery_funcs.o + +EXTENSION = pg_surgery +DATA = pg_surgery--1.0.sql +PGFILEDESC = "pg_surgery - perform surgery on the damaged heap table" + +REGRESS = pg_surgery + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_surgery +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_surgery/expected/pg_surgery.out b/contrib/pg_surgery/expected/pg_surgery.out new file mode 100644 index 0000000..98419b0 --- /dev/null +++ b/contrib/pg_surgery/expected/pg_surgery.out @@ -0,0 +1,109 @@ +create extension pg_surgery; +-- +-- test pg_surgery functions with all the supported relations. Should pass. +-- +-- normal heap table. +begin; +create table htab(a int); +insert into htab values (100), (200), (300), (400), (500); +select * from htab where xmin = 2; + a +--- +(0 rows) + +select heap_force_freeze('htab'::regclass, ARRAY['(0, 4)']::tid[]); + heap_force_freeze +------------------- + +(1 row) + +select ctid, xmax from htab where xmin = 2; + ctid | xmax +-------+------ + (0,4) | 0 +(1 row) + +select heap_force_kill('htab'::regclass, ARRAY['(0, 4)']::tid[]); + heap_force_kill +----------------- + +(1 row) + +select * from htab where ctid = '(0, 4)'; + a +--- +(0 rows) + +rollback; +-- toast table. +begin; +create table ttab(a text); +insert into ttab select string_agg(chr(floor(random() * 26)::int + 65), '') from generate_series(1,10000); +select * from ttab where xmin = 2; + a +--- +(0 rows) + +select heap_force_freeze('ttab'::regclass, ARRAY['(0, 1)']::tid[]); + heap_force_freeze +------------------- + +(1 row) + +select ctid, xmax from ttab where xmin = 2; + ctid | xmax +-------+------ + (0,1) | 0 +(1 row) + +select heap_force_kill('ttab'::regclass, ARRAY['(0, 1)']::tid[]); + heap_force_kill +----------------- + +(1 row) + +select * from ttab where ctid = '(0, 1)'; + a +--- +(0 rows) + +rollback; +-- +-- test pg_surgery functions with the unsupported relations. Should fail. +-- +-- partitioned tables (the parent table) doesn't contain any tuple. +create table ptab (a int) partition by list (a); +select heap_force_kill('ptab'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +select heap_force_freeze('ptab'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +create index ptab_idx on ptab (a); +-- indexes are not supported. so, all these should fail. +select heap_force_kill('ptab_idx'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +select heap_force_freeze('ptab_idx'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +create view vw as select 1; +-- views are not supported as well. so, all these should fail. +select heap_force_kill('vw'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +select heap_force_freeze('vw'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +create materialized view mvw as select 10; +-- materialized views are not supported as well. so, all these should fail. +select heap_force_kill('mvw'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: "mvw" is not a table or TOAST table +select heap_force_freeze('mvw'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: "mvw" is not a table or TOAST table +create sequence seq; +-- sequences are not supported as well. so, all these functions should fail. +select heap_force_kill('seq'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +select heap_force_freeze('seq'::regclass, ARRAY['(0, 1)']::tid[]); +ERROR: only heap AM is supported +-- cleanup. +drop materialized view mvw; +drop table ptab; +drop view vw; +drop sequence seq; +drop extension pg_surgery; diff --git a/contrib/pg_surgery/heap_surgery_funcs.c b/contrib/pg_surgery/heap_surgery_funcs.c new file mode 100644 index 0000000..93f5d36 --- /dev/null +++ b/contrib/pg_surgery/heap_surgery_funcs.c @@ -0,0 +1,378 @@ +/*------------------------------------------------------------------------- + * + * heap_surgery_funcs.c + * Functions to perform surgery on the damaged heap table. + * + * Copyright (c) 2020, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/pg_surgery/heap_surgery_funcs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "catalog/pg_am_d.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +/* + * Macros for accessing the item pointer array. Taken from the intarray contrib + * header. + */ +#define ARRPTR(x) ((ItemPointer) ARR_DATA_PTR(x)) +#define ARRNELEMS(x) ArrayGetNItems(ARR_NDIM(x), ARR_DIMS(x)) + +/* Reject arrays we can't handle. */ +#define CHECKARRVALID(x) \ + do { \ + if (ARR_HASNULL(x) && array_contains_nulls(x)) \ + ereport(ERROR, \ + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), \ + errmsg("array must not contain nulls"))); \ + } while(0) + +#define ARRISEMPTY(x) (ARRNELEMS(x) == 0) + +/* Sort the elements of the array. */ +#define SORT(x) \ + do { \ + int _nelems_ = ARRNELEMS(x); \ + if (_nelems_ > 1) \ + qsort((void*) ARRPTR(x), _nelems_, sizeof(ItemPointerData), \ + tidcmp); \ + } while(0) + +/* Options to forcefully change the state of a heap tuple. */ +typedef enum HTupleForceOption +{ + FORCE_KILL, + FORCE_FREEZE +} HTupleForceOption; + +PG_FUNCTION_INFO_V1(heap_force_kill); +PG_FUNCTION_INFO_V1(heap_force_freeze); + +static int32 tidcmp(const void *a, const void *b); +static Datum heap_force_common(FunctionCallInfo fcinfo, + HTupleForceOption force_opt); +static void check_relation_relkind(Relation rel); +static void verify_tid(ItemPointer tid, BlockNumber *blkno, + OffsetNumber *offno); +static BlockNumber tids_same_page_fetch_offnums(ItemPointer tids, int ntids, + OffsetNumber *next_start_ptr, + OffsetNumber *offnos); + +/*------------------------------------------------------------------------- + * heap_force_kill() + * + * Force kill the tuple(s) pointed to by the item pointer(s) stored in the + * given tid array. + * + * Usage: SELECT heap_force_kill(regclass, tid[]); + *------------------------------------------------------------------------- + */ +Datum +heap_force_kill(PG_FUNCTION_ARGS) +{ + return heap_force_common(fcinfo, FORCE_KILL); +} + +/*------------------------------------------------------------------------- + * heap_force_freeze() + * + * Force freeze the tuple(s) pointed to by the item pointer(s) stored in the + * given tid array. + * + * Usage: SELECT heap_force_freeze(regclass, tid[]); + *------------------------------------------------------------------------- + */ +Datum +heap_force_freeze(PG_FUNCTION_ARGS) +{ + return heap_force_common(fcinfo, FORCE_FREEZE); +} + +/*------------------------------------------------------------------------- + * heap_force_common() + * + * Common code for heap_force_kill and heap_force_freeze + *------------------------------------------------------------------------- + */ +static Datum +heap_force_common(FunctionCallInfo fcinfo, HTupleForceOption force_opt) +{ + Oid relid = PG_GETARG_OID(0); + ArrayType *ta = PG_GETARG_ARRAYTYPE_P_COPY(1); + ItemPointer tids; + int ntids; + Relation rel; + Buffer buf; + Page page; + ItemId itemid; + BlockNumber blkno; + OffsetNumber *offnos; + OffsetNumber offno, + noffs, + curr_start_ptr, + next_start_ptr, + maxoffset; + int i, + nskippedItems; + + Assert(force_opt == FORCE_KILL || force_opt == FORCE_FREEZE); + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("%s cannot be executed during recovery.", + force_opt == FORCE_KILL ? "heap_force_kill()" : + "heap_force_freeze()"))); + + /* Basic sanity checking. */ + CHECKARRVALID(ta); + + if (ARRISEMPTY(ta)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("empty tid array"))); + + rel = relation_open(relid, AccessShareLock); + + if (!superuser() && GetUserId() != rel->rd_rel->relowner) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser or object owner to use %s.", + force_opt == FORCE_KILL ? "heap_force_kill()" : + "heap_force_freeze()"))); + + if (rel->rd_rel->relam != HEAP_TABLE_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only heap AM is supported"))); + + check_relation_relkind(rel); + + SORT(ta); + + tids = ARRPTR(ta); + ntids = ARRNELEMS(ta); + + offnos = (OffsetNumber *) palloc(ntids * sizeof(OffsetNumber)); + noffs = curr_start_ptr = next_start_ptr = 0; + + do + { + /* + * Get the offset numbers from the tids belonging to one particular page + * and process them one by one. + */ + blkno = tids_same_page_fetch_offnums(tids, ntids, &next_start_ptr, + offnos); + + CHECK_FOR_INTERRUPTS(); + + buf = ReadBuffer(rel, blkno); + LockBufferForCleanup(buf); + + page = BufferGetPage(buf); + + /* Calculate the number of offsets stored in offnos array. */ + noffs = next_start_ptr - curr_start_ptr; + + maxoffset = PageGetMaxOffsetNumber(page); + + if (noffs > maxoffset) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of offsets specified for block %u exceeds the max offset number %u", + blkno, maxoffset))); + + nskippedItems = 0; + + /* No ereport(ERROR) from here until all the changes are logged. */ + START_CRIT_SECTION(); + + for (i = 0; i < noffs; i++) + { + itemid = PageGetItemId(page, offnos[i]); + + /* Follow any redirections until we find something useful. */ + while (ItemIdIsRedirected(itemid)) + { + offno = ItemIdGetRedirect(itemid); + itemid = PageGetItemId(page, offno); + CHECK_FOR_INTERRUPTS(); + } + + /* Nothing to do if the itemid is unused or already dead. */ + if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) + { + nskippedItems++; + ereport(NOTICE, + (errmsg("skipping tid (%u, %u) because it is already marked %s", + blkno, offnos[i], + ItemIdIsDead(itemid) ? "dead" : "unused"))); + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + if (force_opt == FORCE_KILL) + ItemIdSetDead(itemid); + else if (force_opt == FORCE_FREEZE) + { + HeapTupleHeader htup; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + HeapTupleHeaderSetXmin(htup, FrozenTransactionId); + HeapTupleHeaderSetXmax(htup, InvalidTransactionId); + + /* + * Clear all the visibility-related bits of this tuple and mark + * it as frozen. + */ + htup->t_infomask &= ~HEAP_XACT_MASK; + htup->t_infomask |= (HEAP_XMIN_FROZEN | HEAP_XMAX_INVALID); + } + else + { + /* shouldn't happen. */ + elog(ERROR, "invalid heap tuple force option: %d\n", force_opt); + } + } + + if (nskippedItems == noffs) + goto skip_wal; + + /* Mark buffer dirty before we write WAL. */ + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + log_newpage_buffer(buf, true); + +skip_wal: + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + /* + * Update the current start pointer before fetching next set of offset + * numbers from the tids belonging to the same page. + */ + curr_start_ptr = next_start_ptr; + } while (next_start_ptr != ntids); + + relation_close(rel, AccessShareLock); + + pfree(ta); + pfree(offnos); + + PG_RETURN_VOID(); +} + +/*------------------------------------------------------------------------- + * tidcmp() + * + * Compare two item pointers, return -1, 0, or +1. + * + * See ItemPointerCompare for details. + * ------------------------------------------------------------------------ + */ +static int32 +tidcmp(const void *a, const void *b) +{ + ItemPointer iptr1 = ((const ItemPointer) a); + ItemPointer iptr2 = ((const ItemPointer) b); + + return ItemPointerCompare(iptr1, iptr2); +} + +/*------------------------------------------------------------------------- + * check_relation_relkind() + * + * Check if the given relation is of the relkind supported by the callers. + * ------------------------------------------------------------------------ + */ +static void +check_relation_relkind(Relation rel) +{ + if (rel->rd_rel->relkind != RELKIND_RELATION && + rel->rd_rel->relkind != RELKIND_TOASTVALUE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a table or TOAST table", + RelationGetRelationName(rel)))); +} + +/*------------------------------------------------------------------------- + * verify_tid() + * + * Verify the given item pointer. + * ------------------------------------------------------------------------ + */ +static void +verify_tid(ItemPointer tid, BlockNumber *blkno, OffsetNumber *offno) +{ + if (!ItemPointerIsValid(tid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid item pointer"))); + + *blkno = ItemPointerGetBlockNumberNoCheck(tid); + *offno = ItemPointerGetOffsetNumberNoCheck(tid); + + if (!BlockNumberIsValid(*blkno)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid block number"))); + + if (!OffsetNumberIsValid(*offno)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid offset number"))); +} + +/*------------------------------------------------------------------------- + * tids_same_page_fetch_offnums() + * + * Find out all the tids residing in the same page as tids[next_start_ptr] and + * fetch the offset number stored in each of them into a caller-allocated offset + * number array. + * ------------------------------------------------------------------------ + */ +static BlockNumber +tids_same_page_fetch_offnums(ItemPointer tids, int ntids, + OffsetNumber *next_start_ptr, OffsetNumber *offnos) +{ + int i; + BlockNumber prev_blkno, + blkno; + OffsetNumber offno; + + for (i = *next_start_ptr; i < ntids; i++) + { + ItemPointerData tid = tids[i]; + + verify_tid(&tid, &blkno, &offno); + + if (i == *next_start_ptr || (prev_blkno == blkno)) + offnos[i - *next_start_ptr] = offno; + else + break; + + prev_blkno = blkno; + } + + *next_start_ptr = i; + return prev_blkno; +} \ No newline at end of file diff --git a/contrib/pg_surgery/pg_surgery--1.0.sql b/contrib/pg_surgery/pg_surgery--1.0.sql new file mode 100644 index 0000000..0878be6 --- /dev/null +++ b/contrib/pg_surgery/pg_surgery--1.0.sql @@ -0,0 +1,14 @@ +/* contrib/pg_surgery/pg_surgery--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_surgery" to load this file. \quit + +CREATE FUNCTION heap_force_kill(reloid regclass, tids tid[]) +RETURNS VOID +AS 'MODULE_PATHNAME', 'heap_force_kill' +LANGUAGE C STRICT; + +CREATE FUNCTION heap_force_freeze(reloid regclass, tids tid[]) +RETURNS VOID +AS 'MODULE_PATHNAME', 'heap_force_freeze' +LANGUAGE C STRICT; \ No newline at end of file diff --git a/contrib/pg_surgery/pg_surgery.control b/contrib/pg_surgery/pg_surgery.control new file mode 100644 index 0000000..e042ad4 --- /dev/null +++ b/contrib/pg_surgery/pg_surgery.control @@ -0,0 +1,5 @@ +# pg_surgery extension +comment = 'extension to perform surgery on the damaged heap table' +default_version = '1.0' +module_pathname = '$libdir/pg_surgery' +relocatable = true diff --git a/contrib/pg_surgery/sql/pg_surgery.sql b/contrib/pg_surgery/sql/pg_surgery.sql new file mode 100644 index 0000000..936e28b --- /dev/null +++ b/contrib/pg_surgery/sql/pg_surgery.sql @@ -0,0 +1,72 @@ +create extension pg_surgery; + +-- +-- test pg_surgery functions with all the supported relations. Should pass. +-- + +-- normal heap table. +begin; +create table htab(a int); +insert into htab values (100), (200), (300), (400), (500); + +select * from htab where xmin = 2; +select heap_force_freeze('htab'::regclass, ARRAY['(0, 4)']::tid[]); +select ctid, xmax from htab where xmin = 2; + +select heap_force_kill('htab'::regclass, ARRAY['(0, 4)']::tid[]); +select * from htab where ctid = '(0, 4)'; +rollback; + +-- toast table. +begin; +create table ttab(a text); +insert into ttab select string_agg(chr(floor(random() * 26)::int + 65), '') from generate_series(1,10000); + +select * from ttab where xmin = 2; +select heap_force_freeze('ttab'::regclass, ARRAY['(0, 1)']::tid[]); +select ctid, xmax from ttab where xmin = 2; + +select heap_force_kill('ttab'::regclass, ARRAY['(0, 1)']::tid[]); +select * from ttab where ctid = '(0, 1)'; +rollback; + +-- +-- test pg_surgery functions with the unsupported relations. Should fail. +-- + +-- partitioned tables (the parent table) doesn't contain any tuple. +create table ptab (a int) partition by list (a); + +select heap_force_kill('ptab'::regclass, ARRAY['(0, 1)']::tid[]); +select heap_force_freeze('ptab'::regclass, ARRAY['(0, 1)']::tid[]); + +create index ptab_idx on ptab (a); + +-- indexes are not supported. so, all these should fail. +select heap_force_kill('ptab_idx'::regclass, ARRAY['(0, 1)']::tid[]); +select heap_force_freeze('ptab_idx'::regclass, ARRAY['(0, 1)']::tid[]); + +create view vw as select 1; + +-- views are not supported as well. so, all these should fail. +select heap_force_kill('vw'::regclass, ARRAY['(0, 1)']::tid[]); +select heap_force_freeze('vw'::regclass, ARRAY['(0, 1)']::tid[]); + +create materialized view mvw as select 10; + +-- materialized views are not supported as well. so, all these should fail. +select heap_force_kill('mvw'::regclass, ARRAY['(0, 1)']::tid[]); +select heap_force_freeze('mvw'::regclass, ARRAY['(0, 1)']::tid[]); + +create sequence seq; + +-- sequences are not supported as well. so, all these functions should fail. +select heap_force_kill('seq'::regclass, ARRAY['(0, 1)']::tid[]); +select heap_force_freeze('seq'::regclass, ARRAY['(0, 1)']::tid[]); + +-- cleanup. +drop materialized view mvw; +drop table ptab; +drop view vw; +drop sequence seq; +drop extension pg_surgery; -- 1.8.3.1