Hi,

while investigating some checksum-related issues, I needed to perform some forensics on a copy of a btree page (taken from another instance using 'dd').

But I've run into two pageinspect limitations, hopefully addressed by this patch:

1) bt_page_items(bytea) not defined

We have heap_page_items(bytea) but not bt_page_items(bytea). I suspect this is partially a historical API inconsistency, and partially due to the need to handle the btree metapage explicitly.

The original function simply threw an error for blkno=0; the new function instead checks for the BTP_META flag on the page.

I believe this is sufficient, assuming an instance without data corruption (which pageinspect assumes anyway). With data corruption all bets are off anyway - for example the metapage might be written to a different block (essentially what I saw in the investigated issue). Actually, the flag check is better in this case - it detects the metapage, while the blkno=0 check fails to do that (leading to a crash).

2) page_checksum()

When everything is fine, you can do page_header() which also includes the checksum. When the checksum gets broken, you may still dump the page using 'dd+pg_read_binary_file' to see the header, but clearly that checksum is wrong - and it's interesting to see the correct one and compare it to the checksum in the header.

This function makes it possible - it accepts the bytea image of the page, and blkno (so it's possible to see how the block would look if it was written somewhere else, for example).


BTW I've noticed the pageinspect version is 1.6, but we only have pageinspect--1.5.sql (and upgrade script to 1.6). Not sure that's entirely intentional?

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
>From 499a7bd9aea3032f03d787833c0501d9fa703271 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <to...@2ndquadrant.com>
Date: Mon, 13 Feb 2017 21:20:12 +0100
Subject: [PATCH] pageinspect - page_checksum and bt_page_items(bytea)

Adds two functions to the pageinspect extension:

1) page_checksum(bytea,int4) allows computing checksum for
arbitrary page, even if data checksums are not enabled

2) bt_page_items(bytea) is similar to heap_page_items(bytea)
---
 contrib/pageinspect/btreefuncs.c              | 209 +++++++++++++++++++++++---
 contrib/pageinspect/expected/btree.out        |  13 ++
 contrib/pageinspect/pageinspect--1.5--1.6.sql |  22 +++
 contrib/pageinspect/rawpage.c                 |  37 +++++
 contrib/pageinspect/sql/btree.sql             |   4 +
 doc/src/sgml/pageinspect.sgml                 |  58 +++++++
 src/include/access/nbtree.h                   |   1 +
 7 files changed, 320 insertions(+), 24 deletions(-)

diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c
index d50ec3a..93da844 100644
--- a/contrib/pageinspect/btreefuncs.c
+++ b/contrib/pageinspect/btreefuncs.c
@@ -39,8 +39,14 @@
 #include "utils/varlena.h"
 
 
+extern Datum bt_metap(PG_FUNCTION_ARGS);
+extern Datum bt_page_items(PG_FUNCTION_ARGS);
+extern Datum bt_page_items_bytea(PG_FUNCTION_ARGS);
+extern Datum bt_page_stats(PG_FUNCTION_ARGS);
+
 PG_FUNCTION_INFO_V1(bt_metap);
 PG_FUNCTION_INFO_V1(bt_page_items);
+PG_FUNCTION_INFO_V1(bt_page_items_bytea);
 PG_FUNCTION_INFO_V1(bt_page_stats);
 
 #define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX)
@@ -215,17 +221,31 @@ bt_page_stats(PG_FUNCTION_ARGS)
 		elog(ERROR, "return type must be a row type");
 
 	j = 0;
-	values[j++] = psprintf("%d", stat.blkno);
-	values[j++] = psprintf("%c", stat.type);
-	values[j++] = psprintf("%d", stat.live_items);
-	values[j++] = psprintf("%d", stat.dead_items);
-	values[j++] = psprintf("%d", stat.avg_item_size);
-	values[j++] = psprintf("%d", stat.page_size);
-	values[j++] = psprintf("%d", stat.free_size);
-	values[j++] = psprintf("%d", stat.btpo_prev);
-	values[j++] = psprintf("%d", stat.btpo_next);
-	values[j++] = psprintf("%d", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
-	values[j++] = psprintf("%d", stat.btpo_flags);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.blkno);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%c", stat.type);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.live_items);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.dead_items);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.avg_item_size);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.page_size);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.free_size);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.btpo_prev);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.btpo_next);
+	values[j] = palloc(32);
+	if (stat.type == 'd')
+		snprintf(values[j++], 32, "%d", stat.btpo.xact);
+	else
+		snprintf(values[j++], 32, "%d", stat.btpo.level);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", stat.btpo_flags);
 
 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
 								   values);
@@ -361,13 +381,18 @@ bt_page_items(PG_FUNCTION_ARGS)
 		itup = (IndexTuple) PageGetItem(uargs->page, id);
 
 		j = 0;
-		values[j++] = psprintf("%d", uargs->offset);
-		values[j++] = psprintf("(%u,%u)",
-							   BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid)),
-							   itup->t_tid.ip_posid);
-		values[j++] = psprintf("%d", (int) IndexTupleSize(itup));
-		values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f');
-		values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%d", uargs->offset);
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "(%u,%u)",
+				 BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid)),
+				 itup->t_tid.ip_posid);
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%d", (int) IndexTupleSize(itup));
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%c", IndexTupleHasNulls(itup) ? 't' : 'f');
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
 
 		ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
 		dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
@@ -396,6 +421,136 @@ bt_page_items(PG_FUNCTION_ARGS)
 	}
 }
 
+/*-------------------------------------------------------
+ * bt_page_items_bytea()
+ *
+ * Get IndexTupleData set in a btree page
+ *
+ * Usage: SELECT * FROM bt_page_items(get_raw_page('t1_pkey', 1));
+ *-------------------------------------------------------
+ */
+
+Datum
+bt_page_items_bytea(PG_FUNCTION_ARGS)
+{
+	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
+	Datum		result;
+	char	   *values[6];
+	HeapTuple	tuple;
+	FuncCallContext *fctx;
+	struct user_args *uargs;
+	int			raw_page_size;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 (errmsg("must be superuser to use pageinspect functions"))));
+
+	raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		BTPageOpaque opaque;
+		MemoryContext mctx;
+		TupleDesc	tupleDesc;
+
+		if (raw_page_size < SizeOfPageHeaderData)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				  errmsg("input page too small (%d bytes)", raw_page_size)));
+
+		fctx = SRF_FIRSTCALL_INIT();
+		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+
+		uargs = palloc(sizeof(struct user_args));
+
+		uargs->page = VARDATA(raw_page);
+
+		uargs->offset = FirstOffsetNumber;
+
+		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
+
+		if (P_ISMETA(opaque))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				  errmsg("block is a meta page")));
+
+		if (P_ISDELETED(opaque))
+			elog(NOTICE, "page is deleted");
+
+		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+
+		/* Build a tuple descriptor for our result type */
+		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc);
+
+		fctx->user_fctx = uargs;
+
+		MemoryContextSwitchTo(mctx);
+	}
+
+	fctx = SRF_PERCALL_SETUP();
+	uargs = fctx->user_fctx;
+
+	if (fctx->call_cntr < fctx->max_calls)
+	{
+		ItemId		id;
+		IndexTuple	itup;
+		int			j;
+		int			off;
+		int			dlen;
+		char	   *dump;
+		char	   *ptr;
+
+		id = PageGetItemId(uargs->page, uargs->offset);
+
+		if (!ItemIdIsValid(id))
+			elog(ERROR, "invalid ItemId");
+
+		itup = (IndexTuple) PageGetItem(uargs->page, id);
+		j = 0;
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%d", uargs->offset);
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "(%u,%u)",
+				 BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid)),
+				 itup->t_tid.ip_posid);
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%d", (int) IndexTupleSize(itup));
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%c", IndexTupleHasNulls(itup) ? 't' : 'f');
+		values[j] = palloc(32);
+		snprintf(values[j++], 32, "%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');
+
+		ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
+		dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
+		dump = palloc0(dlen * 3 + 1);
+
+		values[j] = dump;
+		for (off = 0; off < dlen; off++)
+		{
+			if (off > 0)
+				*dump++ = ' ';
+			sprintf(dump, "%02x", *(ptr + off) & 0xff);
+			dump += 2;
+		}
+
+		tuple = BuildTupleFromCStrings(fctx->attinmeta, values);
+		result = HeapTupleGetDatum(tuple);
+
+		uargs->offset = uargs->offset + 1;
+
+		SRF_RETURN_NEXT(fctx, result);
+	}
+	else
+	{
+		pfree(uargs);
+		SRF_RETURN_DONE(fctx);
+	}
+}
+
 
 /* ------------------------------------------------
  * bt_metap()
@@ -453,12 +608,18 @@ bt_metap(PG_FUNCTION_ARGS)
 		elog(ERROR, "return type must be a row type");
 
 	j = 0;
-	values[j++] = psprintf("%d", metad->btm_magic);
-	values[j++] = psprintf("%d", metad->btm_version);
-	values[j++] = psprintf("%d", metad->btm_root);
-	values[j++] = psprintf("%d", metad->btm_level);
-	values[j++] = psprintf("%d", metad->btm_fastroot);
-	values[j++] = psprintf("%d", metad->btm_fastlevel);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", metad->btm_magic);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", metad->btm_version);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", metad->btm_root);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", metad->btm_level);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", metad->btm_fastroot);
+	values[j] = palloc(32);
+	snprintf(values[j++], 32, "%d", metad->btm_fastlevel);
 
 	tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
 								   values);
diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out
index 82a49e3..67b103a 100644
--- a/contrib/pageinspect/expected/btree.out
+++ b/contrib/pageinspect/expected/btree.out
@@ -42,4 +42,17 @@ data       | 01 00 00 00 00 00 00 01
 
 SELECT * FROM bt_page_items('test1_a_idx', 2);
 ERROR:  block number out of range
+SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 0));
+ERROR:  block is a meta page
+SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 1));
+-[ RECORD 1 ]-----------------------
+itemoffset | 1
+ctid       | (0,1)
+itemlen    | 16
+nulls      | f
+vars       | f
+data       | 01 00 00 00 00 00 00 01
+
+SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 2));
+ERROR:  block number 2 is out of range for relation "test1_a_idx"
 DROP TABLE test1;
diff --git a/contrib/pageinspect/pageinspect--1.5--1.6.sql b/contrib/pageinspect/pageinspect--1.5--1.6.sql
index ac39568..df17fe6 100644
--- a/contrib/pageinspect/pageinspect--1.5--1.6.sql
+++ b/contrib/pageinspect/pageinspect--1.5--1.6.sql
@@ -75,3 +75,25 @@ CREATE FUNCTION hash_metapage_info(IN page bytea,
     OUT mapp int8[])
 AS 'MODULE_PATHNAME', 'hash_metapage_info'
 LANGUAGE C STRICT PARALLEL SAFE;
+
+--
+-- page_checksum()
+--
+CREATE FUNCTION page_checksum(IN page bytea, IN blkno int4)
+RETURNS smallint
+AS 'MODULE_PATHNAME', 'page_checksum'
+LANGUAGE C STRICT;
+
+--
+-- bt_page_items_bytea()
+--
+CREATE FUNCTION bt_page_items(IN page bytea,
+    OUT itemoffset smallint,
+    OUT ctid tid,
+    OUT itemlen smallint,
+    OUT nulls bool,
+    OUT vars bool,
+    OUT data text)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'bt_page_items_bytea'
+LANGUAGE C STRICT;
diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c
index 102f360..0605989 100644
--- a/contrib/pageinspect/rawpage.c
+++ b/contrib/pageinspect/rawpage.c
@@ -24,6 +24,7 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/checksum.h"
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/rel.h"
@@ -275,3 +276,39 @@ page_header(PG_FUNCTION_ARGS)
 
 	PG_RETURN_DATUM(result);
 }
+
+/*
+ * page_checksum
+ *
+ * Compute the checksum of a raw page, as if it were stored at block blkno
+ */
+
+PG_FUNCTION_INFO_V1(page_checksum);
+
+Datum
+page_checksum(PG_FUNCTION_ARGS)
+{
+	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
+	uint32		blkno = PG_GETARG_INT32(1);
+	int			raw_page_size;
+	PageHeader	page;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 (errmsg("must be superuser to use raw page functions"))));
+
+	raw_page_size = VARSIZE(raw_page) - VARHDRSZ;
+
+	/*
+	 * Check that the supplied page is of the right size.
+	 */
+	if (raw_page_size != BLCKSZ)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("incorrect size of input page (%d bytes)", raw_page_size)));
+
+	page = (PageHeader) VARDATA(raw_page);
+
+	PG_RETURN_INT16(pg_checksum_page((char *)page, blkno));
+}
diff --git a/contrib/pageinspect/sql/btree.sql b/contrib/pageinspect/sql/btree.sql
index 1eafc32..8eac64c 100644
--- a/contrib/pageinspect/sql/btree.sql
+++ b/contrib/pageinspect/sql/btree.sql
@@ -14,4 +14,8 @@ SELECT * FROM bt_page_items('test1_a_idx', 0);
 SELECT * FROM bt_page_items('test1_a_idx', 1);
 SELECT * FROM bt_page_items('test1_a_idx', 2);
 
+SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 0));
+SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 1));
+SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 2));
+
 DROP TABLE test1;
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 5e6712f..da5dd37 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -84,6 +84,33 @@ test=# SELECT * FROM page_header(get_raw_page('pg_class', 0));
 
    <varlistentry>
     <term>
+     <function>page_checksum(page bytea, blkno int4) returns smallint</function>
+     <indexterm>
+      <primary>page_checksum</primary>
+     </indexterm>
+    </term>
+
+    <listitem>
+     <para>
+      <function>page_checksum</function> computes a checksum for the page, as if
+      it was located at the given block.
+     </para>
+
+     <para>
+      A page image obtained with <function>get_raw_page</function> should be
+      passed as argument.  For example:
+<screen>
+test=# SELECT page_checksum(get_raw_page('pg_class', 0), 100);
+ page_checksum
+---------------
+         13443
+</screen>
+    </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term>
      <function>heap_page_items(page bytea) returns setof record</function>
      <indexterm>
       <primary>heap_page_items</primary>
@@ -290,6 +317,37 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1);
      </para>
     </listitem>
    </varlistentry>
+
+   <varlistentry>
+    <term>
+     <function>bt_page_items(page bytea) returns setof record</function>
+     <indexterm>
+      <primary>bt_page_items</primary>
+     </indexterm>
+    </term>
+
+    <listitem>
+     <para>
+      Similarly to <function>heap_page_items</function>, it is also possible to
+      pass the page to <function>bt_page_items</function> as a <type>bytea</>
+      value. So the last example may also be rewritten like this:
+<screen>
+test=# SELECT * FROM bt_page_items(get_raw_page('pg_cast_oid_index', 1));
+ itemoffset |  ctid   | itemlen | nulls | vars |    data
+------------+---------+---------+-------+------+-------------
+          1 | (0,1)   |      12 | f     | f    | 23 27 00 00
+          2 | (0,2)   |      12 | f     | f    | 24 27 00 00
+          3 | (0,3)   |      12 | f     | f    | 25 27 00 00
+          4 | (0,4)   |      12 | f     | f    | 26 27 00 00
+          5 | (0,5)   |      12 | f     | f    | 27 27 00 00
+          6 | (0,6)   |      12 | f     | f    | 28 27 00 00
+          7 | (0,7)   |      12 | f     | f    | 29 27 00 00
+          8 | (0,8)   |      12 | f     | f    | 2a 27 00 00
+</screen>
+      All the other details are the same as explained in the previous section.
+     </para>
+    </listitem>
+   </varlistentry>
   </variablelist>
  </sect2>
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 6289ffa..f202715 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -177,6 +177,7 @@ typedef struct BTMetaPageData
 #define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
 #define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
 #define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
+#define P_ISMETA(opaque)		((opaque)->btpo_flags & BTP_META)
 #define P_ISHALFDEAD(opaque)	((opaque)->btpo_flags & BTP_HALF_DEAD)
 #define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
 #define P_HAS_GARBAGE(opaque)	((opaque)->btpo_flags & BTP_HAS_GARBAGE)
-- 
2.5.5

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to