On 10/03/2019 18:40, Andrey Borodin wrote:
Here's new version of the patch for actual page deletion.
Changes:
1. Fixed possible concurrency issue:
We were locking child page while holding lock on internal page. Notes
in GiST README recommend locking child before parent. Thus now we
unlock internal page before locking children for deletion, and lock
it again, check that everything is still on it's place and delete
only then.
Good catch. The implementation is a bit broken, though. This segfaults:
create table gist_point_tbl(id int4, p point);
create index gist_pointidx on gist_point_tbl using gist(p);
insert into gist_point_tbl (id, p)
select g, point((1+g) % 100, (1+g) % 100) from generate_series(1,
10000) g;
insert into gist_point_tbl (id, p)
select -g, point(1000, 1000) from generate_series(1, 10000) g;
delete from gist_point_tbl where id >= 0;
vacuum verbose gist_point_tbl;
BTW, we don't seem to have test coverage for this in the regression
tests. The tests that delete some rows, where you updated the comment,
doesn't actually seem to produce any empty pages that could be deleted.
One thing still bothers me. Let's assume that we have internal page
with 2 deletable leaves. We lock these leaves in order of items on
internal page. Is it possible that 2nd page have follow-right link on
1st and someone will lock 2nd page, try to lock 1st and deadlock with
VACUUM?
Hmm. If the follow-right flag is set on a page, it means that its right
sibling doesn't have a downlink in the parent yet. Nevertheless, I think
I'd sleep better, if we acquired the locks in left-to-right order, to be
safe.
An easy cop-out would be to use LWLockConditionalAcquire, and bail out
if we can't get the lock. But if it's not too complicated, it'd be nice
to acquire the locks in the correct order to begin with.
I did a round of cleanup and moving things around, before I bumped into
the above issue. I'm including them here as separate patches, for easier
review, but they should of course be squashed into yours. For
convenience, I'm including your patches here as well, so that all the
patches you need are in the same place, but they are identical to what
you posted.
- Heikki
>From a09f14de32f3040b19238d239b7f025f345e940b Mon Sep 17 00:00:00 2001
From: Andrey <amboro...@acm.org>
Date: Fri, 8 Mar 2019 21:19:32 +0500
Subject: [PATCH 1/6] Add block set data structure
Currently we have Bitmapset which works only with 32 bit ints.
Furthermore it is not very space-efficient in case of sparse
bitmap.
In this commit we invent block set: statically typed radix tree
for keeping BlockNumbers. This still can be improved in many ways
applicable to bitmaps, most notable in-pointer lists can be used.
But for the sake of code simplicity it is now plain in it's
ctructure.
The block set is introduced to support efficient page deletion in
GiST's VACUUM.
---
src/backend/lib/Makefile | 4 +-
src/backend/lib/blockset.c | 201 ++++++++++++++++++
src/include/lib/blockset.h | 24 +++
src/test/modules/test_blockset/.gitignore | 4 +
src/test/modules/test_blockset/Makefile | 21 ++
src/test/modules/test_blockset/README | 8 +
.../test_blockset/expected/test_blockset.out | 7 +
.../test_blockset/sql/test_blockset.sql | 3 +
.../test_blockset/test_blockset--1.0.sql | 8 +
.../modules/test_blockset/test_blockset.c | 182 ++++++++++++++++
.../test_blockset/test_blockset.control | 4 +
11 files changed, 464 insertions(+), 2 deletions(-)
create mode 100644 src/backend/lib/blockset.c
create mode 100644 src/include/lib/blockset.h
create mode 100644 src/test/modules/test_blockset/.gitignore
create mode 100644 src/test/modules/test_blockset/Makefile
create mode 100644 src/test/modules/test_blockset/README
create mode 100644 src/test/modules/test_blockset/expected/test_blockset.out
create mode 100644 src/test/modules/test_blockset/sql/test_blockset.sql
create mode 100644 src/test/modules/test_blockset/test_blockset--1.0.sql
create mode 100644 src/test/modules/test_blockset/test_blockset.c
create mode 100644 src/test/modules/test_blockset/test_blockset.control
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
index 191ea9bca26..9601894f07c 100644
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/lib
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = binaryheap.o bipartite_match.o bloomfilter.o dshash.o hyperloglog.o \
- ilist.o knapsack.o pairingheap.o rbtree.o stringinfo.o
+OBJS = binaryheap.o bipartite_match.o blockset.o bloomfilter.o dshash.o \
+ hyperloglog.o ilist.o knapsack.o pairingheap.o rbtree.o stringinfo.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/blockset.c b/src/backend/lib/blockset.c
new file mode 100644
index 00000000000..c07b2974b1e
--- /dev/null
+++ b/src/backend/lib/blockset.c
@@ -0,0 +1,201 @@
+/*-------------------------------------------------------------------------
+ *
+ * blockset.c
+ * Data structure for operations on set of block numbers
+ *
+ * This data structure resembles bitmap set in idea and operations, but
+ * has two main differences:
+ * 1. It handles unsigned BlockNumber as position in set instead of int32_t
+ * This allows to work with relation forks bigger than 2B blocks
+ * 2. It is more space efficient for sparse bitmaps: regions are allocated
+ * in chunks of 256 items at once.
+ *
+ * Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/lib/blockset.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/blockset.h"
+
+/* Lowest level of radix tree is represented by bitmap */
+typedef struct
+{
+ char data[256/8];
+} BlockSetLevel4Data;
+
+typedef BlockSetLevel4Data *BlockSetLevel4;
+
+/* statically typed inner level chunks points to ground level */
+typedef struct
+{
+ /* null references denote empty subtree */
+ BlockSetLevel4 next[256];
+} BlockSetLevel3Data;
+
+typedef BlockSetLevel3Data *BlockSetLevel3;
+
+/* inner level points to another inner level */
+typedef struct
+{
+ BlockSetLevel3 next[256];
+} BlockSetLevel2Data;
+
+typedef BlockSetLevel2Data *BlockSetLevel2;
+
+/* Radix tree root */
+typedef struct BlockSetData
+{
+ BlockSetLevel2 next[256];
+} BlockSetData;
+
+/* multiplex block number into indexes of radix tree */
+#define BLOCKSET_SPLIT_BLKNO \
+ uint32_t i1, i2, i3, i4, byte_no, byte_mask; \
+ i4 = blkno % 256; \
+ blkno /= 256; \
+ i3 = blkno % 256; \
+ blkno /= 256; \
+ i2 = blkno % 256; \
+ blkno /= 256; \
+ i1 = blkno; \
+ byte_no = i4 / 8; \
+ byte_mask = 1 << (i4 % 8) \
+
+/* indicate presence of block number in set */
+BlockSet blockset_set(BlockSet bs, BlockNumber blkno)
+{
+ BLOCKSET_SPLIT_BLKNO;
+ if (bs == NULL)
+ {
+ bs = palloc0(sizeof(BlockSetData));
+ }
+ BlockSetLevel2 bs2 = bs->next[i1];
+ if (bs2 == NULL)
+ {
+ bs2 = palloc0(sizeof(BlockSetLevel2Data));
+ bs->next[i1] = bs2;
+ }
+ BlockSetLevel3 bs3 = bs2->next[i2];
+ if (bs3 == NULL)
+ {
+ bs3 = palloc0(sizeof(BlockSetLevel3Data));
+ bs2->next[i2] = bs3;
+ }
+ BlockSetLevel4 bs4 = bs3->next[i3];
+ if (bs4 == NULL)
+ {
+ bs4 = palloc0(sizeof(BlockSetLevel4Data));
+ bs3->next[i3] = bs4;
+ }
+ bs4->data[byte_no] = byte_mask | bs4->data[byte_no];
+ return bs;
+}
+
+/* Test presence of block in set */
+bool blockset_get(BlockNumber blkno, BlockSet bs)
+{
+ BLOCKSET_SPLIT_BLKNO;
+ if (bs == NULL)
+ return false;
+ BlockSetLevel2 bs2 = bs->next[i1];
+ if (bs2 == NULL)
+ return false;
+ BlockSetLevel3 bs3 = bs2->next[i2];
+ if (bs3 == NULL)
+ return false;
+ BlockSetLevel4 bs4 = bs3->next[i3];
+ if (bs4 == NULL)
+ return false;
+ return (bs4->data[byte_no] & byte_mask);
+}
+
+/*
+ * Find nearest block number in set no less than blkno
+ * Return InvalidBlockNumber if nothing to return
+ * If given InvalidBlockNumber - returns minimal element in set
+ */
+BlockNumber blockset_next(BlockSet bs, BlockNumber blkno)
+{
+ if (blkno == InvalidBlockNumber)
+ blkno = 0; /* equvalent to ++, left for clear code */
+ else
+ blkno++;
+
+ BLOCKSET_SPLIT_BLKNO;
+
+ if (bs == NULL)
+ return InvalidBlockNumber;
+ for (; i1 < 256; i1++)
+ {
+ BlockSetLevel2 bs2 = bs->next[i1];
+ if (!bs2)
+ continue;
+ for (; i2 < 256; i2++)
+ {
+ BlockSetLevel3 bs3 = bs2->next[i2];
+ if (!bs3)
+ continue;
+ for (; i3 < 256; i3++)
+ {
+ BlockSetLevel4 bs4 = bs3->next[i3];
+ if (!bs4)
+ continue;
+ for (; byte_no < 256 / 8; byte_no++)
+ {
+ if (!bs4->data[byte_no])
+ continue;
+ while (byte_mask < 256)
+ {
+ if ((byte_mask & bs4->data[byte_no]) == byte_mask)
+ {
+ i4 = byte_no * 8;
+ while (byte_mask >>= 1) i4++;
+ return i4 + 256 * (i3 + 256 * (i2 + 256 * i1));
+ }
+ byte_mask <<= 1;
+ }
+ byte_mask = 1;
+ }
+ byte_no = 0;
+ }
+ i3 = 0;
+ }
+ i2 = 0;
+ }
+ return InvalidBlockNumber;
+}
+
+/* free anything palloced */
+void blockset_free(BlockSet bs)
+{
+ BlockNumber blkno = 0;
+ BLOCKSET_SPLIT_BLKNO;
+ if (bs == NULL)
+ return;
+ for (; i1 < 256; i1++)
+ {
+ BlockSetLevel2 bs2 = bs->next[i1];
+ if (!bs2)
+ continue;
+ for (; i2 < 256; i2++)
+ {
+ BlockSetLevel3 bs3 = bs2->next[i2];
+ if (!bs3)
+ continue;
+ for (; i3 < 256; i3++)
+ {
+ BlockSetLevel4 bs4 = bs3->next[i3];
+ if (bs4)
+ pfree(bs4);
+ }
+ pfree(bs3);
+ }
+ pfree(bs2);
+ }
+ pfree(bs);
+}
\ No newline at end of file
diff --git a/src/include/lib/blockset.h b/src/include/lib/blockset.h
new file mode 100644
index 00000000000..1966d17a8d4
--- /dev/null
+++ b/src/include/lib/blockset.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ *
+ * blockset.h
+ * Data structure for operations on set of block numbers
+ *
+ * IDENTIFICATION
+ * src/include/lib/blockset.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef BLOCKSET_H
+#define BLOCKSET_H
+
+#include "storage/block.h"
+
+typedef struct BlockSetData *BlockSet;
+
+extern BlockSet blockset_set(BlockSet bs, BlockNumber blkno);
+extern bool blockset_get(BlockNumber blkno, BlockSet bs);
+extern BlockNumber blockset_next(BlockSet bs, BlockNumber blkno);
+extern void blockset_free(BlockSet bs);
+
+#endif /* BLOCKSET_H */
diff --git a/src/test/modules/test_blockset/.gitignore b/src/test/modules/test_blockset/.gitignore
new file mode 100644
index 00000000000..5dcb3ff9723
--- /dev/null
+++ b/src/test/modules/test_blockset/.gitignore
@@ -0,0 +1,4 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
diff --git a/src/test/modules/test_blockset/Makefile b/src/test/modules/test_blockset/Makefile
new file mode 100644
index 00000000000..091cf8c0958
--- /dev/null
+++ b/src/test/modules/test_blockset/Makefile
@@ -0,0 +1,21 @@
+# src/test/modules/test_blockset/Makefile
+
+MODULE_big = test_blockset
+OBJS = test_blockset.o $(WIN32RES)
+PGFILEDESC = "test_blockset - test code for block set library"
+
+EXTENSION = test_blockset
+DATA = test_blockset--1.0.sql
+
+REGRESS = test_blockset
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_blockset
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_blockset/README b/src/test/modules/test_blockset/README
new file mode 100644
index 00000000000..730951ff03c
--- /dev/null
+++ b/src/test/modules/test_blockset/README
@@ -0,0 +1,8 @@
+test_blockset overview
+=========================
+
+test_blockset is a test harness module for testing block set data structure.
+There are two main tests:
+1. Test of comliance with Bitmapset on numbers avaiable to Bitmapset
+2. Test of numbers that can exceed INT32_MAX but are just shifted on one bit
+from numbers kept in Bitmapset
\ No newline at end of file
diff --git a/src/test/modules/test_blockset/expected/test_blockset.out b/src/test/modules/test_blockset/expected/test_blockset.out
new file mode 100644
index 00000000000..02c29d5fc0c
--- /dev/null
+++ b/src/test/modules/test_blockset/expected/test_blockset.out
@@ -0,0 +1,7 @@
+CREATE EXTENSION test_blockset;
+SELECT test_blockset();
+ test_blockset
+---------------
+
+(1 row)
+
diff --git a/src/test/modules/test_blockset/sql/test_blockset.sql b/src/test/modules/test_blockset/sql/test_blockset.sql
new file mode 100644
index 00000000000..85d2886676e
--- /dev/null
+++ b/src/test/modules/test_blockset/sql/test_blockset.sql
@@ -0,0 +1,3 @@
+CREATE EXTENSION test_blockset;
+
+SELECT test_blockset();
\ No newline at end of file
diff --git a/src/test/modules/test_blockset/test_blockset--1.0.sql b/src/test/modules/test_blockset/test_blockset--1.0.sql
new file mode 100644
index 00000000000..04eea8a6146
--- /dev/null
+++ b/src/test/modules/test_blockset/test_blockset--1.0.sql
@@ -0,0 +1,8 @@
+/* src/test/modules/test_blockset/test_blockset--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_blockset" to load this file. \quit
+
+CREATE FUNCTION test_blockset()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_blockset/test_blockset.c b/src/test/modules/test_blockset/test_blockset.c
new file mode 100644
index 00000000000..7a1d1c86c86
--- /dev/null
+++ b/src/test/modules/test_blockset/test_blockset.c
@@ -0,0 +1,182 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_blockset.c
+ * Test block set data structure.
+ *
+ * Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/test/modules/test_blockset/test_blockset.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "lib/blockset.h"
+#include "nodes/bitmapset.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(test_blockset);
+
+static void test_blockset_bms_compliance();
+static void test_blockset_big_block_numbers();
+
+/*
+ * SQL-callable entry point to perform all tests.
+ */
+Datum
+test_blockset(PG_FUNCTION_ARGS)
+{
+ test_blockset_bms_compliance(0);
+ test_blockset_bms_compliance(1);
+ test_blockset_bms_compliance(2);
+ test_blockset_bms_compliance(1337);
+ test_blockset_bms_compliance(100000);
+ test_blockset_big_block_numbers(1337);
+ test_blockset_big_block_numbers(31337);
+ PG_RETURN_VOID();
+}
+
+/*
+ * This test creates random bitmap with test_limit members
+ * and checks that block set behavior is similar to Bitmapset
+ */
+static void test_blockset_bms_compliance(int32_t test_limit)
+{
+ BlockSet bs = NULL;
+ Bitmapset *bms = NULL;
+ BlockNumber blockno;
+ int index;
+ int32_t point_index = 0;
+
+ for (int32_t i = 0; i < test_limit; i++)
+ {
+ blockno = random() & INT32_MAX;
+ /* bms does not support block numbers above INT32_MAX */
+ bs = blockset_set(bs, blockno);
+ bms = bms_add_member(bms, (int)blockno);
+ }
+
+ index = -1;
+ blockno = InvalidBlockNumber;
+
+ while (true)
+ {
+ point_index++;
+ BlockNumber next_bn = blockset_next(bs, blockno);
+ int next_index = bms_next_member(bms, index);
+
+
+ if (next_bn == InvalidBlockNumber && next_index == -2)
+ return; /* We have found everything */
+
+ if (((BlockNumber)next_index) != next_bn)
+ {
+ elog(ERROR,
+ "Bitmapset returned value %X different from block set %X,"
+ " test_limit %d, point index %d",
+ next_index, next_bn, test_limit, point_index);
+ }
+
+ if (!blockset_get(next_bn, bs))
+ {
+ elog(ERROR,
+ "Block set did not found present item %X"
+ " test_limit %d, point index %d",
+ next_bn, test_limit, point_index);
+ }
+
+ index = next_index;
+ blockno = next_bn;
+ }
+
+ for (int32_t i = 0; i < test_limit; i++)
+ {
+ blockno = random() & INT32_MAX;
+ if (blockset_get(blockno, bs) != bms_is_member((int)blockno, bms))
+ {
+ elog(ERROR,
+ "Block set did agree with bitmapset item %X"
+ " test_limit %d, point index %d",
+ blockno, test_limit, point_index);
+ }
+ }
+
+ blockset_free(bs);
+ bms_free(bms);
+}
+
+/*
+ * This test is similar to test_blockset_bms_compliance()
+ * except that it shifts BlockNumbers by one bit to ensure that blockset
+ * operates correctly on values higher that INT32_MAX
+ * This function is copy-pasted from previous with the exception of barrel
+ * shifts for BlockNumbers. I've tried various refactorings, but they all
+ * looked ugly.
+ */
+static void test_blockset_big_block_numbers(int32_t test_limit)
+{
+ BlockSet bs = NULL;
+ Bitmapset *bms = NULL;
+ BlockNumber blockno;
+ int index;
+ int32_t point_index = 0;
+
+ for (int32_t i = 0; i < test_limit; i++)
+ {
+ blockno = random() & INT32_MAX;
+ /* bms does not support block numbers above INT32_MAX */
+ bs = blockset_set(bs, blockno << 1);
+ bms = bms_add_member(bms, (int)blockno);
+ }
+
+ index = -1;
+ blockno = InvalidBlockNumber;
+
+ while (true)
+ {
+ point_index++;
+ BlockNumber next_bn = blockset_next(bs, blockno);
+ int next_index = bms_next_member(bms, index);
+
+
+ if (next_bn == InvalidBlockNumber && next_index == -2)
+ return; /* We have found everything */
+
+ if (((BlockNumber)next_index) != (next_bn >> 1))
+ {
+ elog(ERROR,
+ "Bitmapset returned value %X different from block set %X,"
+ " test_limit %d, point index %d",
+ next_index, next_bn, test_limit, point_index);
+ }
+
+ if (!blockset_get(next_bn, bs))
+ {
+ elog(ERROR,
+ "Block set did not found present item %X"
+ " test_limit %d, point index %d",
+ next_bn, test_limit, point_index);
+ }
+
+ index = next_index;
+ blockno = next_bn;
+ }
+
+ for (int32_t i = 0; i < test_limit; i++)
+ {
+ blockno = random() & INT32_MAX;
+ if (blockset_get(blockno << 1, bs) != bms_is_member((int)blockno, bms))
+ {
+ elog(ERROR,
+ "Block set did agree with bitmapset item %X"
+ " test_limit %d, point index %d",
+ blockno, test_limit, point_index);
+ }
+ }
+
+ blockset_free(bs);
+ bms_free(bms);
+}
diff --git a/src/test/modules/test_blockset/test_blockset.control b/src/test/modules/test_blockset/test_blockset.control
new file mode 100644
index 00000000000..fdb7598c5a7
--- /dev/null
+++ b/src/test/modules/test_blockset/test_blockset.control
@@ -0,0 +1,4 @@
+comment = 'Test code for block set library'
+default_version = '1.0'
+module_pathname = '$libdir/test_blockset'
+relocatable = true
--
2.20.1
>From 1e477f083cd639117944c7910db8aff0c763b4f6 Mon Sep 17 00:00:00 2001
From: Andrey <amboro...@acm.org>
Date: Fri, 8 Mar 2019 21:24:37 +0500
Subject: [PATCH 2/6] Delete pages during GiST VACUUM
This commit teaches GiST to actually delete pages during VACUUM.
To do this we scan GiST two times. At first pass we make notes
of empty pages and internal pages. At second pass we scan through
internal pages looking for references to empty leaf pages.
---
src/backend/access/gist/README | 14 ++
src/backend/access/gist/gist.c | 18 ++
src/backend/access/gist/gistutil.c | 3 +-
src/backend/access/gist/gistvacuum.c | 218 ++++++++++++++++++++++++-
src/backend/access/gist/gistxlog.c | 60 +++++++
src/backend/access/rmgrdesc/gistdesc.c | 3 +
src/include/access/gist.h | 4 +
src/include/access/gist_private.h | 7 +-
src/include/access/gistxlog.h | 10 +-
src/test/regress/expected/gist.out | 4 +-
src/test/regress/sql/gist.sql | 4 +-
11 files changed, 335 insertions(+), 10 deletions(-)
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b81..c84359de310 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,20 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
through buffers at a given level until all buffers at that level have been
emptied, and then moves down to the next level.
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free
+so that they can be used it allocate newly split pages. To find this pages
+function scans index in physical order.
+
+Physical scan reads the entire index from the first page to last. This scan
+maintains information necessary to collect block numbers of internal pages
+that need cleansing and block number of empty leafs.
+
+After the scan, for each internal pages under exclusive lock, each potentially
+free leaf page is examined. gistbulkdelete() never delete last one reference
+from internal page to preserve balanced tree properties.
Authors:
Teodor Sigaev <teo...@sigaev.ru>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 2ce5425ef98..6f87b4b5044 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -704,6 +704,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
GISTInsertStack *item;
OffsetNumber downlinkoffnum;
+ /*
+ * Currently internal pages are not deleted during vacuum,
+ * so we do not need to check if page is deleted
+ */
+
downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
iid = PageGetItemId(stack->page, downlinkoffnum);
idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
@@ -838,6 +843,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
}
}
+ /*
+ * Leaf pages can be left deleted but still referenced
+ * until it's space is reused. Downlink to this page may be already
+ * removed from the internal page, but this scan can posess it.
+ */
+ if(GistPageIsDeleted(stack->page))
+ {
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+
/* now state.stack->(page, buffer and blkno) points to leaf page */
gistinserttuple(&state, stack, giststate, itup,
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index f32e16eed58..4fa44bf2f6c 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -23,6 +23,7 @@
#include "storage/lmgr.h"
#include "utils/float.h"
#include "utils/syscache.h"
+#include "utils/snapmgr.h"
#include "utils/lsyscache.h"
@@ -834,7 +835,7 @@ gistNewBuffer(Relation r)
gistcheckpage(r, buffer);
- if (GistPageIsDeleted(page))
+ if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalXmin))
return buffer; /* OK to use */
LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 3c1d75691e8..fc606ac823c 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,11 +16,15 @@
#include "access/genam.h"
#include "access/gist_private.h"
+#include "access/transam.h"
#include "commands/vacuum.h"
+#include "lib/blockset.h"
#include "miscadmin.h"
+#include "nodes/bitmapset.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+
/* Working state needed by gistbulkdelete */
typedef struct
{
@@ -30,6 +34,10 @@ typedef struct
void *callback_state;
GistNSN startNSN;
BlockNumber totFreePages; /* true total # of free pages */
+ BlockNumber emptyPages;
+
+ BlockSet internalPagesMap;
+ BlockSet emptyLeafPagesMap;
} GistVacState;
static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
@@ -50,6 +58,7 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
gistvacuumscan(info, stats, callback, callback_state);
+
return stats;
}
@@ -103,6 +112,11 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
* while the index is being expanded, leaving an all-zeros page behind.
*
* The caller is responsible for initially allocating/zeroing a stats struct.
+ *
+ * Bulk deletion of all index entries pointing to a set of heap tuples and
+ * check invalid tuples left after upgrade.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
*/
static void
gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
@@ -132,6 +146,9 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
else
vstate.startNSN = gistGetFakeLSN(rel);
vstate.totFreePages = 0;
+ vstate.emptyPages = 0;
+ vstate.internalPagesMap = NULL;
+ vstate.emptyLeafPagesMap = NULL;
/*
* The outer loop iterates over all index pages, in physical order (we
@@ -171,6 +188,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* Quit if we've scanned the whole relation */
if (blkno >= num_pages)
break;
+
/* Iterate over pages, then loop back to recheck length */
for (; blkno < num_pages; blkno++)
gistvacuumpage(&vstate, blkno, blkno);
@@ -194,6 +212,194 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* update statistics */
stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages;
+
+ /* rescan all inner pages to find those that has empty child pages */
+ if (vstate.emptyPages > 0)
+ {
+ BlockNumber x;
+
+ x = InvalidBlockNumber;
+ while (vstate.emptyPages > 0 &&
+ (x = blockset_next(vstate.internalPagesMap, x)) != InvalidBlockNumber)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber off,
+ maxoff;
+ IndexTuple idxtuple;
+ ItemId iid;
+ OffsetNumber todelete[MaxOffsetNumber];
+ Buffer buftodelete[MaxOffsetNumber];
+ int ntodelete = 0;
+
+ blkno = (BlockNumber) x;
+
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+ info->strategy);
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+ if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
+ {
+ UnlockReleaseBuffer(buffer);
+ continue;
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ /* Check that leafs are still empty and decide what to delete */
+ for (off = FirstOffsetNumber; off <= maxoff && ntodelete < maxoff-1; off = OffsetNumberNext(off))
+ {
+ Buffer leafBuffer;
+ Page leafPage;
+ BlockNumber leafBlockNo;
+
+ iid = PageGetItemId(page, off);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ /* if this page was not empty in previous scan - we do not consider it */
+ leafBlockNo = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ if (!blockset_get(leafBlockNo, vstate.emptyLeafPagesMap))
+ continue;
+
+ leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leafBlockNo,
+ RBM_NORMAL, info->strategy);
+
+ buftodelete[ntodelete] = leafBuffer;
+ todelete[ntodelete++] = off;
+
+ LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+ gistcheckpage(rel, leafBuffer);
+ leafPage = (Page) BufferGetPage(leafBuffer);
+ if (!GistPageIsLeaf(leafPage))
+ {
+ UnlockReleaseBuffer(leafBuffer);
+ continue;
+ }
+
+ if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+ && !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+ && ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+ {
+ buftodelete[ntodelete] = leafBuffer;
+ todelete[ntodelete++] = off;
+ }
+ else
+ UnlockReleaseBuffer(leafBuffer);
+ }
+
+ /*
+ * We will have to relock internal page in case of deletes:
+ * we cannot lock child while holding parent lock without risk
+ * of a deadlock
+ */
+ LockBuffer(buffer, GIST_UNLOCK);
+
+ if (ntodelete)
+ {
+ TransactionId txid;
+ int i;
+
+ for (i = 0; i < ntodelete; i++)
+ {
+ Buffer leafBuffer = buftodelete[i];
+ Page leafPage;
+ LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+ gistcheckpage(rel, leafBuffer);
+ leafPage = (Page) BufferGetPage(leafBuffer);
+ if (!GistPageIsLeaf(leafPage) /* not a leaf anymore */
+ || PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber /* Page is not empry */
+ || (GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+ )
+ {
+ UnlockReleaseBuffer(leafBuffer);
+ buftodelete[i] = InvalidBuffer;
+ todelete[i] = InvalidOffsetNumber;
+ }
+ }
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+
+ for (i = 0; i < ntodelete; i++)
+ {
+ Buffer leafBuffer = buftodelete[i];
+ bool inconsistent = false;
+ if (todelete[i] == InvalidOffsetNumber)
+ continue;
+
+ if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)
+ || PageGetMaxOffsetNumber(page) < todelete[i])
+ inconsistent = true;
+
+ if (!inconsistent)
+ {
+ iid = PageGetItemId(page, todelete[i]);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ if (todelete[i] != ItemPointerGetBlockNumber(&(idxtuple->t_tid)))
+ inconsistent = true;
+ }
+
+ if (inconsistent)
+ {
+ UnlockReleaseBuffer(leafBuffer);
+ buftodelete[i] = InvalidBuffer;
+ todelete[i] = InvalidOffsetNumber;
+ }
+ }
+
+ /*
+ * Like in _bt_unlink_halfdead_page we need an upper bound on xid
+ * that could hold downlinks to this page. We use
+ * ReadNewTransactionId() to instead of GetCurrentTransactionId
+ * since we are in a VACUUM.
+ */
+ txid = ReadNewTransactionId();
+
+ START_CRIT_SECTION();
+
+ /* Mark pages as deleted dropping references from internal pages */
+ for (i = 0; i < ntodelete; i++)
+ {
+ Page leafPage;
+ XLogRecPtr recptr;
+
+ if (todelete[i] == InvalidOffsetNumber)
+ continue;
+
+ leafPage = (Page) BufferGetPage(buftodelete[i]);
+
+ /* Remember xid of last transaction that could see this page */
+ GistPageSetDeleteXid(leafPage,txid);
+
+ GistPageSetDeleted(leafPage);
+ MarkBufferDirty(buftodelete[i]);
+ stats->pages_deleted++;
+ vstate.emptyPages--;
+
+ MarkBufferDirty(buffer);
+ /* Offsets are changed as long as we delete tuples from internal page */
+ PageIndexTupleDelete(page, todelete[i] - i);
+
+ if (RelationNeedsWAL(rel))
+ recptr = gistXLogSetDeleted(rel->rd_node, buftodelete[i],
+ txid, buffer, todelete[i] - i);
+ else
+ recptr = gistGetFakeLSN(rel);
+ PageSetLSN(page, recptr);
+ PageSetLSN(leafPage, recptr);
+
+ UnlockReleaseBuffer(buftodelete[i]);
+ }
+ END_CRIT_SECTION();
+
+ LockBuffer(buffer, GIST_UNLOCK);
+ }
+
+ ReleaseBuffer(buffer);
+ }
+ }
+
+ blockset_free(vstate.emptyLeafPagesMap);
+ blockset_free(vstate.internalPagesMap);
}
/*
@@ -246,6 +452,7 @@ restart:
{
OffsetNumber todelete[MaxOffsetNumber];
int ntodelete = 0;
+ int nremain;
GISTPageOpaque opaque = GistPageGetOpaque(page);
OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
@@ -319,10 +526,19 @@ restart:
maxoff = PageGetMaxOffsetNumber(page);
}
- stats->num_index_tuples += maxoff - FirstOffsetNumber + 1;
+ nremain = maxoff - FirstOffsetNumber + 1;
+ if (nremain == 0)
+ {
+ vstate->emptyLeafPagesMap = blockset_set(vstate->emptyLeafPagesMap, blkno);
+ vstate->emptyPages++;
+ }
+ else
+ stats->num_index_tuples += nremain;
}
else
{
+ vstate->internalPagesMap = blockset_set(vstate->internalPagesMap, blkno);
+
/*
* On an internal page, check for "invalid tuples", left behind by an
* incomplete page split on PostgreSQL 9.0 or below. These are not
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 408bd5390af..3213ea98ea3 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -64,6 +64,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
UnlockReleaseBuffer(buffer);
}
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ GistPageSetDeleteXid(page, xldata->deleteXid);
+ GistPageSetDeleted(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ PageIndexTupleDelete(page, xldata->downlinkOffset);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
/*
* redo any page update (except page split)
*/
@@ -116,6 +149,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
data += sizeof(OffsetNumber) * xldata->ntodelete;
PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+
if (GistPageIsLeaf(page))
GistMarkTuplesDeleted(page);
}
@@ -535,6 +569,9 @@ gist_redo(XLogReaderState *record)
case XLOG_GIST_CREATE_INDEX:
gistRedoCreateIndex(record);
break;
+ case XLOG_GIST_PAGE_DELETE:
+ gistRedoPageSetDeleted(record);
+ break;
default:
elog(PANIC, "gist_redo: unknown op code %u", info);
}
@@ -653,6 +690,29 @@ gistXLogSplit(bool page_is_leaf,
return recptr;
}
+/*
+ * Write XLOG record describing a page delete. This also includes removal of
+ * downlink from internal page.
+ */
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid,
+ Buffer internalPageBuffer, OffsetNumber internalPageOffset) {
+ gistxlogPageDelete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.deleteXid = xid;
+ xlrec.downlinkOffset = internalPageOffset;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD);
+ /* new tuples */
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+ return recptr;
+}
+
/*
* Write XLOG record describing a page update. The update can include any
* number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index e468c9e15aa..0861f829922 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -76,6 +76,9 @@ gist_identify(uint8 info)
case XLOG_GIST_CREATE_INDEX:
id = "CREATE_INDEX";
break;
+ case XLOG_GIST_PAGE_DELETE:
+ id = "PAGE_DELETE";
+ break;
}
return id;
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 3234f241560..ce8bfd83ea4 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -151,6 +151,10 @@ typedef struct GISTENTRY
#define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
#define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
+/* For deleted pages we store last xid which could see the page in scan */
+#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
+#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val)
+
/*
* Vector of GISTENTRY structs; user-defined methods union and picksplit
* take it as one of their arguments
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 463d2bfc7b9..943163ccce7 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -414,12 +414,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
+/* gistxlog.c */
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+ TransactionId xid, Buffer internalPageBuffer,
+ OffsetNumber internalPageOffset);
+
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ntup,
Buffer leftchild);
-XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
int ntodelete, RelFileNode hnode);
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 5117aabf1af..127cff5cb78 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,13 +17,15 @@
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
+/* XLog stuff */
+
#define XLOG_GIST_PAGE_UPDATE 0x00
#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a page */
/* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
#define XLOG_GIST_PAGE_SPLIT 0x30
/* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
#define XLOG_GIST_CREATE_INDEX 0x50
- /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */
+#define XLOG_GIST_PAGE_DELETE 0x60
/*
* Backup Blk 0: updated page.
@@ -76,6 +78,12 @@ typedef struct gistxlogPageSplit
*/
} gistxlogPageSplit;
+typedef struct gistxlogPageDelete
+{
+ TransactionId deleteXid; /* last Xid which could see page in scan */
+ OffsetNumber downlinkOffset; /* Offset of the downlink referencing this page */
+} gistxlogPageDelete;
+
extern void gist_redo(XLogReaderState *record);
extern void gist_desc(StringInfo buf, XLogReaderState *record);
extern const char *gist_identify(uint8 info);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf2..5b92f08c747 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
-- To test vacuum, delete some entries from all over the index.
delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
delete from gist_point_tbl where id < 10000;
vacuum analyze gist_point_tbl;
-- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13c..e66396e851b 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
-- To test vacuum, delete some entries from all over the index.
delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
delete from gist_point_tbl where id < 10000;
vacuum analyze gist_point_tbl;
--
2.20.1
>From f3a33df4906e0074821493311cd8e0d25f4f63c6 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 11 Mar 2019 15:15:43 +0200
Subject: [PATCH 3/6] Minor cleanup
I renamed gistXLogSetDeleted to gistXLogPageDelete. I think that's better,
the WAL record also includes information about removing the downlink from
the parent, not just setting the flag on the child.
---
src/backend/access/gist/gist.c | 7 +--
src/backend/access/gist/gistvacuum.c | 9 +--
src/backend/access/gist/gistxlog.c | 88 +++++++++++++++-------------
src/include/access/gist_private.h | 6 +-
src/include/access/gistxlog.h | 10 ++--
5 files changed, 62 insertions(+), 58 deletions(-)
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 6f87b4b5044..e260c4df4e7 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -844,11 +844,10 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
}
/*
- * Leaf pages can be left deleted but still referenced
- * until it's space is reused. Downlink to this page may be already
- * removed from the internal page, but this scan can posess it.
+ * The page might have been deleted after we scanned the parent
+ * and saw the downlink.
*/
- if(GistPageIsDeleted(stack->page))
+ if (GistPageIsDeleted(stack->page))
{
UnlockReleaseBuffer(stack->buffer);
xlocked = false;
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index fc606ac823c..eb90b2077d3 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -20,11 +20,9 @@
#include "commands/vacuum.h"
#include "lib/blockset.h"
#include "miscadmin.h"
-#include "nodes/bitmapset.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
-
/* Working state needed by gistbulkdelete */
typedef struct
{
@@ -58,7 +56,6 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
gistvacuumscan(info, stats, callback, callback_state);
-
return stats;
}
@@ -112,7 +109,7 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
* while the index is being expanded, leaving an all-zeros page behind.
*
* The caller is responsible for initially allocating/zeroing a stats struct.
- *
+ *
* Bulk deletion of all index entries pointing to a set of heap tuples and
* check invalid tuples left after upgrade.
* The set of target tuples is specified via a callback routine that tells
@@ -213,7 +210,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages;
- /* rescan all inner pages to find those that has empty child pages */
+ /* rescan all inner pages to find those that have empty child pages */
if (vstate.emptyPages > 0)
{
BlockNumber x;
@@ -305,7 +302,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
LockBuffer(leafBuffer, GIST_EXCLUSIVE);
gistcheckpage(rel, leafBuffer);
leafPage = (Page) BufferGetPage(leafBuffer);
- if (!GistPageIsLeaf(leafPage) /* not a leaf anymore */
+ if (!GistPageIsLeaf(leafPage) /* not a leaf anymore */
|| PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber /* Page is not empry */
|| (GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
)
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 3213ea98ea3..7110c70451e 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -64,39 +64,6 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
UnlockReleaseBuffer(buffer);
}
-static void
-gistRedoPageSetDeleted(XLogReaderState *record)
-{
- XLogRecPtr lsn = record->EndRecPtr;
- gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
- Buffer buffer;
- Page page;
-
- if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
- {
- page = (Page) BufferGetPage(buffer);
-
- GistPageSetDeleteXid(page, xldata->deleteXid);
- GistPageSetDeleted(page);
-
- PageSetLSN(page, lsn);
- MarkBufferDirty(buffer);
- }
- if (BufferIsValid(buffer))
- UnlockReleaseBuffer(buffer);
-
- if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
- {
- page = (Page) BufferGetPage(buffer);
-
- PageIndexTupleDelete(page, xldata->downlinkOffset);
-
- PageSetLSN(page, lsn);
- MarkBufferDirty(buffer);
- }
- if (BufferIsValid(buffer))
- UnlockReleaseBuffer(buffer);
-}
/*
* redo any page update (except page split)
*/
@@ -542,6 +509,43 @@ gistRedoCreateIndex(XLogReaderState *record)
UnlockReleaseBuffer(buffer);
}
+/* redo page deletion */
+static void
+gistRedoPageDelete(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ /* FIXME: Are we locking the pages in correct order, for hot standby? */
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ GistPageSetDeleteXid(page, xldata->deleteXid);
+ GistPageSetDeleted(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ PageIndexTupleDelete(page, xldata->downlinkOffset);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
+
void
gist_redo(XLogReaderState *record)
{
@@ -570,7 +574,7 @@ gist_redo(XLogReaderState *record)
gistRedoCreateIndex(record);
break;
case XLOG_GIST_PAGE_DELETE:
- gistRedoPageSetDeleted(record);
+ gistRedoPageDelete(record);
break;
default:
elog(PANIC, "gist_redo: unknown op code %u", info);
@@ -691,25 +695,27 @@ gistXLogSplit(bool page_is_leaf,
}
/*
- * Write XLOG record describing a page delete. This also includes removal of
- * downlink from internal page.
+ * Write XLOG record describing a page deletion. This also includes removal of
+ * downlink from the parent page.
*/
XLogRecPtr
-gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid,
- Buffer internalPageBuffer, OffsetNumber internalPageOffset) {
+gistXLogPageDelete(Buffer buffer, TransactionId xid,
+ Buffer parentBuffer, OffsetNumber downlinkOffset)
+{
gistxlogPageDelete xlrec;
XLogRecPtr recptr;
xlrec.deleteXid = xid;
- xlrec.downlinkOffset = internalPageOffset;
+ xlrec.downlinkOffset = downlinkOffset;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
- XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD);
- /* new tuples */
+ XLogRegisterBuffer(1, parentBuffer, REGBUF_STANDARD);
+
recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+
return recptr;
}
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 943163ccce7..c77d0b4dd81 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -415,9 +415,9 @@ extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
/* gistxlog.c */
-extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
- TransactionId xid, Buffer internalPageBuffer,
- OffsetNumber internalPageOffset);
+extern XLogRecPtr gistXLogPageDelete(Buffer buffer,
+ TransactionId xid, Buffer parentBuffer,
+ OffsetNumber downlinkOffset);
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 127cff5cb78..939a1ea7559 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,8 +17,6 @@
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
-/* XLog stuff */
-
#define XLOG_GIST_PAGE_UPDATE 0x00
#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a page */
/* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
@@ -78,10 +76,14 @@ typedef struct gistxlogPageSplit
*/
} gistxlogPageSplit;
+/*
+ * Backup Blk 0: page that was deleted.
+ * Backup Blk 1: parent page, containing the downlink to the deleted page.
+ */
typedef struct gistxlogPageDelete
{
- TransactionId deleteXid; /* last Xid which could see page in scan */
- OffsetNumber downlinkOffset; /* Offset of the downlink referencing this page */
+ TransactionId deleteXid; /* last Xid which could see page in scan */
+ OffsetNumber downlinkOffset; /* Offset of downlink referencing this page */
} gistxlogPageDelete;
extern void gist_redo(XLogReaderState *record);
--
2.20.1
>From 3b6c4f6901f4b861e37e2aaba755dc66a5012607 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 11 Mar 2019 15:34:01 +0200
Subject: [PATCH 4/6] Move the page deletion logic to separate function.
If a VACUUM does multiple index passes, I think we only want to do the
empty page deletion after the final pass. That saves effort, since we
only need to scan the internal pages once. But even if we wanted to do
it on every pass, I think having a separate function makes it more
readable.
---
src/backend/access/gist/gistvacuum.c | 464 ++++++++++++++-------------
1 file changed, 240 insertions(+), 224 deletions(-)
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index eb90b2077d3..b95e755406e 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -23,25 +23,31 @@
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
-/* Working state needed by gistbulkdelete */
typedef struct
{
+ IndexBulkDeleteResult stats;
+
IndexVacuumInfo *info;
- IndexBulkDeleteResult *stats;
+ BlockNumber numEmptyPages;
+ BlockSet internalPagesMap;
+ BlockSet emptyLeafPagesMap;
+} GistBulkDeleteResult;
+
+/* Working state needed by gistbulkdelete */
+typedef struct
+{
+ GistBulkDeleteResult *stats;
IndexBulkDeleteCallback callback;
void *callback_state;
GistNSN startNSN;
BlockNumber totFreePages; /* true total # of free pages */
- BlockNumber emptyPages;
-
- BlockSet internalPagesMap;
- BlockSet emptyLeafPagesMap;
} GistVacState;
-static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+static void gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state);
static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
+static void gistvacuum_recycle_pages(GistBulkDeleteResult *stats);
/*
* VACUUM bulkdelete stage: remove index entries.
@@ -50,13 +56,15 @@ IndexBulkDeleteResult *
gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state)
{
+ GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats;
+
/* allocate stats if first time through, else re-use existing struct */
- if (stats == NULL)
- stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ if (gist_stats == NULL)
+ gist_stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
- gistvacuumscan(info, stats, callback, callback_state);
+ gistvacuumscan(info, gist_stats, callback, callback_state);
- return stats;
+ return (IndexBulkDeleteResult *) gist_stats;
}
/*
@@ -65,6 +73,8 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteResult *
gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
+ GistBulkDeleteResult *gist_stats = (GistBulkDeleteResult *) stats;
+
/* No-op in ANALYZE ONLY mode */
if (info->analyze_only)
return stats;
@@ -74,12 +84,15 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
* stats from the latest gistbulkdelete call. If it wasn't called, we
* still need to do a pass over the index, to obtain index statistics.
*/
- if (stats == NULL)
+ if (gist_stats == NULL)
{
- stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
- gistvacuumscan(info, stats, NULL, NULL);
+ gist_stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
+ gistvacuumscan(info, gist_stats, NULL, NULL);
}
+ /* Recycle empty pages */
+ gistvacuum_recycle_pages(gist_stats);
+
/*
* It's quite possible for us to be fooled by concurrent page splits into
* double-counting some index tuples, so disbelieve any total that exceeds
@@ -88,11 +101,11 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
*/
if (!info->estimated_count)
{
- if (stats->num_index_tuples > info->num_heap_tuples)
- stats->num_index_tuples = info->num_heap_tuples;
+ if (gist_stats->stats.num_index_tuples > info->num_heap_tuples)
+ gist_stats->stats.num_index_tuples = info->num_heap_tuples;
}
- return stats;
+ return (IndexBulkDeleteResult *) gist_stats;
}
/*
@@ -116,7 +129,7 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
* whether any given heap tuple (identified by ItemPointer) is being deleted.
*/
static void
-gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+gistvacuumscan(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state)
{
Relation rel = info->index;
@@ -129,12 +142,12 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
* Reset counts that will be incremented during the scan; needed in case
* of multiple scans during a single VACUUM command.
*/
- stats->estimated_count = false;
- stats->num_index_tuples = 0;
- stats->pages_deleted = 0;
+ stats->stats.estimated_count = false;
+ stats->stats.num_index_tuples = 0;
+ stats->stats.pages_deleted = 0;
/* Set up info to pass down to gistvacuumpage */
- vstate.info = info;
+ stats->info = info;
vstate.stats = stats;
vstate.callback = callback;
vstate.callback_state = callback_state;
@@ -143,9 +156,6 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
else
vstate.startNSN = gistGetFakeLSN(rel);
vstate.totFreePages = 0;
- vstate.emptyPages = 0;
- vstate.internalPagesMap = NULL;
- vstate.emptyLeafPagesMap = NULL;
/*
* The outer loop iterates over all index pages, in physical order (we
@@ -207,196 +217,8 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexFreeSpaceMapVacuum(rel);
/* update statistics */
- stats->num_pages = num_pages;
- stats->pages_free = vstate.totFreePages;
-
- /* rescan all inner pages to find those that have empty child pages */
- if (vstate.emptyPages > 0)
- {
- BlockNumber x;
-
- x = InvalidBlockNumber;
- while (vstate.emptyPages > 0 &&
- (x = blockset_next(vstate.internalPagesMap, x)) != InvalidBlockNumber)
- {
- Buffer buffer;
- Page page;
- OffsetNumber off,
- maxoff;
- IndexTuple idxtuple;
- ItemId iid;
- OffsetNumber todelete[MaxOffsetNumber];
- Buffer buftodelete[MaxOffsetNumber];
- int ntodelete = 0;
-
- blkno = (BlockNumber) x;
-
- buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
- info->strategy);
-
- LockBuffer(buffer, GIST_EXCLUSIVE);
- page = (Page) BufferGetPage(buffer);
- if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
- {
- UnlockReleaseBuffer(buffer);
- continue;
- }
-
- maxoff = PageGetMaxOffsetNumber(page);
- /* Check that leafs are still empty and decide what to delete */
- for (off = FirstOffsetNumber; off <= maxoff && ntodelete < maxoff-1; off = OffsetNumberNext(off))
- {
- Buffer leafBuffer;
- Page leafPage;
- BlockNumber leafBlockNo;
-
- iid = PageGetItemId(page, off);
- idxtuple = (IndexTuple) PageGetItem(page, iid);
- /* if this page was not empty in previous scan - we do not consider it */
- leafBlockNo = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
- if (!blockset_get(leafBlockNo, vstate.emptyLeafPagesMap))
- continue;
-
- leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leafBlockNo,
- RBM_NORMAL, info->strategy);
-
- buftodelete[ntodelete] = leafBuffer;
- todelete[ntodelete++] = off;
-
- LockBuffer(leafBuffer, GIST_EXCLUSIVE);
- gistcheckpage(rel, leafBuffer);
- leafPage = (Page) BufferGetPage(leafBuffer);
- if (!GistPageIsLeaf(leafPage))
- {
- UnlockReleaseBuffer(leafBuffer);
- continue;
- }
-
- if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
- && !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
- && ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
- {
- buftodelete[ntodelete] = leafBuffer;
- todelete[ntodelete++] = off;
- }
- else
- UnlockReleaseBuffer(leafBuffer);
- }
-
- /*
- * We will have to relock internal page in case of deletes:
- * we cannot lock child while holding parent lock without risk
- * of a deadlock
- */
- LockBuffer(buffer, GIST_UNLOCK);
-
- if (ntodelete)
- {
- TransactionId txid;
- int i;
-
- for (i = 0; i < ntodelete; i++)
- {
- Buffer leafBuffer = buftodelete[i];
- Page leafPage;
- LockBuffer(leafBuffer, GIST_EXCLUSIVE);
- gistcheckpage(rel, leafBuffer);
- leafPage = (Page) BufferGetPage(leafBuffer);
- if (!GistPageIsLeaf(leafPage) /* not a leaf anymore */
- || PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber /* Page is not empry */
- || (GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
- )
- {
- UnlockReleaseBuffer(leafBuffer);
- buftodelete[i] = InvalidBuffer;
- todelete[i] = InvalidOffsetNumber;
- }
- }
-
- LockBuffer(buffer, GIST_EXCLUSIVE);
- page = (Page) BufferGetPage(buffer);
-
- for (i = 0; i < ntodelete; i++)
- {
- Buffer leafBuffer = buftodelete[i];
- bool inconsistent = false;
- if (todelete[i] == InvalidOffsetNumber)
- continue;
-
- if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)
- || PageGetMaxOffsetNumber(page) < todelete[i])
- inconsistent = true;
-
- if (!inconsistent)
- {
- iid = PageGetItemId(page, todelete[i]);
- idxtuple = (IndexTuple) PageGetItem(page, iid);
- if (todelete[i] != ItemPointerGetBlockNumber(&(idxtuple->t_tid)))
- inconsistent = true;
- }
-
- if (inconsistent)
- {
- UnlockReleaseBuffer(leafBuffer);
- buftodelete[i] = InvalidBuffer;
- todelete[i] = InvalidOffsetNumber;
- }
- }
-
- /*
- * Like in _bt_unlink_halfdead_page we need an upper bound on xid
- * that could hold downlinks to this page. We use
- * ReadNewTransactionId() to instead of GetCurrentTransactionId
- * since we are in a VACUUM.
- */
- txid = ReadNewTransactionId();
-
- START_CRIT_SECTION();
-
- /* Mark pages as deleted dropping references from internal pages */
- for (i = 0; i < ntodelete; i++)
- {
- Page leafPage;
- XLogRecPtr recptr;
-
- if (todelete[i] == InvalidOffsetNumber)
- continue;
-
- leafPage = (Page) BufferGetPage(buftodelete[i]);
-
- /* Remember xid of last transaction that could see this page */
- GistPageSetDeleteXid(leafPage,txid);
-
- GistPageSetDeleted(leafPage);
- MarkBufferDirty(buftodelete[i]);
- stats->pages_deleted++;
- vstate.emptyPages--;
-
- MarkBufferDirty(buffer);
- /* Offsets are changed as long as we delete tuples from internal page */
- PageIndexTupleDelete(page, todelete[i] - i);
-
- if (RelationNeedsWAL(rel))
- recptr = gistXLogSetDeleted(rel->rd_node, buftodelete[i],
- txid, buffer, todelete[i] - i);
- else
- recptr = gistGetFakeLSN(rel);
- PageSetLSN(page, recptr);
- PageSetLSN(leafPage, recptr);
-
- UnlockReleaseBuffer(buftodelete[i]);
- }
- END_CRIT_SECTION();
-
- LockBuffer(buffer, GIST_UNLOCK);
- }
-
- ReleaseBuffer(buffer);
- }
- }
-
- blockset_free(vstate.emptyLeafPagesMap);
- blockset_free(vstate.internalPagesMap);
+ stats->stats.num_pages = num_pages;
+ stats->stats.pages_free = vstate.totFreePages;
}
/*
@@ -413,8 +235,8 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
static void
gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
- IndexVacuumInfo *info = vstate->info;
- IndexBulkDeleteResult *stats = vstate->stats;
+ GistBulkDeleteResult *stats = vstate->stats;
+ IndexVacuumInfo *info = stats->info;
IndexBulkDeleteCallback callback = vstate->callback;
void *callback_state = vstate->callback_state;
Relation rel = info->index;
@@ -443,7 +265,7 @@ restart:
/* Okay to recycle this page */
RecordFreeIndexPage(rel, blkno);
vstate->totFreePages++;
- stats->pages_deleted++;
+ stats->stats.pages_deleted++;
}
else if (GistPageIsLeaf(page))
{
@@ -518,7 +340,7 @@ restart:
END_CRIT_SECTION();
- stats->tuples_removed += ntodelete;
+ stats->stats.tuples_removed += ntodelete;
/* must recompute maxoff */
maxoff = PageGetMaxOffsetNumber(page);
}
@@ -526,16 +348,14 @@ restart:
nremain = maxoff - FirstOffsetNumber + 1;
if (nremain == 0)
{
- vstate->emptyLeafPagesMap = blockset_set(vstate->emptyLeafPagesMap, blkno);
- vstate->emptyPages++;
+ stats->emptyLeafPagesMap = blockset_set(stats->emptyLeafPagesMap, blkno);
+ stats->numEmptyPages++;
}
else
- stats->num_index_tuples += nremain;
+ stats->stats.num_index_tuples += nremain;
}
else
{
- vstate->internalPagesMap = blockset_set(vstate->internalPagesMap, blkno);
-
/*
* On an internal page, check for "invalid tuples", left behind by an
* incomplete page split on PostgreSQL 9.0 or below. These are not
@@ -560,6 +380,8 @@ restart:
errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
errhint("Please REINDEX it.")));
}
+
+ stats->internalPagesMap = blockset_set(stats->internalPagesMap, blkno);
}
UnlockReleaseBuffer(buffer);
@@ -577,3 +399,197 @@ restart:
goto restart;
}
}
+
+static void
+gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
+{
+ IndexVacuumInfo *info = stats->info;
+ Relation rel = info->index;
+ BlockNumber x;
+
+ /* quick exit if no empty pages */
+ if (stats->numEmptyPages > 0)
+ gistvacuum_recycle_pages(stats);
+
+ /* rescan all inner pages to find those that have empty child pages */
+ x = InvalidBlockNumber;
+ while (stats->numEmptyPages > 0 &&
+ (x = blockset_next(stats->internalPagesMap, x)) != InvalidBlockNumber)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber off,
+ maxoff;
+ IndexTuple idxtuple;
+ ItemId iid;
+ OffsetNumber todelete[MaxOffsetNumber];
+ Buffer buftodelete[MaxOffsetNumber];
+ int ntodelete = 0;
+ BlockNumber blkno;
+
+ blkno = (BlockNumber) x;
+
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+ info->strategy);
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+ if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
+ {
+ UnlockReleaseBuffer(buffer);
+ continue;
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ /* Check that leafs are still empty and decide what to delete */
+ for (off = FirstOffsetNumber; off <= maxoff && ntodelete < maxoff-1; off = OffsetNumberNext(off))
+ {
+ Buffer leafBuffer;
+ Page leafPage;
+ BlockNumber leafBlockNo;
+
+ iid = PageGetItemId(page, off);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ /* if this page was not empty in previous scan - we do not consider it */
+ leafBlockNo = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ if (!blockset_get(leafBlockNo, stats->emptyLeafPagesMap))
+ continue;
+
+ leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leafBlockNo,
+ RBM_NORMAL, info->strategy);
+
+ buftodelete[ntodelete] = leafBuffer;
+ todelete[ntodelete++] = off;
+
+ LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+ gistcheckpage(rel, leafBuffer);
+ leafPage = (Page) BufferGetPage(leafBuffer);
+ if (!GistPageIsLeaf(leafPage))
+ {
+ UnlockReleaseBuffer(leafBuffer);
+ continue;
+ }
+
+ if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+ && !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+ && ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+ {
+ buftodelete[ntodelete] = leafBuffer;
+ todelete[ntodelete++] = off;
+ }
+ else
+ UnlockReleaseBuffer(leafBuffer);
+ }
+
+ /*
+ * We will have to relock internal page in case of deletes:
+ * we cannot lock child while holding parent lock without risk
+ * of a deadlock
+ */
+ LockBuffer(buffer, GIST_UNLOCK);
+
+ if (ntodelete)
+ {
+ TransactionId txid;
+ int i;
+
+ for (i = 0; i < ntodelete; i++)
+ {
+ Buffer leafBuffer = buftodelete[i];
+ Page leafPage;
+ LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+ gistcheckpage(rel, leafBuffer);
+ leafPage = (Page) BufferGetPage(leafBuffer);
+ if (!GistPageIsLeaf(leafPage) /* not a leaf anymore */
+ || PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber /* Page is not empry */
+ || (GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+ )
+ {
+ UnlockReleaseBuffer(leafBuffer);
+ buftodelete[i] = InvalidBuffer;
+ todelete[i] = InvalidOffsetNumber;
+ }
+ }
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+
+ for (i = 0; i < ntodelete; i++)
+ {
+ Buffer leafBuffer = buftodelete[i];
+ bool inconsistent = false;
+
+ if (todelete[i] == InvalidOffsetNumber)
+ continue;
+
+ if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page)
+ || PageGetMaxOffsetNumber(page) < todelete[i])
+ inconsistent = true;
+
+ if (!inconsistent)
+ {
+ iid = PageGetItemId(page, todelete[i]);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ if (todelete[i] != ItemPointerGetBlockNumber(&(idxtuple->t_tid)))
+ inconsistent = true;
+ }
+
+ if (inconsistent)
+ {
+ UnlockReleaseBuffer(leafBuffer);
+ buftodelete[i] = InvalidBuffer;
+ todelete[i] = InvalidOffsetNumber;
+ }
+ }
+
+ /*
+ * Like in _bt_unlink_halfdead_page we need an upper bound on xid
+ * that could hold downlinks to this page. We use
+ * ReadNewTransactionId() to instead of GetCurrentTransactionId
+ * since we are in a VACUUM.
+ */
+ txid = ReadNewTransactionId();
+
+ START_CRIT_SECTION();
+
+ /* Mark pages as deleted dropping references from internal pages */
+ for (i = 0; i < ntodelete; i++)
+ {
+ Page leafPage;
+ XLogRecPtr recptr;
+
+ if (todelete[i] == InvalidOffsetNumber)
+ continue;
+
+ leafPage = (Page) BufferGetPage(buftodelete[i]);
+
+ /* Remember xid of last transaction that could see this page */
+ GistPageSetDeleteXid(leafPage,txid);
+
+ GistPageSetDeleted(leafPage);
+ MarkBufferDirty(buftodelete[i]);
+ stats->stats.pages_deleted++;
+ stats->numEmptyPages--;
+
+ MarkBufferDirty(buffer);
+ /* Offsets are changed as long as we delete tuples from internal page */
+ PageIndexTupleDelete(page, todelete[i] - i);
+
+ if (RelationNeedsWAL(rel))
+ recptr = gistXLogPageDelete(buftodelete[i],
+ txid, buffer, todelete[i] - i);
+ else
+ recptr = gistGetFakeLSN(rel);
+ PageSetLSN(page, recptr);
+ PageSetLSN(leafPage, recptr);
+
+ UnlockReleaseBuffer(buftodelete[i]);
+ }
+ END_CRIT_SECTION();
+
+ LockBuffer(buffer, GIST_UNLOCK);
+ }
+
+ ReleaseBuffer(buffer);
+ }
+}
--
2.20.1
>From 54393f2a3c81fe05f56140c9c367e1ff960fe0ba Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 11 Mar 2019 15:41:07 +0200
Subject: [PATCH 5/6] Remove 'numEmptyPages'. it's not really needed.
---
src/backend/access/gist/gistvacuum.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index b95e755406e..bf0b9d54f69 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -28,7 +28,6 @@ typedef struct
IndexBulkDeleteResult stats;
IndexVacuumInfo *info;
- BlockNumber numEmptyPages;
BlockSet internalPagesMap;
BlockSet emptyLeafPagesMap;
} GistBulkDeleteResult;
@@ -349,7 +348,6 @@ restart:
if (nremain == 0)
{
stats->emptyLeafPagesMap = blockset_set(stats->emptyLeafPagesMap, blkno);
- stats->numEmptyPages++;
}
else
stats->stats.num_index_tuples += nremain;
@@ -408,13 +406,15 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
BlockNumber x;
/* quick exit if no empty pages */
- if (stats->numEmptyPages > 0)
- gistvacuum_recycle_pages(stats);
+ /* HEIKKI: I'm assuming the blockset is always NULL if it's empty. That's true
+ * for the current usage. But add comments, and maybe a blockset_isempty() macro
+ * for clarity */
+ if (stats->emptyLeafPagesMap == NULL)
+ return;
/* rescan all inner pages to find those that have empty child pages */
x = InvalidBlockNumber;
- while (stats->numEmptyPages > 0 &&
- (x = blockset_next(stats->internalPagesMap, x)) != InvalidBlockNumber)
+ while ((x = blockset_next(stats->internalPagesMap, x)) != InvalidBlockNumber)
{
Buffer buffer;
Page page;
@@ -442,7 +442,9 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
maxoff = PageGetMaxOffsetNumber(page);
/* Check that leafs are still empty and decide what to delete */
- for (off = FirstOffsetNumber; off <= maxoff && ntodelete < maxoff-1; off = OffsetNumberNext(off))
+ for (off = FirstOffsetNumber;
+ off <= maxoff && ntodelete < maxoff-1;
+ off = OffsetNumberNext(off))
{
Buffer leafBuffer;
Page leafPage;
@@ -569,7 +571,6 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
GistPageSetDeleted(leafPage);
MarkBufferDirty(buftodelete[i]);
stats->stats.pages_deleted++;
- stats->numEmptyPages--;
MarkBufferDirty(buffer);
/* Offsets are changed as long as we delete tuples from internal page */
--
2.20.1
>From b365081e1bc0818445e44b1ad4ba32d311609f06 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Mon, 11 Mar 2019 16:58:10 +0200
Subject: [PATCH 6/6] Misc cleanup.
---
src/backend/access/gist/gistvacuum.c | 32 ++++++++++++++++------------
1 file changed, 18 insertions(+), 14 deletions(-)
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index bf0b9d54f69..54a0adaf71f 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -419,7 +419,7 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
Buffer buffer;
Page page;
OffsetNumber off,
- maxoff;
+ maxoff;
IndexTuple idxtuple;
ItemId iid;
OffsetNumber todelete[MaxOffsetNumber];
@@ -436,6 +436,8 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
page = (Page) BufferGetPage(buffer);
if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
{
+ /* HEIKKI: This page was an internal page earlier, but now it's something else.
+ * Shouldn't happen... */
UnlockReleaseBuffer(buffer);
continue;
}
@@ -446,12 +448,13 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
off <= maxoff && ntodelete < maxoff-1;
off = OffsetNumberNext(off))
{
+ BlockNumber leafBlockNo;
Buffer leafBuffer;
Page leafPage;
- BlockNumber leafBlockNo;
iid = PageGetItemId(page, off);
idxtuple = (IndexTuple) PageGetItem(page, iid);
+
/* if this page was not empty in previous scan - we do not consider it */
leafBlockNo = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
if (!blockset_get(leafBlockNo, stats->emptyLeafPagesMap))
@@ -468,6 +471,7 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
leafPage = (Page) BufferGetPage(leafBuffer);
if (!GistPageIsLeaf(leafPage))
{
+ /* it's not a leaf anymore? shouldn't happen.. */
UnlockReleaseBuffer(leafBuffer);
continue;
}
@@ -483,27 +487,29 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
UnlockReleaseBuffer(leafBuffer);
}
- /*
- * We will have to relock internal page in case of deletes:
- * we cannot lock child while holding parent lock without risk
- * of a deadlock
- */
- LockBuffer(buffer, GIST_UNLOCK);
-
- if (ntodelete)
+ if (ntodelete > 0)
{
TransactionId txid;
int i;
+ /*
+ * We will have to relock internal page in case of deletes:
+ * we cannot lock child while holding parent lock without risk
+ * of a deadlock
+ */
+ LockBuffer(buffer, GIST_UNLOCK);
+
for (i = 0; i < ntodelete; i++)
{
Buffer leafBuffer = buftodelete[i];
Page leafPage;
+
+ /* FIXME: Aren't we still holding the lock from the loop above? */
LockBuffer(leafBuffer, GIST_EXCLUSIVE);
gistcheckpage(rel, leafBuffer);
leafPage = (Page) BufferGetPage(leafBuffer);
if (!GistPageIsLeaf(leafPage) /* not a leaf anymore */
- || PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber /* Page is not empry */
+ || PageGetMaxOffsetNumber(leafPage) != InvalidOffsetNumber /* Page is not empty */
|| (GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
)
{
@@ -587,10 +593,8 @@ gistvacuum_recycle_pages(GistBulkDeleteResult *stats)
UnlockReleaseBuffer(buftodelete[i]);
}
END_CRIT_SECTION();
-
- LockBuffer(buffer, GIST_UNLOCK);
}
- ReleaseBuffer(buffer);
+ UnlockReleaseBuffer(buffer);
}
}
--
2.20.1