[HACKERS] [PoC] pgstattuple2: block sampling to reduce physical read

Satoshi Nagayasu Mon, 22 Jul 2013 23:17:34 -0700

Hi,

I've been working on new pgstattuple function to allow
block sampling [1] in order to reduce block reads while
scanning a table. A PoC patch is attached.


[1] Re: [RFC] pgstattuple/pgstatindex enhancement

http://www.postgresql.org/message-id/ca+tgmoaxjhgz2c4ayfbr9muuvnhgwu4co-cthqpzrwwdtam...@mail.gmail.com

This new function, pgstattuple2(), samples only 3,000 blocks
(which accounts 24MB) from the table randomly, and estimates
several parameters of the entire table.

The function calculates the averages of the samples, estimates
the parameters (averages and SDs), and shows "standard errors
(SE)" to allow estimating status of the table with statistical
approach.

And, of course, it reduces number of physical block reads
while scanning a bigger table.

The following example shows that new pgstattuple2 function
runs x100 faster than the original pgstattuple function with
well-estimated results.

----------------------------------------------
postgres=# select * from pgstattuple('pgbench_accounts');
-[ RECORD 1 ]------+-----------
table_len          | 1402642432
tuple_count        | 10000000
tuple_len          | 1210000000
tuple_percent      | 86.27
dead_tuple_count   | 182895
dead_tuple_len     | 22130295
dead_tuple_percent | 1.58
free_space         | 21012328
free_percent       | 1.5

Time: 1615.651 ms
postgres=# select * from pgstattuple2('pgbench_accounts');
NOTICE:  pgstattuple2: SE tuple_count 2376.47, tuple_len 287552.58,
dead_tuple_count 497.63, dead_tuple_len 60213.08, free_space 289752.38
-[ RECORD 1 ]------+-----------
table_len          | 1402642432
tuple_count        | 9978074
tuple_len          | 1207347074
tuple_percent      | 86.08
dead_tuple_count   | 187315
dead_tuple_len     | 22665208
dead_tuple_percent | 1.62
free_space         | 23400431
free_percent       | 1.67

Time: 15.026 ms
postgres=#
----------------------------------------------

In addition to that, see attached chart to know how pgstattuple2
estimates well during repeating (long-running) pgbench.

I understand that pgbench would generate "random" transactions,
and those update operations might not have any skew over the table,
so estimating table status seems to be easy in this test.

However, I'm still curious to know whether it would work in
"real-world" worklaod.

Is it worth having this? Any comment or suggestion?

Regards,
-- 
Satoshi Nagayasu <sn...@uptime.jp>
Uptime Technologies, LLC. http://www.uptime.jp

diff --git a/contrib/pgstattuple/pgstattuple--1.1--1.2.sql 
b/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
index 2783a63..8ebec6f 100644
--- a/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
+++ b/contrib/pgstattuple/pgstattuple--1.1--1.2.sql
@@ -37,3 +37,17 @@ CREATE FUNCTION pg_relpages(IN relname regclass)
 RETURNS BIGINT
 AS 'MODULE_PATHNAME', 'pg_relpagesbyid'
 LANGUAGE C STRICT;
+
+CREATE FUNCTION pgstattuple2(IN relname regclass,
+    OUT table_len BIGINT,              -- physical table length in bytes
+    OUT tuple_count BIGINT,            -- number of live tuples
+    OUT tuple_len BIGINT,              -- total tuples length in bytes
+    OUT tuple_percent FLOAT8,          -- live tuples in %
+    OUT dead_tuple_count BIGINT,       -- number of dead tuples
+    OUT dead_tuple_len BIGINT,         -- total dead tuples length in bytes
+    OUT dead_tuple_percent FLOAT8,     -- dead tuples in %
+    OUT free_space BIGINT,             -- free space in bytes
+    OUT free_percent FLOAT8)           -- free space in %
+AS 'MODULE_PATHNAME', 'pgstattuple2'
+LANGUAGE C STRICT;
+
diff --git a/contrib/pgstattuple/pgstattuple--1.2.sql 
b/contrib/pgstattuple/pgstattuple--1.2.sql
index e5fa2f5..963eb00 100644
--- a/contrib/pgstattuple/pgstattuple--1.2.sql
+++ b/contrib/pgstattuple/pgstattuple--1.2.sql
@@ -77,3 +77,17 @@ CREATE FUNCTION pg_relpages(IN relname regclass)
 RETURNS BIGINT
 AS 'MODULE_PATHNAME', 'pg_relpagesbyid'
 LANGUAGE C STRICT;
+
+CREATE FUNCTION pgstattuple2(IN relname regclass,
+    OUT table_len BIGINT,              -- physical table length in bytes
+    OUT tuple_count BIGINT,            -- number of live tuples
+    OUT tuple_len BIGINT,              -- total tuples length in bytes
+    OUT tuple_percent FLOAT8,          -- live tuples in %
+    OUT dead_tuple_count BIGINT,       -- number of dead tuples
+    OUT dead_tuple_len BIGINT,         -- total dead tuples length in bytes
+    OUT dead_tuple_percent FLOAT8,     -- dead tuples in %
+    OUT free_space BIGINT,             -- free space in bytes
+    OUT free_percent FLOAT8)           -- free space in %
+AS 'MODULE_PATHNAME', 'pgstattuple2'
+LANGUAGE C STRICT;
+
diff --git a/contrib/pgstattuple/pgstattuple.c 
b/contrib/pgstattuple/pgstattuple.c
index 7f41ec3..c2adb75 100644
--- a/contrib/pgstattuple/pgstattuple.c
+++ b/contrib/pgstattuple/pgstattuple.c
@@ -41,9 +41,22 @@ PG_MODULE_MAGIC;
 
 PG_FUNCTION_INFO_V1(pgstattuple);
 PG_FUNCTION_INFO_V1(pgstattuplebyid);
+PG_FUNCTION_INFO_V1(pgstattuple2);
 
 extern Datum pgstattuple(PG_FUNCTION_ARGS);
 extern Datum pgstattuplebyid(PG_FUNCTION_ARGS);
+extern Datum pgstattuple2(PG_FUNCTION_ARGS);
+
+#define SAMPLE_SIZE 3000
+
+typedef struct pgstattuple_block_stats
+{
+       uint64          tuple_count;
+       uint64          tuple_len;
+       uint64          dead_tuple_count;
+       uint64          dead_tuple_len;
+       uint64          free_space;             /* free/reusable space in bytes 
*/
+} pgstattuple_block_stats;
 
 /*
  * struct pgstattuple_type
@@ -66,8 +79,9 @@ typedef void (*pgstat_page) (pgstattuple_type *, Relation, 
BlockNumber,
 
 static Datum build_pgstattuple_type(pgstattuple_type *stat,
                                           FunctionCallInfo fcinfo);
-static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo);
+static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo, bool 
enable_sample);
 static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo);
+static Datum pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo);
 static void pgstat_btree_page(pgstattuple_type *stat,
                                  Relation rel, BlockNumber blkno,
                                  BufferAccessStrategy bstrategy);
@@ -81,6 +95,11 @@ static Datum pgstat_index(Relation rel, BlockNumber start,
                         pgstat_page pagefn, FunctionCallInfo fcinfo);
 static void pgstat_index_page(pgstattuple_type *stat, Page page,
                                  OffsetNumber minoff, OffsetNumber maxoff);
+static void compute_parameters(pgstattuple_block_stats *block_stats,
+                                  BlockNumber sample_size, BlockNumber nblocks,
+                                  uint64 *tuple_count, uint64 *tuple_len,
+                                  uint64 *dead_tuple_count, uint64 
*dead_tuple_len,
+                                  uint64 *free_space);
 
 /*
  * build_pgstattuple_type -- build a pgstattuple_type tuple
@@ -175,7 +194,7 @@ pgstattuple(PG_FUNCTION_ARGS)
        relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
        rel = relation_openrv(relrv, AccessShareLock);
 
-       PG_RETURN_DATUM(pgstat_relation(rel, fcinfo));
+       PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, false));
 }
 
 Datum
@@ -192,14 +211,14 @@ pgstattuplebyid(PG_FUNCTION_ARGS)
        /* open relation */
        rel = relation_open(relid, AccessShareLock);
 
-       PG_RETURN_DATUM(pgstat_relation(rel, fcinfo));
+       PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, false));
 }
 
 /*
  * pgstat_relation
  */
 static Datum
-pgstat_relation(Relation rel, FunctionCallInfo fcinfo)
+pgstat_relation(Relation rel, FunctionCallInfo fcinfo, bool enable_sample)
 {
        const char *err;
 
@@ -219,6 +238,9 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo)
                case RELKIND_MATVIEW:
                case RELKIND_TOASTVALUE:
                case RELKIND_SEQUENCE:
+                       if ( enable_sample )
+                               return pgstat_heap_sample(rel, fcinfo);
+
                        return pgstat_heap(rel, fcinfo);
                case RELKIND_INDEX:
                        switch (rel->rd_rel->relam)
@@ -535,3 +557,225 @@ pgstat_index_page(pgstattuple_type *stat, Page page,
                }
        }
 }
+
+static void
+compute_parameters(pgstattuple_block_stats *block_stats, BlockNumber 
sample_size, BlockNumber nblocks,
+                                  uint64 *tuple_count, uint64 *tuple_len,
+                                  uint64 *dead_tuple_count, uint64 
*dead_tuple_len,
+                                  uint64 *free_space)
+{
+       double tuple_count_avg;
+       double tuple_len_avg;
+       double dead_tuple_count_avg;
+       double dead_tuple_len_avg;
+       double free_space_avg;
+
+       double tuple_count_sd;
+       double tuple_len_sd;
+       double dead_tuple_count_sd;
+       double dead_tuple_len_sd;
+       double free_space_sd;
+
+       double tuple_count_se;
+       double tuple_len_se;
+       double dead_tuple_count_se;
+       double dead_tuple_len_se;
+       double free_space_se;
+
+       int i;
+       
+       /*
+        * sample average
+        */
+       tuple_count_avg      = 0;
+       tuple_len_avg        = 0;
+       dead_tuple_count_avg = 0;
+       dead_tuple_len_avg   = 0;
+       free_space_avg       = 0;
+       
+       for (i=0 ; i<sample_size ; i++)
+       {
+               tuple_count_avg      += block_stats[i].tuple_count;
+               tuple_len_avg        += block_stats[i].tuple_len;
+               dead_tuple_count_avg += block_stats[i].dead_tuple_count;
+               dead_tuple_len_avg   += block_stats[i].dead_tuple_len;
+               free_space_avg       += block_stats[i].free_space;
+       }
+
+       tuple_count_avg      = tuple_count_avg / sample_size;
+       tuple_len_avg        = tuple_len_avg / sample_size;
+       dead_tuple_count_avg = dead_tuple_count_avg / sample_size;
+       dead_tuple_len_avg   = dead_tuple_len_avg / sample_size;
+       free_space_avg       = free_space_avg / sample_size;
+
+#ifdef NOT_USED
+       elog(NOTICE, "pgstattuple2: AVG tuple_count %.2f, tuple_len %.2f, 
dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f", 
+                tuple_count_avg,
+                tuple_len_avg,
+                dead_tuple_count_avg,
+                dead_tuple_len_avg,
+                free_space_avg);
+#endif
+
+       /*
+        * sample standard deviation
+        */
+       tuple_count_sd      = 0;
+       tuple_len_sd        = 0;
+       dead_tuple_count_sd = 0;
+       dead_tuple_len_sd   = 0;
+       free_space_sd       = 0;
+
+       for (i=0 ; i<sample_size ; i++)
+       {
+               tuple_count_sd      += pow(block_stats[i].tuple_count - 
tuple_count_avg, 2);
+               tuple_len_sd        += pow(block_stats[i].tuple_len - 
tuple_len_avg, 2);
+               dead_tuple_count_sd += pow(block_stats[i].dead_tuple_count - 
dead_tuple_count_avg, 2);
+               dead_tuple_len_sd   += pow(block_stats[i].dead_tuple_len - 
dead_tuple_len_avg, 2);
+               free_space_sd       += pow(block_stats[i].free_space - 
free_space_avg, 2);
+       }
+
+       tuple_count_sd      = sqrt(tuple_count_sd / (sample_size - 1));
+       tuple_len_sd        = sqrt(tuple_len_sd / (sample_size - 1));
+       dead_tuple_count_sd = sqrt(dead_tuple_count_sd / (sample_size - 1));
+       dead_tuple_len_sd   = sqrt(dead_tuple_len_sd / (sample_size - 1));
+       free_space_sd       = sqrt(free_space_sd / (sample_size - 1));
+
+#ifdef NOT_USED
+       elog(NOTICE, "pgstattuple2: SD tuple_count %.2f, tuple_len %.2f, 
dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f", 
+                tuple_count_sd,
+                tuple_len_sd,
+                dead_tuple_count_sd,
+                dead_tuple_len_sd,
+                free_space_sd);
+#endif
+
+       /*
+        * standard error
+        */
+       {
+               double tmp = ((double)nblocks - (double)sample_size) / 
(double)nblocks;
+
+               tuple_count_se      = tmp * tuple_count_sd / sqrt(nblocks);
+               tuple_len_se        = tmp * tuple_len_sd / sqrt(nblocks);
+               dead_tuple_count_se = tmp * dead_tuple_count_sd / sqrt(nblocks);
+               dead_tuple_len_se   = tmp * dead_tuple_len_sd / sqrt(nblocks);
+               free_space_se       = tmp * free_space_sd / sqrt(nblocks);
+       }
+
+       elog(NOTICE, "pgstattuple2: SE tuple_count %.2f, tuple_len %.2f, 
dead_tuple_count %.2f, dead_tuple_len %.2f, free_space %.2f", 
+                tuple_count_se * nblocks,
+                tuple_len_se * nblocks,
+                dead_tuple_count_se * nblocks,
+                dead_tuple_len_se * nblocks,
+                free_space_se * nblocks);
+
+       *tuple_count      = tuple_count_avg * nblocks;
+       *tuple_len        = tuple_len_avg * nblocks;
+       *dead_tuple_count = dead_tuple_count_avg * nblocks;
+       *dead_tuple_len   = dead_tuple_len_avg * nblocks;
+       *free_space       = free_space_avg * nblocks;
+}
+
+/*
+ * pgstat_heap_sample -- returns live/dead tuples info in a heap
+ */
+static Datum
+pgstat_heap_sample(Relation rel, FunctionCallInfo fcinfo)
+{
+       BlockNumber nblocks;
+       BlockNumber block;
+       pgstattuple_type stat = {0};
+       pgstattuple_block_stats block_stats[SAMPLE_SIZE];
+       int i;
+
+       nblocks = RelationGetNumberOfBlocks(rel);
+
+       for (i=0 ; i<SAMPLE_SIZE ; i++)
+       {
+               Buffer          buffer;
+               Page            page;
+               OffsetNumber pageoff;
+
+               /*
+                * sample random blocks
+                */
+               block = (double)random() / RAND_MAX * nblocks;
+
+               CHECK_FOR_INTERRUPTS();
+
+               /* Read and lock buffer */
+               buffer = ReadBuffer(rel, block);
+               LockBuffer(buffer, BUFFER_LOCK_SHARE);
+               page = BufferGetPage(buffer);
+
+               /* get block stats of the sample block */
+               block_stats[i].tuple_count      = 0;
+               block_stats[i].tuple_len        = 0;
+               block_stats[i].dead_tuple_count = 0;
+               block_stats[i].dead_tuple_len   = 0;
+               block_stats[i].free_space       = 0;
+
+               block_stats[i].free_space = PageGetFreeSpace(page);
+
+               for (pageoff = FirstOffsetNumber; pageoff <= 
PageGetMaxOffsetNumber(page); pageoff = OffsetNumberNext(pageoff))
+               {
+                       ItemId          lp = PageGetItemId(page, pageoff);
+                       HeapTupleData tup;
+
+                       memset(&tup, 0, sizeof(HeapTupleData));
+
+                       tup.t_data     = (HeapTupleHeader)PageGetItem(page, lp);
+                       tup.t_len      = ItemIdGetLength(lp);
+                       tup.t_tableOid = RelationGetRelid(rel);
+                       ItemPointerSet(&(tup.t_self), 
BufferGetBlockNumber(buffer), pageoff);
+
+                       if (ItemIdIsNormal(lp))
+                       {
+                               if (HeapTupleSatisfiesVisibility(&tup, 
SnapshotNow, buffer))
+                               {
+                                       block_stats[i].tuple_count++;
+                                       block_stats[i].tuple_len += tup.t_len;
+                               }
+                               else
+                               {
+                                       block_stats[i].dead_tuple_count++;
+                                       block_stats[i].dead_tuple_len += 
tup.t_len;
+                               }
+                       }
+               }
+               
+               LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+               ReleaseBuffer(buffer);
+       }
+
+       relation_close(rel, AccessShareLock);
+
+       compute_parameters(block_stats, SAMPLE_SIZE, nblocks,
+                                          &stat.tuple_count,
+                                          &stat.tuple_len,
+                                          &stat.dead_tuple_count,
+                                          &stat.dead_tuple_len,
+                                          &stat.free_space);
+
+       stat.table_len = (uint64) nblocks *BLCKSZ;
+
+       return build_pgstattuple_type(&stat, fcinfo);
+}
+
+Datum
+pgstattuple2(PG_FUNCTION_ARGS)
+{
+       Oid                     relid = PG_GETARG_OID(0);
+       Relation        rel;
+
+       if (!superuser())
+               ereport(ERROR,
+                               (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                                (errmsg("must be superuser to use pgstattuple 
functions"))));
+
+       /* open relation */
+       rel = relation_open(relid, AccessShareLock);
+
+       PG_RETURN_DATUM(pgstat_relation(rel, fcinfo, true));
+}

<<attachment: pgstattuple2.png>>

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] [PoC] pgstattuple2: block sampling to reduce physical read

Reply via email to