Re: [HACKERS] pgbench more operators & functions

2018-01-10 Thread Fabien COELHO



Some of the Windows buildfarm members aren't too happy with this.


Indeed.

Windows pretty-printing of doubles inserts a spurious "0" at the beginning
of the exponent (e.g. 1e+030 instead of 1e+30), which makes it look like an octal number.


Here is a patch to fix it, which I cannot test on Windows.

--
Fabien.
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index e579334..a8b2962 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -239,7 +239,7 @@ pgbench(
 		qr{command=26.: double -0.125\b},
 		qr{command=27.: double -0.00032\b},
 		qr{command=28.: double 8.50705917302346e\+0?37\b},
-		qr{command=29.: double 1e\+30\b},
+		qr{command=29.: double 1e\+0?30\b},
 		qr{command=30.: boolean false\b},
 		qr{command=31.: boolean true\b},
 		qr{command=32.: int 32\b},


Re: pgbench - add \if support

2018-01-10 Thread Fabien COELHO



A new automaton state is added to quickly step over false branches.


This one took me a little while to understand while reading the patch,
but mostly because of how diff doesn't handle moving things around.


"git diff -w --patience" may help.


Marking as ready for committer.


Here is a rebase. I made some tests use actual expressions instead of just 
0 and 1. No other changes.
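
For readers of the thread, a conditional block in a pgbench script then looks roughly like this (a sketch based on the syntax documented by the patch; the thresholds and queries are illustrative):

\set aid random(1, 100000 * :scale)
\if :aid <= 1000
SELECT abalance FROM pgbench_accounts WHERE aid = :aid;
\elif :aid <= 10000
SELECT bbalance FROM pgbench_branches WHERE bid = 1;
\else
SELECT 1;
\endif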


--
Fabien.



Re: pgbench - add \if support

2018-01-10 Thread Fabien COELHO


Here is a rebase. I made some tests use actual expressions instead of just 0 
and 1. No other changes.


Sigh. Better with the attachment. Sorry for the noise.

--
Fabien.
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 3dd492c..c203c41 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -895,6 +895,21 @@ pgbench  options  d
   
 
   
+   
+\if expression
+\elif expression
+\else
+\endif
+
+  
+  This group of commands implements nestable conditional blocks,
+  similarly to psql's \if.
+  Conditional expressions are identical to those used with \set,
+  with non-zero values interpreted as true.
+ 
+
+   
+

 
  \set varname expression
diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml
index fce7e3a..e88f782 100644
--- a/doc/src/sgml/ref/psql-ref.sgml
+++ b/doc/src/sgml/ref/psql-ref.sgml
@@ -2148,7 +2148,7 @@ hello 10
   
 
 
-  
+  
 \if expression
 \elif expression
 \else
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 31ea6ca..b5ebefc 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -32,6 +32,7 @@
 #endif			/* ! WIN32 */
 
 #include "postgres_fe.h"
+#include "fe_utils/conditional.h"
 
 #include "getopt_long.h"
 #include "libpq-fe.h"
@@ -274,6 +275,9 @@ typedef enum
 	 * and we enter the CSTATE_SLEEP state to wait for it to expire. Other
 	 * meta-commands are executed immediately.
 	 *
+	 * CSTATE_SKIP_COMMAND for conditional branches which are not executed,
+	 * quickly skip commands that do not need any evaluation.
+	 *
 	 * CSTATE_WAIT_RESULT waits until we get a result set back from the server
 	 * for the current command.
 	 *
@@ -283,6 +287,7 @@ typedef enum
 	 * command counter, and loops back to CSTATE_START_COMMAND state.
 	 */
 	CSTATE_START_COMMAND,
+	CSTATE_SKIP_COMMAND,
 	CSTATE_WAIT_RESULT,
 	CSTATE_SLEEP,
 	CSTATE_END_COMMAND,
@@ -312,6 +317,7 @@ typedef struct
 	PGconn	   *con;			/* connection handle to DB */
 	int			id;/* client No. */
 	ConnectionStateEnum state;	/* state machine's current state. */
+	ConditionalStack cstack;	/* enclosing conditionals state */
 
 	int			use_file;		/* index in sql_script for this client */
 	int			command;		/* command number in script */
@@ -400,7 +406,11 @@ typedef enum MetaCommand
 	META_SET,	/* \set */
 	META_SETSHELL,/* \setshell */
 	META_SHELL,	/* \shell */
-	META_SLEEP	/* \sleep */
+	META_SLEEP,	/* \sleep */
+	META_IF,	/* \if */
+	META_ELIF,	/* \elif */
+	META_ELSE,	/* \else */
+	META_ENDIF	/* \endif */
 } MetaCommand;
 
 typedef enum QueryMode
@@ -1587,6 +1597,7 @@ setBoolValue(PgBenchValue *pv, bool bval)
 	pv->type = PGBT_BOOLEAN;
 	pv->u.bval = bval;
 }
+
 /* assign an integer value */
 static void
 setIntValue(PgBenchValue *pv, int64 ival)
@@ -2295,6 +2306,14 @@ getMetaCommand(const char *cmd)
 		mc = META_SHELL;
 	else if (pg_strcasecmp(cmd, "sleep") == 0)
 		mc = META_SLEEP;
+	else if (pg_strcasecmp(cmd, "if") == 0)
+		mc = META_IF;
+	else if (pg_strcasecmp(cmd, "elif") == 0)
+		mc = META_ELIF;
+	else if (pg_strcasecmp(cmd, "else") == 0)
+		mc = META_ELSE;
+	else if (pg_strcasecmp(cmd, "endif") == 0)
+		mc = META_ENDIF;
 	else
 		mc = META_NONE;
 	return mc;
@@ -2416,11 +2435,11 @@ preparedStatementName(char *buffer, int file, int state)
 }
 
 static void
-commandFailed(CState *st, const char *message)
+commandFailed(CState *st, const char *cmd, const char *message)
 {
 	fprintf(stderr,
-			"client %d aborted in command %d of script %d; %s\n",
-			st->id, st->command, st->use_file, message);
+			"client %d aborted in command %d (%s) of script %d; %s\n",
+			st->id, st->command, cmd, st->use_file, message);
 }
 
 /* return a script number with a weighted choice. */
@@ -2608,6 +2627,8 @@ doCustom(TState *thread, CState *st, StatsData *agg)
 	st->state = CSTATE_START_THROTTLE;
 else
 	st->state = CSTATE_START_TX;
+/* check consistency */
+Assert(conditional_stack_empty(st->cstack));
 break;
 
 /*
@@ -2773,7 +2794,7 @@ doCustom(TState *thread, CState *st, StatsData *agg)
 {
 	if (!sendCommand(st, command))
 	{
-		commandFailed(st, "SQL command send failed");
+		commandFailed(st, "SQL", "SQL command send failed");
 		st->state = CSTATE_ABORTED;
 	}
 	else
@@ -2806,7 +2827,7 @@ doCustom(TState *thread, CState *st, StatsData *agg)
 
 		if (!evaluateSleep(st, argc, argv, &usec))
 		{
-			commandFailed(st, "execution of meta-command 'sleep' failed");
+			commandFailed(st, "sleep", "execution of meta-command failed");
 			st->state = CSTATE_ABORTED;
 			break;
 		}
@@ -2817,77 +2838,209 @@ doCustom(TState *thread, CState *st, StatsData *agg)
 		st->state = CSTATE_SLEEP;
 		break;
 	}
-	else
+	else if (command->meta == META_SET ||
+			 command->meta == META_IF ||
+			 co

Re: [HACKERS] pgbench - allow to store select results into variables

2018-01-10 Thread Fabien COELHO



Here is a v14, after yet another rebase, and some comments added to answer 
your new comments.


Attached v15 is a simple rebase after Teodor's push of the new functions &
operators in pgbench.
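
As a quick illustration (not from the patch itself): once \cset or \gset has stored the columns of a query, the values can be reused like any other pgbench variables in later commands of the same script, e.g.:

SELECT 1 AS one, 2 AS two \cset
SELECT 3 AS three \gset p_
SELECT :one + :two + :p_three AS total;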


--
Fabien.
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 3dd492c..b51399e 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -895,6 +895,51 @@ pgbench  options  d
   
 
   
+   
+
+ \cset [prefix] or
+ \gset [prefix]
+
+
+
+ 
+  These commands may be used to end SQL queries, replacing a semicolon.
+  \cset replaces an embedded semicolon (\;) within
+  a compound SQL command, and \gset replaces the final
+  semicolon (;) which ends the SQL command.
+ 
+
+ 
+  When these commands are used, the preceding SQL query is expected to
+  return one row, the columns of which are stored into variables named after
+  column names, and prefixed with prefix if provided.
+ 
+
+ 
+  The following example puts the final account balance from the first query
+  into variable abalance, and fills variables
+  one, two and
+  p_three with integers from a compound query.
+
+UPDATE pgbench_accounts
+  SET abalance = abalance + :delta
+  WHERE aid = :aid
+  RETURNING abalance \gset
+-- compound of two queries
+SELECT 1 AS one, 2 AS two \cset
+SELECT 3 AS three \gset p_
+
+ 
+
+ 
+  
+\cset and \gset commands do not work when
+empty SQL queries appear within a compound SQL command.
+  
+ 
+
+   
+

 
  \set varname expression
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 31ea6ca..e6daa9f 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -416,12 +416,15 @@ static const char *QUERYMODE[] = {"simple", "extended", "prepared"};
 
 typedef struct
 {
-	char	   *line;			/* text of command line */
+	char	   *line;			/* first line for short display */
+	char	   *lines;			/* full multi-line text of command */
 	int			command_num;	/* unique index of this Command struct */
 	int			type;			/* command type (SQL_COMMAND or META_COMMAND) */
 	MetaCommand meta;			/* meta command identifier, or META_NONE */
 	int			argc;			/* number of command words */
 	char	   *argv[MAX_ARGS]; /* command word list */
+	int		compound;   /* last compound command (number of \;) */
+	char	  **gset;   /* per-compound command prefix */
 	PgBenchExpr *expr;			/* parsed expression, if needed */
 	SimpleStats stats;			/* time spent in this command */
 } Command;
@@ -1523,6 +1526,104 @@ valueTruth(PgBenchValue *pval)
 	}
 }
 
+/* read all responses from backend */
+static bool
+read_response(CState *st, char **gset)
+{
+	PGresult   *res;
+	int			compound = 0;
+
+	while ((res = PQgetResult(st->con)) != NULL)
+	{
+		switch (PQresultStatus(res))
+		{
+			case PGRES_COMMAND_OK: /* non-SELECT commands */
+			case PGRES_EMPTY_QUERY: /* may be used for testing no-op overhead */
+if (gset[compound] != NULL)
+{
+	fprintf(stderr,
+			"client %d file %d command %d compound %d: "
+			"\\gset expects a row\n",
+			st->id, st->use_file, st->command, compound);
+	st->ecnt++;
+	return false;
+}
+break; /* OK */
+
+			case PGRES_TUPLES_OK:
+if (gset[compound] != NULL)
+{
+	/* store result into variables */
+	int ntuples = PQntuples(res),
+		nfields = PQnfields(res),
+		f;
+
+	if (ntuples != 1)
+	{
+		fprintf(stderr,
+"client %d file %d command %d compound %d: "
+"expecting one row, got %d\n",
+st->id, st->use_file, st->command, compound, ntuples);
+		st->ecnt++;
+		PQclear(res);
+		discard_response(st);
+		return false;
+	}
+
+	for (f = 0; f < nfields ; f++)
+	{
+		char *varname = PQfname(res, f);
+		if (*gset[compound] != '\0')
+			varname = psprintf("%s%s", gset[compound], varname);
+
+		/* store result as a string */
+		if (!putVariable(st, "gset", varname,
+		 PQgetvalue(res, 0, f)))
+		{
+			/* internal error, should it rather abort? */
+			fprintf(stderr,
+	"client %d file %d command %d compound %d: "
+	"error storing into var %s\n",
+	st->id, st->use_file, st->command, compound,
+	varname);
+			st->ecnt++;
+			PQclear(res);
+			discard_response(st);
+			return false;
+		}
+
+		if (*gset[compound] != '\0')
+			free(varname);
+	}
+}
+break;	/* OK */
+
+			default:
+/* everything else is unexpected, so probably an error */
+fprintf(stderr,
+		"client %d file %d aborted in command %d compound %d: %s",
+		st->id, st->use_file, st->command, compound,
+		PQerrorMessage(st->con));
+st->ecnt++;
+PQclear(res);
+discard_response(st);
+return false;
+		}
+
+		PQclear(res);
+		compound += 1;
+	}
+
+	if (compound == 0)
+	{
+		fprintf(stderr, "client %d command %d: no results\n", 

Re: AS OF queries

2018-01-10 Thread Konstantin Knizhnik



On 02.01.2018 21:12, Peter Eisentraut wrote:

On 12/29/17 06:28, Konstantin Knizhnik wrote:

   Can there be apparent RI violations?

Right now AS OF is used only in selects, not in update statements. So I
do not understand how integrity constraints can be violated.

I mean, if you join tables connected by a foreign key, you can expect a
certain shape of result, for example at least one match per PK row.  But
if you select from each table "as of" a different timestamp, then that
won't hold.  That could also throw off any optimizations we might come
up with in that area, such as cross-table statistics.  Not saying it
can't or shouldn't be done, but there might be some questions.


Now I understand your statement. Yes, combining different timelines in
the same query can violate integrity constraints.
In theory there can be query plans which will be executed
incorrectly because of such a constraint violation.
I do not know of concrete examples of such plans right now, but I cannot
prove that such a problem cannot happen.
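
To make the concern concrete, here is a hypothetical sketch (assuming a per-table AS OF clause as in the proposal; the exact syntax, table and column names are illustrative):

SELECT o.order_id, c.name
FROM orders o
JOIN customers AS OF TIMESTAMP now() - interval '1 day' c
  ON c.id = o.customer_id;
-- an order referencing a customer row created two hours ago finds no
-- match in the day-old customers snapshot, so the row silently drops
-- out of the join even though the foreign key holds in the current data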





  What happens if no old data for the
selected AS OF is available?

It will just return the version closest to the specified timestamp.

That seems strange.  Shouldn't that be an error?


I will add an option to raise an error in this case.
I do not want to always throw an error, because Postgres is very
conservative in reclaiming old space, and the fact that a version is not
used by any snapshot doesn't mean that it will be immediately deleted.
So there is still a chance to pick up old data even though it is outside the
specified time travel period.






  How does this interact with catalog
changes, such as changes to row-level security settings?  (Do we apply
the current or the past settings?)

Catalog changes are not currently supported.
And I do not have a good understanding of how to support them if a query involves
two different timeslices with different versions of the table.
Too many places in the parser/optimizer would have to be changed to support such
"historical collisions".

Right, it's probably very hard to do.  But I think it somehow should be
recognized that catalog changes took place between the selected
timestamp(s) and now and an error or notice should be produced.

There is one challenge: right now AS OF timestamps are not required to
be constants: they can be calculated dynamically during query execution.
So at query compilation time it is not possible to check whether
the specified timestamps observe catalog changes or not.


--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: AS OF queries

2018-01-10 Thread Konstantin Knizhnik



On 03.01.2018 23:49, legrand legrand wrote:

Maybe that a simple check of the asof_timestamp value like:

asof_timestamp >= now() - time_travel_period
AND
asof_timestamp >= latest_table_ddl

would make it possible to raise a warning or an error message saying that the query result
cannot be guaranteed with this asof_timestamp value.


latest_table_ddl being found with

SELECT greatest( max(pg_xact_commit_timestamp( rel.xmin )),
max(pg_xact_commit_timestamp( att.xmin ))) as latest_table_ddl
FROM  pg_catalog.pg_attribute att   
INNER JOIN pg_catalog.pg_class rel  
ON att.attrelid = rel.oid WHERE rel.relname = '' and
rel.relowner= ...

(tested with add/alter/drop column and drop/create/truncate table)


Well, it can be done.
But performing this query on each access to the table seems to be a bad
idea: in the case of a nested loop join it can cause significant performance
degradation.
The obvious solution is to calculate this latest_table_ddl timestamp
once and store it somewhere (in ScanState?)

But I am not sure that this check is actually needed.
If the table is changed in some incompatible way, then we will get an error in
any case.
If the table change is not critical for this query (for example, a column
which is not used in this query was added or removed),
then should we really throw an error in this case?

--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company




Re: [HACKERS] Surjective functional indexes

2018-01-10 Thread Konstantin Knizhnik



On 07.01.2018 01:59, Stephen Frost wrote:

Greetings,

* Konstantin Knizhnik (k.knizh...@postgrespro.ru) wrote:

On 15.12.2017 01:21, Michael Paquier wrote:

On Fri, Dec 15, 2017 at 6:15 AM, Alvaro Herrera  wrote:

Konstantin Knizhnik wrote:

If you still think that an additional 16 bytes per relation in the statistics is too
high an overhead, then I will also remove autotune.

I think it's pretty clear that these additional bytes are excessive.

The bar to add new fields in PgStat_TableCounts is very high, and one
attempt to tackle its scaling problems with many relations is here by
Horiguchi-san:
https://www.postgresql.org/message-id/20171211.201523.24172046.horiguchi.kyot...@lab.ntt.co.jp
His patch may be worth a look if you need more fields for your
feature. So it seems to me that the patch as currently presented has
close to zero chance to be committed as long as you keep your changes
to pgstat.c.

Ok, it looks like everybody thinks that autotune based on statistics is a bad idea.
Attached please find a patch without autotune.

This patch appears to apply with a reasonable amount of fuzz, builds,
and passes 'make check', at least, therefore I'm going to mark it
'Needs Review'.

I will note that the documentation doesn't currently build due to this:

/home/sfrost/git/pg/dev/cleanup/doc/src/sgml/ref/create_index.sgml:302: parser 
error : Opening and ending tag mismatch: literal line 302 and unparseable
 recheck_on_update

but I don't think it makes sense for that to stand in the way of someone
doing a review of the base patch.  Still, please do fix the
documentation build when you get a chance.

Thanks!

Stephen

Sorry, the documentation issue is fixed.
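
For reference, with the patch below the setting is an index storage parameter, so it would be specified at index creation time roughly as follows (a sketch only; the table name is illustrative and the expression is the one used in the documentation text):

CREATE INDEX books_isbn_idx ON books ((bookinfo->>'isbn'))
  WITH (recheck_on_update = true);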



--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 0255375..acccaa5 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -294,8 +294,33 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] 
 The optional WITH clause specifies storage
 parameters for the index.  Each index method has its own set of allowed
-storage parameters.  The B-tree, hash, GiST and SP-GiST index methods all
-accept this parameter:
+storage parameters.  All indexes accept the following parameter:
+   
+
+   
+   
+recheck_on_update
+
+ 
+   A functional index is based on a projection function: a function which extracts a subset of its argument.
+   In mathematics such functions are called non-injective. For an injective function, if any attribute used in the indexed
+   expression is changed, then the value of the index expression is also changed, so to check whether the index is affected by an
+   update it is enough to check the set of changed fields. By default this parameter is set to true and the function is considered
+   non-injective.
+   In that case a change of an indexed key does not necessarily mean that the value of the function has changed. For example, an
+   expression such as (bookinfo->>'isbn') defined
+   on a column of JSON type changes only when the ISBN changes, which rarely happens. The same is true for most
+   functional indexes. For non-injective functions, Postgres compares the values of the indexed expression for the old and updated tuples and updates
+   the index only when the results differ. This makes it possible to skip the index update and use a HOT update,
+   at the cost of extra evaluations of the function. So if the function is expensive, or if a change of the indexed column is unlikely
+   to leave the function value unchanged, then disabling recheck_on_update may increase update speed.
+
+
+   
+   
+
+   
+ The B-tree, hash, GiST and SP-GiST index methods all accept this parameter:

 

diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index aa9c0f1..1aaab78 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -131,6 +131,15 @@ static relopt_bool boolRelOpts[] =
 	},
 	{
 		{
+			"recheck_on_update",
+			"Evaluate functional index expression on update to check if its value has changed",
+			RELOPT_KIND_INDEX,
+			AccessExclusiveLock
+		},
+		true
+	},
+	{
+		{
 			"security_barrier",
 			"View acts as a row security barrier",
 			RELOPT_KIND_VIEW,
@@ -1311,7 +1320,7 @@ fillRelOptions(void *rdopts, Size basesize,
 break;
 			}
 		}
-		if (validate && !found)
+		if (validate && !found && options[i].gen->kinds != RELOPT_KIND_INDEX)
 			elog(ERROR, "reloption \"%s\" not found in parse table",
  options[i].gen->name);
 	}
@@ -1468,6 +1477,40 @@ index_reloptions(amoptions_function amoptions, Datum reloptions, bool validate)
 }
 
 /*
+ * Parse generic options for all indexes.
+ *
+ *	reloptions	options as text[] datum
+ *	validate	error flag
+ */
+bytea *
+index_generic_reloptions(Datum reloptions, bool validate)
+{

Re: Re: [HACKERS] pgbench randomness initialization

2018-01-10 Thread Fabien COELHO



This is a simple patch that does what it says on the tin. I ran into
trouble with the pgbench TAP test *even before applying the patch*, but
only because I was doing a VPATH build as a user without 'write'
on the source tree (001_pgbench_with_server.pl tried to make pgbench
create log files there). Bad me. Oddly, that was the only test in the
whole tree to have such an issue, so here I add a pre-patch to fix that.
Now my review needs a review. :)


Yep. I find the multiple chdir solution a little bit too extreme.

ISTM that it should rather add the correct path to --log-prefix by 
prepending $node->basedir, like the pgbench function does for -f scripts.


See attached.

--
Fabien.
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index e579334..ba7e363 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -611,24 +611,26 @@ sub check_pgbench_logs
 	ok(unlink(@logs), "remove log files");
 }
 
+my $bdir = $node->basedir;
+
 # with sampling rate
 pgbench(
-'-n -S -t 50 -c 2 --log --log-prefix=001_pgbench_log_2 --sampling-rate=0.5',
+"-n -S -t 50 -c 2 --log --log-prefix=$bdir/001_pgbench_log_2 --sampling-rate=0.5",
 	0,
 	[ qr{select only}, qr{processed: 100/100} ],
 	[qr{^$}],
 	'pgbench logs');
 
-check_pgbench_logs('001_pgbench_log_2', 1, 8, 92,
+check_pgbench_logs("$bdir/001_pgbench_log_2", 1, 8, 92,
 	qr{^0 \d{1,2} \d+ \d \d+ \d+$});
 
 # check log file in some detail
 pgbench(
-	'-n -b se -t 10 -l --log-prefix=001_pgbench_log_3',
+	"-n -b se -t 10 -l --log-prefix=$bdir/001_pgbench_log_3",
 	0, [ qr{select only}, qr{processed: 10/10} ],
 	[qr{^$}], 'pgbench logs contents');
 
-check_pgbench_logs('001_pgbench_log_3', 1, 10, 10,
+check_pgbench_logs("$bdir/001_pgbench_log_3", 1, 10, 10,
 	qr{^\d \d{1,2} \d+ \d \d+ \d+$});
 
 # done


Re: Re: [HACKERS] pgbench randomness initialization

2018-01-10 Thread Fabien COELHO


Here is a rebase, plus some more changes:

I have improved the error message to tell from where the value was 
provided.


I have removed the test of the exact values produced by the expression
test run.


I have added a test which runs from the same seed value several times
and checks that the output values are the same.

--
Fabien.
diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index 3dd492c..22ebb51 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -680,6 +680,45 @@ pgbench  options  d
  
 
  
+  --random-seed=SEED
+  
+   
+Set random generator seed.  Seeds the system random number generator,
+which then produces a sequence of initial generator states, one for
+each thread.
+Values for SEED may be:
+time (the default, the seed is based on the current time),
+rand (use a strong random source, failing if none
+is available), or any unsigned octal (012470),
+decimal (5432) or
+hexadecimal (0x1538) integer value.
+The random generator is invoked explicitly from a pgbench script
+(random... functions) or implicitly (for instance option
+--rate uses it to schedule transactions).
+When explicitly set, the value used for seeding is shown on the terminal.
+Any value allowed for SEED may also be
+provided through the environment variable
+PGBENCH_RANDOM_SEED.
+To ensure that the provided seed impacts all possible uses, put this option
+first or use the environment variable.
+  
+  
+Setting the seed explicitly makes it possible to reproduce a pgbench
+run exactly, as far as random numbers are concerned.
+As the random state is managed per thread, this yields the exact same
+pgbench run for an identical invocation, provided there is one
+client per thread and there are no external or data dependencies.
+From a statistical viewpoint reproducing runs exactly is a bad idea because
+it can hide performance variability or improve performance unduly,
+e.g. by hitting the same pages as a previous run.
+However, it may also be of great help for debugging, for instance
+re-running a tricky case which leads to an error.
+Use wisely.
+   
+  
+ 
+
+ 
   --sampling-rate=rate
   

diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 31ea6ca..206dfd5 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -560,6 +560,7 @@ usage(void)
 		   "  --log-prefix=PREFIX  prefix for transaction time log file\n"
 		   "   (default: \"pgbench_log\")\n"
 		   "  --progress-timestamp use Unix epoch timestamps for progress\n"
+		   "  --random-seed=SEED   set random seed (\"time\", \"rand\", integer)\n"
 		   "  --sampling-rate=NUM  fraction of transactions to log (e.g., 0.01 for 1%%)\n"
 		   "\nCommon options:\n"
 		   "  -d, --debug  print debugging output\n"
@@ -4362,6 +4363,55 @@ printResults(TState *threads, StatsData *total, instr_time total_time,
 	}
 }
 
+/* call srandom based on some seed. NULL triggers the default behavior. */
+static void
+set_random_seed(const char *seed, const char *origin)
+{
+	unsigned int iseed;
+
+	if (seed == NULL || strcmp(seed, "time") == 0)
+	{
+		/* rely on current time */
+		instr_time	now;
+		INSTR_TIME_SET_CURRENT(now);
+		iseed = (unsigned int) INSTR_TIME_GET_MICROSEC(now);
+	}
+	else if (strcmp(seed, "rand") == 0)
+	{
+		/* use some "strong" random source */
+		if (!pg_strong_random(&iseed, sizeof(iseed)))
+		{
+			fprintf(stderr, "cannot seed random from a strong source\n");
+			exit(1);
+		}
+	}
+	else
+	{
+		/* parse uint seed value */
+		char garbage;
+		if (!
+			/* hexa */
+			((seed[0] == '0' && (seed[1] == 'x' || seed[1] == 'X') &&
+			 sscanf(seed, "%x%c", &iseed, &garbage) == 1) ||
+			 /* octal */
+			(seed[0] == '0' && seed[1] != 'x' && seed[1] != 'X' &&
+			 sscanf(seed, "%o%c", &iseed, &garbage) == 1) ||
+			 /* decimal */
+			 (seed[0] != '0' &&
+			  sscanf(seed, "%u%c", &iseed, &garbage) == 1)))
+		{
+			fprintf(stderr,
+	"error while scanning '%s' from %s, expecting an unsigned integer, 'time' or 'rand'\n",
+	seed, origin);
+			exit(1);
+		}
+	}
+
+	if (seed != NULL)
+		fprintf(stderr, "setting random seed to %u\n", iseed);
+	srandom(iseed);
+}
+
 
 int
 main(int argc, char **argv)
@@ -4404,6 +4454,7 @@ main(int argc, char **argv)
 		{"progress-timestamp", no_argument, NULL, 6},
 		{"log-prefix", required_argument, NULL, 7},
 		{"foreign-keys", no_argument, NULL, 8},
+		{"random-seed", required_argument, NULL, 9},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -4472,6 +4523,9 @@ main(int argc, char **argv)
 	state = (CState *) pg_malloc(sizeof(CState));
 	memset(state, 0, sizeof(CState));
 
+	/* set random seed early, because it may be used while parsing scripts.

Re: pgsql: Improve scripting language in pgbench

2018-01-10 Thread Fabien COELHO


Hello Teodor,

I just noticed while rebasing stuff that there is some crust in 
"pgbench/t/001_pgbench_with_server.pl" coming from this patch:


 +=head
 +
 +} });
 +
 +=cut

I cannot find any use for these lines, which are ignored by perl execution
anyway. They may be leftovers from debugging which got past everyone.
If so, I think they are better removed; see the attached cleanup patch.


--
Fabien.
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index a8b2962..99286f6 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -340,12 +340,6 @@ pgbench(
 SELECT :v0, :v1, :v2, :v3;
 } });
 
-=head
-
-} });
-
-=cut
-
 # backslash commands
 pgbench(
 	'-t 1', 0,


Re: AS OF queries

2018-01-10 Thread legrand legrand
> But performing this query on each access to the table seems to be bad
> idea: in case of nested loop join it can cause significant degrade of
> performance.

this could be a pre-plan / pre-exec check, no more.

> But I am not sure that this check is actually needed.
> If table is changed in some incompatible way, then we will get error in
> any case.

It seems that with path v3, a query with asof_timestamp 
set before a truncate or alter table doesn't throw any error, 
just gives an empty result (even if there was data).

> If table change is not critical for this query (for example some column
> was added or removed which is not used in this query),
> then should we really throw error in this case? 

no error is needed if result is correct.

Regards
PAscal



--
Sent from: http://www.postgresql-archive.org/PostgreSQL-hackers-f1928748.html



Re: General purpose hashing func in pgbench

2018-01-10 Thread Ildar Musin

09/01/2018 23:11, Fabien COELHO wrote:
>
> Hello Ildar,
>
>> Sorry for the long delay. I've added a hash() function which is just an
>> alias for murmur2. I've also used the variable-arguments feature from the
>> least()/greatest() functions to make the seed parameter optional, but I
>> consider this a hack.
>
> Patch needs a rebase after Teodor push for a set of pgbench functions.
Done. Congratulations on your patch finally being committed : )
>
>> Should we probably add some infrastructure for optional arguments?
>
> You can look at the handling of "CASE" which may or may not have an
> "ELSE" clause.
>
> I'd suggest you use a new negative argument with the special meaning
> for hash, and create the seed value when missing when building the
> function, so as to simplify the executor code.
Added a new nargs option -3 for hash functions and moved the arguments check
to the parser. It's starting to look a bit odd and I'm thinking about
replacing the bare numbers (-1, -2, -3) with #defined macros, e.g.:

#define PGBENCH_NARGS_VARIABLE (-1)
#define PGBENCH_NARGS_CASE (-2)
#define PGBENCH_NARGS_HASH (-3)
>
> Instead of 0, I'd consider providing a random default so that the
> hashing behavior is not the same from one run to the next. What do you
> think?
>
Makes sense, since each thread also initializes its random_state with
random numbers before starting. So I added a global variable 'hash_seed' and
initialize it with random() before the threads are spawned.
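
With the names used in the attached patch, the functions would then be called from a script roughly like this (a sketch; the variables and the explicit seed value are illustrative): hash() is the murmur2 alias, the optional second argument is the seed, and when it is omitted the global hash_seed chosen at startup is used.

\set aid random(1, 100000)
\set k1 hash(:aid)
\set k2 hash_murmur2(:aid, 5432)
\set k3 hash_fnv1a(:aid)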
> Like the previous version, murmur2 with seed looks much more random
> than fnv with seed...
>

-- 
Ildar Musin
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company 




Re: General purpose hashing func in pgbench

2018-01-10 Thread Ildar Musin

10/01/2018 16:35, Ildar Musin wrote:
> 09/01/2018 23:11, Fabien COELHO wrote:
>> Hello Ildar,
>>
>>> Sorry for a long delay. I've added hash() function which is just an
>>> alias for murmur2. I've also utilized variable arguments feature from
>>> least()/greatest() functions to make optional seed parameter, but I
>>> consider this as a hack.
>> Patch needs a rebase after Teodor push for a set of pgbench functions.
> Done. Congratulations on your patch finally being committed : )
>>> Should we probably add some infrastructure for optional arguments?
>> You can look at the handling of "CASE" which may or may not have an
>> "ELSE" clause.
>>
>> I'd suggest you use a new negative argument with the special meaning
>> for hash, and create the seed value when missing when building the
>> function, so as to simplify the executor code.
> Added a new nargs option -3 for hash functions and moved arguments check
> to parser. It's starting to look a bit odd and I'm thinking about
> replacing bare numbers (-1, -2, -3) with #defined macros. E.g.:
>
> #define PGBENCH_NARGS_VARIABLE (-1)
> #define PGBENCH_NARGS_CASE (-2)
> #define PGBENCH_NARGS_HASH (-3)
>> Instead of 0, I'd consider providing a random default so that the
>> hashing behavior is not the same from one run to the next. What do you
>> think?
>>
> Makes sense since each thread also initializes its random_state with
> random numbers before start. So I added global variable 'hash_seed' and
> initialize it with random() before threads spawned.
>> Like the previous version, murmur2 with seed looks much more random
>> than fnv with seed...
>>
Sorry, here is the patch

-- 
Ildar Musin
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company 

diff --git a/src/bin/pgbench/exprparse.y b/src/bin/pgbench/exprparse.y
index e23ca51..847b011 100644
--- a/src/bin/pgbench/exprparse.y
+++ b/src/bin/pgbench/exprparse.y
@@ -349,6 +349,15 @@ static const struct
{
"!case_end", -2, PGBENCH_CASE
},
+   {
+   "hash", -3, PGBENCH_HASH_MURMUR2
+   },
+   {
+   "hash_murmur2", -3, PGBENCH_HASH_MURMUR2
+   },
+   {
+   "hash_fnv1a", -3, PGBENCH_HASH_FNV1A
+   },
/* keep as last array element */
{
NULL, 0, 0
@@ -447,6 +456,15 @@ make_func(yyscan_t yyscanner, int fnumber, PgBenchExprList 
*args)
expr_yyerror_more(yyscanner, "odd and >= 3 number of 
arguments expected",
  "case control 
structure");
}
+   /* special case: hash functions with optional arguments */
+   if (PGBENCH_FUNCTIONS[fnumber].nargs == -3)
+   {
+   int len = elist_length(args);
+
+   if (len < 1 || len > 2)
+   expr_yyerror_more(yyscanner, "unexpected number of 
arguments",
+ 
PGBENCH_FUNCTIONS[fnumber].fname);
+   }
 
expr->etype = ENODE_FUNCTION;
expr->u.function.function = PGBENCH_FUNCTIONS[fnumber].tag;
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 31ea6ca..6bf94cc 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -61,6 +61,14 @@
 #define ERRCODE_UNDEFINED_TABLE  "42P01"
 
 /*
+ * Hashing constants
+ */
+#define FNV_PRIME 0x100000001b3
+#define FNV_OFFSET_BASIS 0xcbf29ce484222325
+#define MM2_MUL 0xc6a4a7935bd1e995
+#define MM2_ROT 47
+
+/*
  * Multi-platform pthread implementations
  */
 
@@ -439,6 +447,8 @@ static int  num_scripts;/* number of scripts in 
sql_script[] */
 static int num_commands = 0;   /* total number of Command structs */
 static int64 total_weight = 0;
 
+static int hash_seed;  /* default seed used in hash functions 
*/
+
 static int debug = 0;  /* debug flag */
 
 /* Builtin test scripts */
@@ -915,6 +925,51 @@ getZipfianRand(TState *thread, int64 min, int64 max, 
double s)
 }
 
 /*
+ * FNV-1a hash function
+ */
+static int64
+getHashFnv1a(int64 val, uint64 seed)
+{
+   int64   result;
+   int i;
+
+   result = FNV_OFFSET_BASIS ^ seed;
+   for (i = 0; i < 8; ++i)
+   {
+   int32 octet = val & 0xff;
+
+   val = val >> 8;
+   result = result ^ octet;
+   result = result * FNV_PRIME;
+   }
+
+   return result;
+}
+
+/*
+ * Murmur2 hash function
+ */
+static int64
+getHashMurmur2(int64 val, uint64 seed)
+{
+   uint64  result = seed ^ (sizeof(int64) * MM2_MUL);
+   uint64  k = (uint64) val;
+
+   k *= MM2_MUL;
+   k ^= k >> MM2_ROT;
+   k *= MM2_MUL;
+
+   result ^= k;
+   result *= MM2_MUL;
+
+   result ^= result >> MM2_ROT;
+   result *= MM2_MUL;
+   result ^= result >> MM2_ROT;
+
+   return (int64) result;
+}
+
+/*
  * Initialize the given SimpleStats struct to all zeroes
  */
 static void
@@ -2209,

Re: AS OF queries

2018-01-10 Thread Konstantin Knizhnik



On 10.01.2018 16:02, legrand legrand wrote:

But performing this query on each access to the table seems to be bad
idea: in case of nested loop join it can cause significant degrade of
performance.

this could be a pre-plan / pre-exec check, no more.


The AS OF timestamp can be taken from an outer table, so it would be necessary to
repeat this check at each nested loop join iteration.





But I am not sure that this check is actually needed.
If table is changed in some incompatible way, then we will get error in
any case.

It seems that with path v3, a query with asof_timestamp
set before a truncate or alter table doesn't throw any error,
just gives an empty result (even if there was data).


Sorry, truncate is not compatible with AS OF. It is performed at the file
level and deletes the old versions.

So if you want to use time travel, you should not use truncate.




If table change is not critical for this query (for example some column
was added or removed which is not used in this query),
then should we really throw error in this case?

no error is needed if result is correct.


Does that mean that no explicit check is needed that the table metadata was
not changed after the specified timeslice?



Attached please find a new version of the AS OF patch, which throws an error
if the specified AS OF timestamp is older than the time travel horizon and
the "check_asof_timestamp" parameter is set to true (by default it is
switched off).



--
Konstantin Knizhnik
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c
index 837abc0..3ac7868 100644
--- a/src/backend/executor/execScan.c
+++ b/src/backend/executor/execScan.c
@@ -21,7 +21,8 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/memutils.h"
-
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
 
 
 /*
@@ -296,3 +297,43 @@ ExecScanReScan(ScanState *node)
 		}
 	}
 }
+
+/*
+ * Evaluate ASOF timestamp,
+ * check that it belongs to the time travel period (if specified)
+ * and assign it to snapshot.
+ * This function throws error if specified snapshot is out of
+ * time_travel_period and check_asof_timestamp parameter is true
+ */
+void ExecAsofTimestamp(EState* estate, ScanState* ss)
+{
+	if (ss->asofExpr)
+	{
+		if (!ss->asofTimestampSet)
+		{
+			Datum		val;
+			bool		isNull;
+
+			val = ExecEvalExprSwitchContext(ss->asofExpr,
+			ss->ps.ps_ExprContext,
+			&isNull);
+			if (isNull)
+			{
+/* Interpret NULL timestamp as no timestamp */
+ss->asofTimestamp = 0;
+			}
+			else
+			{
+ss->asofTimestamp = DatumGetInt64(val);
+if (check_asof_timestamp && time_travel_period > 0)
+{
+	TimestampTz horizon = GetCurrentTimestamp()	- (TimestampTz)time_travel_period*USECS_PER_SEC;
+	if (timestamptz_cmp_internal(horizon, ss->asofTimestamp) > 0)
+		elog(ERROR, "Specified AS OF timestamp is out of time travel horizon");
+}
+			}
+			ss->asofTimestampSet = true;
+		}
+		estate->es_snapshot->asofTimestamp = ss->asofTimestamp;
+	}
+}
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index eb5bbb5..b880c18 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -78,6 +78,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	ExprContext *econtext;
 	HeapScanDesc scan;
 	TIDBitmap  *tbm;
+	EState	   *estate;
 	TBMIterator *tbmiterator = NULL;
 	TBMSharedIterator *shared_tbmiterator = NULL;
 	TBMIterateResult *tbmres;
@@ -85,11 +86,13 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	TupleTableSlot *slot;
 	ParallelBitmapHeapState *pstate = node->pstate;
 	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;
+	TimestampTz outerAsofTimestamp = 0;
 
 	/*
 	 * extract necessary information from index scan node
 	 */
 	econtext = node->ss.ps.ps_ExprContext;
+	estate = node->ss.ps.state;
 	slot = node->ss.ss_ScanTupleSlot;
 	scan = node->ss.ss_currentScanDesc;
 	tbm = node->tbm;
@@ -99,6 +102,9 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		shared_tbmiterator = node->shared_tbmiterator;
 	tbmres = node->tbmres;
 
+	outerAsofTimestamp = estate->es_snapshot->asofTimestamp;
+	ExecAsofTimestamp(estate, &node->ss);
+
 	/*
 	 * If we haven't yet performed the underlying index scan, do it, and begin
 	 * the iteration over the bitmap.
@@ -364,11 +370,21 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			}
 		}
 
-		/* OK to return this tuple */
+		/*
+		 * Restore ASOF timestamp for the current snapshot
+		 */
+		estate->es_snapshot->asofTimestamp = outerAsofTimestamp;
+
+	/* OK to return this tuple */
 		return slot;
 	}
 
 	/*
+	 * Restore ASOF timestamp for the current snapshot
+	 */
+	estate->es_snapshot->asofTimestamp = outerAsofTimestamp;
+
+	/*
 	 * if we get here it means we are at the end of the scan..
 	 */
 	return ExecClearTuple(slot);
@@ -746,6 +762,8 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
 {
 	PlanState  *outerPlan

Re: [HACKERS] PATCH: psql tab completion for SELECT

2018-01-10 Thread Vik Fearing
On 01/10/2018 06:38 AM, Edmund Horner wrote:
> Hi Vik, thanks so much for the comments and the offer to review!
> 
> I kept a low profile after my first message as there was already a
> commitfest in progress, but now I'm working on a V2 patch.
> 
> I will include aggregate and window functions as you suggest.  And
> I'll exclude trigger functions.

Thanks.

> I'd like to exclude more if we could, because there are already over
> 1000 completions on an empty database.  I had thought of filtering out
> functions with an argument of type internal but couldn't figure out
> how to interrogate the proargtypes oidvector in a nice way.

If you just want to do internal,

WHERE regtype 'internal' <> ALL (proargtypes)

but if you'll want to add more banned types, then

WHERE NOT (proargtypes::regtype[] && array['internal',
'unknown']::regtype[])

This is a very good test to add.

> Regarding support for older versions, psql fails silently if a tab
> completion query fails.

No it doesn't, see below.

> We could just let it do this, which is what
> happens with, for example, ALTER PUBLICATION against a 9.6 server.  I
> can't see any existing completions that check the server version --
> but completions that don't work against older versions, like ALTER
> PUBLICATION, also aren't useful for older versions.  SELECT is a bit
> different as it can be useful against older versions that don't have
> the pg_aggregate.aggcombinefn that my query uses for filtering out
> aggregation support functions.

That's a bug in my opinion, but not one that needs to be addressed by
this patch.

There are no completions that cater to the server version (yet) but all
the \d stuff certainly does.  You can see that in action in
src/bin/psql/describe.c as well as all over the place in pg_dump and the
like.

> There's also the small irritation that when a completion query fails,
> it aborts the user's current transaction to provide an incentive for
> handling older versions gracefully.

That is actively hostile and not at all what I would consider "silently
failing".  If you don't want to do the multiple versions thing, you
should at least check if you're on 9.6+ before issuing the query.

> Regarding multiple columns, I have an idea that if we check that:
> 
> a) the last of any SELECT/WHERE/GROUP BY/etc.-level keyword is a
> SELECT (i.e. that we're in the SELECT clause of the query), and
> b) the last word was a comma (or ended in a comma).
> we can then proffer the column/function completions.
> 
> There may be other completions that could benefit from such a check,
> e.g. table names after a comma in the FROM clause, but I just want to
> get SELECT working first.

I would like to see this as a separate patch.  Let's keep this one
simple and just do like the other things currently do.
-- 
Vik Fearing  +33 6 46 75 15 36
http://2ndQuadrant.fr PostgreSQL : Expertise, Formation et Support



Re: [HACKERS] Secondary index access optimizations

2018-01-10 Thread Konstantin Knizhnik



On 09.01.2018 19:48, Antonin Houska wrote:

Konstantin Knizhnik  wrote:


On 14.08.2017 19:33, Konstantin Knizhnik wrote:

  On 14.08.2017 12:37, Konstantin Knizhnik wrote:

  Hi hackers,

  I am trying to compare different ways of optimizing work with huge 
append-only tables in PostgreSQL where primary key is something like timestamp 
and queries are usually accessing most recent data using some secondary keys. 
Size of secondary index is one of the most critical
  factors limiting insert/search performance. As far as data is inserted in 
timestamp ascending order, access to primary key is well localized and accessed 
tables are present in memory. But if we create secondary key for the whole 
table, then access to it will require random reads from
  the disk and significantly decrease performance.

  There are two well known solutions of the problem:
  1. Table partitioning
  2. Partial indexes

  This approaches I want to compare. First of all I want to check if optimizer 
is able to generate efficient query execution plan covering different time 
intervals.
  Unfortunately in both cases generated plan is not optimal.

  1. Table partitioning:

  create table base (k integer primary key, v integer);
  create table part1 (check (k between 1 and 10000)) inherits (base);
  create table part2 (check (k between 10001 and 20000)) inherits (base);
  create index pi1 on part1(v);
  create index pi2 on part2(v);
  insert into part1 values (generate_series(1,10000), random());
  insert into part2 values (generate_series(10001,20000), random());
  explain select * from base where k between 1 and 20000 and v = 100;
  QUERY PLAN
  ---
  Append (cost=0.00..15.65 rows=3 width=8)
  -> Seq Scan on base (cost=0.00..0.00 rows=1 width=8)
  Filter: ((k >= 1) AND (k <= 20000) AND (v = 100))
  -> Index Scan using pi1 on part1 (cost=0.29..8.31 rows=1 width=8)
  Index Cond: (v = 100)
  Filter: ((k >= 1) AND (k <= 20000))
  -> Index Scan using pi2 on part2 (cost=0.29..7.34 rows=1 width=8)
  Index Cond: (v = 100)
  Filter: ((k >= 1) AND (k <= 20000))

  Questions:
  - Is there some way to avoid the sequential scan of the parent table? Yes, it is
empty and so the sequential scan will not take much time, but ... it still requires
some additional actions and so increases query execution time.
  - Why does the index scan of the partition indexes include the filter condition if the
check constraint already guarantees that all records of this partition match the search
predicate?

  2. Partial indexes:

  create table t (k integer primary key, v integer);
  insert into t values (generate_series(1,20000),random());
  create index i1 on t(v) where k between 1 and 10000;
  create index i2 on t(v) where k between 10001 and 20000;
  postgres=# explain select * from t where k between 1 and 10000 and v = 100;
  QUERY PLAN
  
  Index Scan using i1 on t (cost=0.29..7.28 rows=1 width=8)
  Index Cond: (v = 100)
  (2 rows)

  Here we get perfect plan. Let's try to extend search interval:

  postgres=# explain select * from t where k between 1 and 20000 and v = 100;
  QUERY PLAN
  --
  Index Scan using t_pkey on t (cost=0.29..760.43 rows=1 width=8)
  Index Cond: ((k >= 1) AND (k <= 20000))
  Filter: (v = 100)
  (3 rows)

  Unfortunately in this case Postgres is not able to apply partial indexes.
  And this is what I expected to get:

  postgres=# explain select * from t where k between 1 and 10000 and v = 100
union all select * from t where k between 10001 and 20000 and v = 100;
  QUERY PLAN
  --
  Append (cost=0.29..14.58 rows=2 width=8)
  -> Index Scan using i1 on t (cost=0.29..7.28 rows=1 width=8)
  Index Cond: (v = 100)
  -> Index Scan using i2 on t t_1 (cost=0.29..7.28 rows=1 width=8)
  Index Cond: (v = 100)

  I wonder if there are any fundamental problems with supporting these two things
in the optimizer:
  1. Remove the search condition on the primary key if it is fully satisfied by the
derived table's check constraint.
  2. Append index scans of several partial indexes if the specified interval is
covered by their conditions.

  I wonder if someone familiar with this part of the optimizer can easily fix
it.
  Otherwise I am going to spend some time on solving these problems (if the
community thinks that such optimizations will be useful).

  Replying to myself: the following small patch removes redundant checks from 
index scans for derived tables:

  diff --git a/src/backend/optimizer/util/plancat.c 
b/src/backend/optimizer/util/plancat.c
  index 939045d..1f7c9cf 100644
  --- a/src/backend/optimizer/util/plancat.c
  +++ b/src/backend/optimizer/util/plancat.c
  @@ -1441,6 +1441,20 @@ relation_excluded_by_constraints(PlannerInfo *root,
  if (predicate_refuted_by(safe_constraints, rel->baserestrictinfo, false))
  return true;

  + /*
  

Re: CUBE seems a bit confused about ORDER BY

2018-01-10 Thread Alexander Korotkov
On Thu, Dec 14, 2017 at 2:39 PM, Teodor Sigaev  wrote:

> The SQL query seems too huge for the release notes, and it doesn't look for
> materialized views (fixable) or for functional indexes whose function
> contains this operator somewhere inside (not fixable by this query). I
> think just words are enough.


I found that the cube_2.out expected output of the regression tests is the one used on
Windows 2003 in my virtual machine.  I've checked that, with both cube
patches, the regression tests pass correctly on that virtual machine.

--
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company


Re: [HACKERS] Proposal: Local indexes for partitioned table

2018-01-10 Thread Alvaro Herrera
David Rowley wrote:

> Hi Álvaro,

Hi David,

Thanks for the review.  Attached is a delta patch that fixes most
things, except your item 14 below.

Before getting into the details of the items you list, in this version I
also fixed a couple of issues noted by Jaime Casanova; namely

a) ALTER INDEX SET TABLESPACE threw an error for partitioned indexes.
Handled the same was as for partitioned tables: silently do nothing.

b) Expression indexes not considered at all.  (You also commented on
this point).  Jaime reported it for this case:
  create index on t_part USING gin (string_to_array(btrim((t)::text, 
'-'::text), '--'::text));

I dealt with this by adding code to CompareIndexInfo that runs equal()
on the expression lists, after applying map_variable_attnos() (to
support the case of attaching a table with dropped or reordered
columns).  As far as I can see, this works correctly for very simple
expressions, though I didn't test Jaime's case specifically; I suppose I
should add a few more tests.

On to your notes:

> 1. "either partition" -> "either the partition"
> 4. Extra newline
> 8. Missing if (attachInfos) pfree(attachInfos);
> 10. lock -> locks:
> 11. "already the right" -> "already in the right"
> 16. It's not exactly a problem, but I had a quick look and I don't see
> any other uses of list_free() checking for a non-NIL list first:

Fixed.

> 2. In findDependentObjects(), should the following if test be below
> the ReleaseDeletionLock call?
> 
> + if (foundDep->deptype == DEPENDENCY_INTERNAL_AUTO)
> + break;
>   ReleaseDeletionLock(object);

No, as far as I understand we should keep that lock if we break out of
the loop -- it's the lock on the object being deleted (the index
partition in this case).

I did break the comment in two, so that now the "First," paragraph
appears below the "if () break" test.  That seems to make more sense.
It looks like this now:

/*
 * 3. Not all the owning objects have been visited, so
 * transform this deletion request into a delete of this
 * owning object.
 *
 * For INTERNAL_AUTO dependencies, we don't enforce this;
 * in other words, we don't follow the links back to the
 * owning object.
 */
if (foundDep->deptype == DEPENDENCY_INTERNAL_AUTO)
break;

/*
 * First, release caller's lock on this object and get
 * deletion lock on the owning object.  (We must release
 * caller's lock to avoid deadlock against a concurrent
 * deletion of the owning object.)
 */
ReleaseDeletionLock(object);
AcquireDeletionLock(&otherObject, 0);


> 3. The following existing comment indicates that we'll be creating a
> disk file for the index. Should we update this comment to say that
> this is the case only for RELKIND_INDEX?

> Maybe:
> 
> /*
> * create the index relation's relcache entry and for non-partitioned
> * indexes, a physical disk file. (If we fail further down, it's the
> * smgr's responsibility to remove the disk file again.)
> */

I used this:
/*
 * create the index relation's relcache entry and, if necessary, the
 * physical disk file. (If we fail further down, it's the smgr's
 * responsibility to remove the disk file again, if any.)
 */

> 5. Do you think it's out of scope to support expression indexes?

Answered above.

> 6. pg_inherits.inhseqno is int, not smallint. I understand you copied
> this from StoreCatalogInheritance1(), so maybe a fix should be put in
> before this patch is?

Hmm, yeah, will fix.

> 7. Is the following a bug fix? If so, should it not be fixed and
> backpatched, rather than sneaked in here?
> 
> @@ -10119,6 +10195,7 @@ ATExecChangeOwner(Oid relationOid, Oid
> newOwnerId, bool recursing, LOCKMODE lock
>   * relation, as well as its toast table (if it has one).
>   */
>   if (tuple_class->relkind == RELKIND_RELATION ||
> + tuple_class->relkind == RELKIND_PARTITIONED_TABLE ||

No, it's not a bug fix -- this block is only concerned with searching
index for this relkind, and prior to this patch PARTITIONED_TABLEs do
not have indexes, so it's ok not to consider the case.

> 9. The first OR branch of the Assert will always be false, so might as
> well get rid of it.

True.  Removed.

> 12. It seems to be RangeVarGetRelidExtended() that locks the partition
> index, not the callback.
> 
> /* no deadlock risk: our callback above already acquired the lock */

Hmm ... yeah, you're right.  Fixed.

> 13. We seem to only be holding an AccessShareLock on the partitioned
> table. What happens if someone concurrently does ALTER TABLE
> partitioned_table DETACH our_partition;?
> 
> parentTbl = relation_open(parentIdx->rd_index->indrelid, AccessShareLock);
> 
> and;
> /* Make sure it indexes a partition of the other index's

Re: Rangejoin rebased

2018-01-10 Thread Simon Riggs
On 10 January 2018 at 04:24, Jeff Davis  wrote:
> On Sat, Jan 6, 2018 at 10:38 AM, Simon Riggs  wrote:
>> For this to be useful, it needs to include some details of how to use
>> it when people have NOT used range datatypes in their tables.
>
> Good idea. I added an example that doesn't have range types in the base table.

Cool, thanks

...


It would be useful to consider any related use cases.
Are there applications for range operators other than &&?

Do we optimize for TIMESTAMP <@ RANGE as well?

Does this link in nicely with partition-aware joins?
Does it allow partition exclusion if you join a daterange to a time
range partitioned table?

-- 
Simon Riggs    http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Robert Haas
On Tue, Jan 9, 2018 at 3:36 PM, Peter Eisentraut
 wrote:
> I agree a backend status message is the right way to do this.
>
> We could perhaps report transaction_read_only, if we don't want to add a
> new one.

That's not really the same thing, though.

I think that we really need to think about allowing clients to tell
the server which GUCs they'd like reported, instead of having a single
list to which everyone is bound.  A new protocol option using the
facilities added in commit ae65f6066dc3d19a55f4fdcd3b30003c5ad8dbed,
like _pq_.report=work_mem might be one way, though that doesn't give a
psql session the option of changing its mind in mid-session.  If we
wanted to allow that, we could teach the server to understand a new
message to change protocol options and let psql send it to
sufficiently-new servers.

Way back in the day, we used to reject every proposed new EXPLAIN
option because we didn't want to add keywords for all of those things;
nowadays, we reject every new GUC_REPORT GUC because it adds overhead
for everyone.  The solution there was to change the rules of the game
so that EXPLAIN options didn't have to be keywords, and I think the
solution here is to change the game so it doesn't add overhead for
everyone.

We might also want to consider de-GUC-reportify some things that are
GUC_REPORT now.  DateStyle, IntervalStyle, TimeZone, application_name
aren't used by anything in core.  Apparently we have application_name
as GUC_REPORT because pgbouncer wants it
(59ed94ad0c9f74a3f057f359316c845cedc4461e, 2009), TimeZone because
JDBC wants it (b5ae0d69da8f83e400921fcdd171e5bdadb45db3, 2004),
IntervalStyle I suppose because it was copied from DateStyle
(df7641e25ab4da6f3a48222cbda0e121ccb32ad5, 2008) and DateStyle from
the very beginning for unspecified reasons
(b700a672feadbb6f122b7c7249967fb0f58dda2b, 2003).  If clients can
request the GUCs they care about, it becomes much less important for
the default list to be comprehensive.

As a side benefit, then Craig and Tom can stop arguing about whether
server_version_num should be GUC_REPORT.  Under this proposal, you can
have it whichever way you like.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: Rangejoin rebased

2018-01-10 Thread Simon Riggs
On 10 January 2018 at 04:24, Jeff Davis  wrote:

> Done.

I think you need to make changes to other parts of the docs also, so
that it is clear what will now be possible

https://www.postgresql.org/docs/devel/static/using-explain.html
https://www.postgresql.org/docs/devel/static/xoper-optimization.html#id-1.8.3.17.9
https://www.postgresql.org/docs/devel/static/planner-optimizer.html

-- 
Simon Riggs    http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: Multi-level hierarchy with parallel append can lead to an extra subplan.

2018-01-10 Thread Robert Haas
On Tue, Jan 9, 2018 at 5:41 AM, Amit Khandekar  wrote:
> This subplan should not be there. It looks like that's because when
> we add up subpaths while preparing Parallel Append path containing mix of
> partial and non-partial child paths, in accumulate_append_subpath(),
> the subpath being added is an Append path because it's a multi-level
> partition. So in the condition "else if (special_subpaths != NULL)",
> both *subpaths and *special_paths get updated. But then it is not
> returning from there when it should. Instead, the control passes to
> the end of function where *subpaths is again populated :
> *subpaths = lappend(*subpaths, path);
> which leads to an extra child with partial subpath getting added.

Boy, what a dumb mistake on my part.  Thanks for finding the bug,
Rajkumar.  Thanks for analyzing it, Amit.  I have committed the fix.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Simon Riggs
On 10 January 2018 at 16:08, Robert Haas  wrote:

> I think that we really need to think about allowing clients to tell
> the server which GUCs they'd like reported, instead of having a single
> list to which everyone is bound.

+1

-- 
Simon Riggs    http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: portal pinning

2018-01-10 Thread Peter Eisentraut
On 1/8/18 20:28, Peter Eisentraut wrote:
> On 1/8/18 15:27, Andrew Dunstan wrote:
>> This seems like a good idea, and the code change is tiny and clean. I
>> don't know of any third party PLs or other libraries might be pinning
>> the portals already on their own. How would they be affected if they did?
> 
> They would get an error if they tried to pin it a second time.  So this
> would require a small source-level adjustment.  But I doubt this is
> actually the case anywhere, seeing that we are not even using this
> consistently in core.

committed

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: CUBE seems a bit confused about ORDER BY

2018-01-10 Thread Alvaro Herrera
Teodor Sigaev wrote:

> SQL-query seems too huge for release notes and isn't looking for
> materialized view (fixable) and functional indexes with function which
> contains this operator somewhere inside (not fixable by this query). I
> think, just words is enough.

But the query can be made a little bit shorter and more comprehensible:

SELECT pg_describe_object(dep.classid, dep.objid, dep.objsubid)
FROM pg_catalog.pg_extension ext
   JOIN pg_catalog.pg_depend edep ON edep.refobjid = ext.oid
   JOIN pg_catalog.pg_operator oper ON oper.oid = edep.objid
   JOIN pg_catalog.pg_depend dep ON dep.refobjid = oper.oid
WHERE
   ext.extname = 'cube' AND
   edep.refclassid = 'pg_catalog.pg_extension'::regclass AND
   edep.classid = 'pg_catalog.pg_operator'::regclass AND
   edep.deptype = 'e' AND
   oper.oprname = '~>' AND
   dep.refclassid = 'pg_catalog.pg_operator'::regclass
;

which returns the following

                 pg_describe_object
---------------------------------------------------------------------------
 rule "_RETURN" on materialized view f
 index tmp_idx
 constraint "tmp_c_check" on table tmp
 operator 15 (cube, integer) of operator family gist_cube_ops for access
method gist: ~>(cube,integer)
(4 rows)

(after 
create materialized view f as select * from tmp where c~>1 > 1;
)

I think this is useful enough.  The fact remains that we can't check
very well for functions; maybe suggest a LIKE clause to look for ~>
anywhere in function source code?


(It looks like you could get rid of the 'deptype' qual and
dep.refclassid also)

-- 
Álvaro Herrera    https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: [HACKERS] SQL/JSON in PostgreSQL

2018-01-10 Thread Andrew Dunstan


On 01/10/2018 01:37 AM, Oleg Bartunov wrote:
>
>
>
>     this. Something similar to what we're using for json itself (a
>     simple lexer and a recursive descent parser) would be more
> suitable.
>
>
> flex/bison is right tool for jsonpath, which is complex thing. 



It's not really very complex. The bison grammar has 21 non-terminals. As
languages go that's not huge.

My main concerns are code size and performance. RD parsers are
typically very fast and compact unless they are badly written. There are
reasons that years ago gcc switched from using bison to using a hand cut
RD parser. I guess we wouldn't expect for the most part that jsonpath
expressions would need to be compiled per row, so maybe performance
isn't that critical. But if we do expect really dynamic jsonpath
expressions then we need to make sure we are as fast as we can get.

I guess if all you have is a hammer everything looks like a nail, but we
should not assume that bison is the answer to every parsing problem we have.

I'm not going to hold up the patch over this issue. I should probably
have looked closer and raised it months ago. But if and when I get time
I will look into some benchmarking.

> I also note that the later patches have no documentation whatsoever.
> That needs serious work, and if you want to get these patches in then
> please supply some documentation ASAP. If you need help with
> English we
> can work on that, but just throwing patches of this size and
> complexity
> over the wall into the commitfest without any documentation is not the
> way to proceed.
>
>
> Andrew, we are back from holidays and I will start writing on
> documentation. I have difficulty with design of documentation, since
> it's unclear to me how detailed it should be. I'm inclining to follow
> xml style of documentation, which is quite formal and could be more
> easy to write.


OK, good. The sooner the better though. Err on the side of more detail
please.

cheers

andrew


-- 

Andrew Dunstan    https://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services




Re: [HACKERS] SQL/JSON in PostgreSQL

2018-01-10 Thread Oleg Bartunov
On 10 Jan 2018 20:14, "Andrew Dunstan" 
wrote:



On 01/10/2018 01:37 AM, Oleg Bartunov wrote:
>
>
>
> this. Something similar to what we're using for json itself (a
> simple lexer and a recursive descent parser) would be more
> suitable.
>
>
> flex/bison is right tool for jsonpath, which is complex thing.



It's not really very complex. The bison grammar has 21 non-terminals. As
languages go that's not huge.

My main concerns are code size and performance. RD parsers are
typically very fast and compact unless they are badly written. There are
reasons that years ago gcc switched from using bison to using a hand cut
RD parser. I guess we wouldn't expect for the most part that jsonpath
expressions would need to be compiled per row, so maybe performance
isn't that critical. But if we do expect really dynamic jsonpath
expressions then we need to make sure we are as fast as we can get.



Good point, our jsonpath can be an expression (the standard describes it as
a constant literal).


I guess if all you have is a hammer everything looks like a nail, but we
should not assume that bison is the answer to every parsing problem we have.

I'm not going to hold up the patch over this issue. I should probably
have looked closer and raised it months ago. But if and when I get time
I will look into some benchmarking.


I think it's not important right now and we could always replace the parser
later.


> I also note that the later patches have no documentation whatsoever.
> That needs serious work, and if you want to get these patches in then
> please supply some documentation ASAP. If you need help with
> English we
> can work on that, but just throwing patches of this size and
> complexity
> over the wall into the commitfest without any documentation is not the
> way to proceed.
>
>
> Andrew, we are back from holidays and I will start writing on
> documentation. I have difficulty with design of documentation, since
> it's unclear to me how detailed it should be. I'm inclining to follow
> xml style of documentation, which is quite formal and could be more
> easy to write.


OK, good. The sooner the better though. Err on the side of more detail
please.

cheers

andrew


--

Andrew Dunstan    https://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services


Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Tom Lane
Robert Haas  writes:
> I think that we really need to think about allowing clients to tell
> the server which GUCs they'd like reported, instead of having a single
> list to which everyone is bound.

Interesting idea ...

> As a side benefit, then Craig and Tom can stop arguing about whether
> server_version_num should be GUC_REPORT.  Under this proposal, you can
> have it whichever way you like.

... but I don't think it fixes that, because you couldn't send this new
request without making an assumption about the server version being
new enough to support it.  My entire beef with making server_version_num
be GUC_REPORT is that it would encourage people to write client code that
fails outright against older servers.  I'm afraid what you are suggesting
will be an equally attractive nuisance.

regards, tom lane



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Joshua D. Drake

On 01/10/2018 09:22 AM, Tom Lane wrote:

... but I don't think it fixes that, because you couldn't send this new
request without making an assumption about the server version being
new enough to support it.  My entire beef with making server_version_num
be GUC_REPORT is that it would encourage people to write client code that
fails outright against older servers.  I'm afraid what you are suggesting
will be an equally attractive nuisance.


It seems to me that is not our problem. Why do we care if some developer 
says, "I only work with 9.6"? If I am understanding your complaint.


JD



regards, tom lane



--
Command Prompt, Inc. || http://the.postgres.company/ || @cmdpromptinc

PostgreSQL centered full stack support, consulting and development.
Advocate: @amplifypostgres || Learn: https://postgresconf.org
* Unless otherwise stated, opinions are my own.   *




Re: BUG #14941: Vacuum crashes

2018-01-10 Thread Bossart, Nathan
On 1/9/18, 8:55 PM, "Michael Paquier"  wrote:
> On Tue, Jan 09, 2018 at 01:46:55PM -0800, Andres Freund wrote:
>> On 2018-01-09 10:23:12 -0500, Robert Haas wrote:
>> > On Mon, Jan 8, 2018 at 11:27 PM, Michael Paquier
>> >  wrote:
>> > > On Thu, Dec 28, 2017 at 10:46:18PM +, Bossart, Nathan wrote:
>> > >> I agree, this makes more sense.  I've made this change in v3 of 0003.
>> > >
>> > > Based on the opinions gathered on this thread, 0001 and 0002 seem to be
>> > > in the shape wanted, and those look good for shipping.
>> > 
>> > Committed 0001.
>> 
>> FWIW, I think in the future it'd be good to develop new features outside
>> of a thread about "vacuum crashes".
>
> You are right that this is bad practice. Partially my fault for not
> recommending the move to a new thread.

Yes, sorry about that.  I should have opened a new thread.

Nathan



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Tom Lane
"Joshua D. Drake"  writes:
> On 01/10/2018 09:22 AM, Tom Lane wrote:
>> ... but I don't think it fixes that, because you couldn't send this new
>> request without making an assumption about the server version being
>> new enough to support it.  My entire beef with making server_version_num
>> be GUC_REPORT is that it would encourage people to write client code that
>> fails outright against older servers.  I'm afraid what you are suggesting
>> will be an equally attractive nuisance.

> It seems to me that is not our problem. Why do we care if some developer 
> says, "I only work with 9.6"? If I am understanding your complaint.

I don't care at all if J. Random Developer's homegrown code only works
with the PG version he's using.  The concern I have is that unwanted
server version dependencies will sneak into widely used code, like
psql, or libpq, or jdbc.  Or another way of putting it: Robert's proposal
is a protocol version break, just like most stuff at this level.  Trying
to pretend it isn't doesn't make it not one.

regards, tom lane



Re: PATCH: Configurable file mode mask

2018-01-10 Thread David Steele
On 1/8/18 8:58 PM, Peter Eisentraut wrote:
> On 1/3/18 08:11, Robert Haas wrote:
>> On Tue, Jan 2, 2018 at 11:43 AM, David Steele  wrote:
> I think MakeDirectory() is a good wrapper, but isn't
 MakeDirectoryPerm() sort of silly?
>>>
>>> There's one place in the backend (storage/ipc/ipc.c) that sets non-default
>>> directory permissions.  This function is intended to support that and any
>>> extensions that need to set custom perms.
>>
>> Yeah, but all it does is call mkdir(), which could just as well be
>> called directly.  I think there's a pointer to a wrapper when it does
>> something for you -- supply an argument, log something, handle
>> portability concerns -- but this wrapper does exactly nothing.
> 
> Yeah, I didn't like this aspect when this patch was originally
> submitted.  We want to keep the code legible for future new
> contributors.  Having these generic-sounding but specific-in-purpose
> wrapper functions can be pretty confusing.  Let's use mkdir() when it's
> the appropriate function, and let's figure out a different name for
> "make a data directory subdirectory in a secure and robust way".

I think there is value to keeping the function names symmetric to the
FileOpen()/FileOpenPerm() variants but I'm clearly in the minority so
there's no point in pursuing it.

How about MakeDirectoryDefaultPerm()?  That's what I'll go with if I
don't hear any other ideas.  The single call to MakeDirectoryPerm() will
be reverted to mkdir() and I'll remove the function.

Thanks,
-- 
-David
da...@pgmasters.net



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Joshua D. Drake

On 01/10/2018 09:36 AM, Tom Lane wrote:

It seems to me that is not our problem. Why do we care if some developer

says, "I only work with 9.6"? If I am understanding your complaint.

I don't care at all if J. Random Developer's homegrown code only works
with the PG version he's using.  The concern I have is that unwanted
server version dependencies will sneak into widely used code, like
psql, or libpq, or jdbc.  Or another way of putting it: Robert's proposal
is a protocol version break, just like most stuff at this level.  Trying
to pretend it isn't doesn't make it not one.


That makes sense, thanks for clarifying.

JD


regards, tom lane



--
Command Prompt, Inc. || http://the.postgres.company/ || @cmdpromptinc

PostgreSQL centered full stack support, consulting and development.
Advocate: @amplifypostgres || Learn: https://postgresconf.org
* Unless otherwise stated, opinions are my own.   *




Re: PATCH: Configurable file mode mask

2018-01-10 Thread Stephen Frost
David,

* David Steele (da...@pgmasters.net) wrote:
> On 1/8/18 8:58 PM, Peter Eisentraut wrote:
> > On 1/3/18 08:11, Robert Haas wrote:
> >> On Tue, Jan 2, 2018 at 11:43 AM, David Steele  wrote:
> > I think MakeDirectory() is a good wrapper, but isn't
>  MakeDirectoryPerm() sort of silly?
> >>>
> >>> There's one place in the backend (storage/ipc/ipc.c) that sets non-default
> >>> directory permissions.  This function is intended to support that and any
> >>> extensions that need to set custom perms.
> >>
> >> Yeah, but all it does is call mkdir(), which could just as well be
> >> called directly.  I think there's a pointer to a wrapper when it does
> >> something for you -- supply an argument, log something, handle
> >> portability concerns -- but this wrapper does exactly nothing.
> > 
> > Yeah, I didn't like this aspect when this patch was originally
> > submitted.  We want to keep the code legible for future new
> > contributors.  Having these generic-sounding but specific-in-purpose
> > wrapper functions can be pretty confusing.  Let's use mkdir() when it's
> > the appropriate function, and let's figure out a different name for
> > "make a data directory subdirectory in a secure and robust way".
> 
> I think there is value to keeping the function names symmetric to the
> FileOpen()/FileOpenPerm() variants but I'm clearly in the minority so
> there's no point in pursuing it.
> 
> How about MakeDirectoryDefaultPerm()?  That's what I'll go with if I
> don't hear any other ideas.  The single call to MakeDirectoryPerm() will
> be reverted to mkdir() and I'll remove the function.

I would be sure to include a good comment/explanation about why mkdir()
is being used there and not MakeDirectoryDefaultPerm() (unless one
exists already), just to avoid later hackers thinking that mkdir() is
the way to go in general when, in most cases, MakeDirectoryDefaultPerm()
should be used.

Thanks!

Stephen


signature.asc
Description: PGP signature


Dubious shortcut in ckpt_buforder_comparator()

2018-01-10 Thread Tom Lane
While analyzing a recent crash report[1], I noticed that bufmgr.c's
ckpt_buforder_comparator is coded to assume that no two CkptSortItems
could have equal page IDs; it therefore skips the final comparison
and will never return 0 (equal).  I do not think that assumption is
correct.  I do not see anything preventing the scenario where, while
we are scanning the buffer pool at the beginning of BufferSync(),
somebody writes out a dirty buffer we've already seen and recycles
it for another page, and then somebody else fetches that same page
back into a different buffer and dirties it, and finally we arrive
at that second buffer and conclude it should be written too.

If we do get two entries with equal page IDs into the CkptSortItem
list, ckpt_buforder_comparator will give self-inconsistent results:
when actually x = y, it will say both x > y and y > x depending on
which way you pass the arguments to it.

Now, I cannot find any reason to think this would have awful consequences
given our current implementation of qsort.  But we might not always use
that code --- I remember some discussion of timsort, for instance ---
and another sorting implementation might be less happy about it.

So I think we should get rid of that micro-optimization, which is
probably useless anyway from a performance standpoint, and do the
comparison honestly.  Any objections?
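
For illustration, here is a minimal standalone sketch of the honest
comparison (the struct and field names are simplified stand-ins, not the
actual bufmgr.c/CkptSortItem code): it returns 0 when the page IDs really
are equal instead of forcing an arbitrary ordering.

#include <stdint.h>

typedef struct
{
    uint32_t    tsId;       /* tablespace */
    uint32_t    relNode;    /* relation */
    int         forkNum;    /* fork */
    uint32_t    blockNum;   /* block within the fork */
} PageId;

static int
pageid_cmp(const void *pa, const void *pb)
{
    const PageId *a = (const PageId *) pa;
    const PageId *b = (const PageId *) pb;

    if (a->tsId != b->tsId)
        return (a->tsId < b->tsId) ? -1 : 1;
    if (a->relNode != b->relNode)
        return (a->relNode < b->relNode) ? -1 : 1;
    if (a->forkNum != b->forkNum)
        return (a->forkNum < b->forkNum) ? -1 : 1;
    if (a->blockNum != b->blockNum)
        return (a->blockNum < b->blockNum) ? -1 : 1;
    return 0;               /* equal page IDs now compare as equal */
}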

regards, tom lane

[1] 
https://postgr.es/m/CAMd+QORjQGFPhRqXj8DSLn2_UgnTHtP=kc7h2e5_hgz0gla...@mail.gmail.com



Re: PATCH: Configurable file mode mask

2018-01-10 Thread Alvaro Herrera
David Steele wrote:
> On 1/8/18 8:58 PM, Peter Eisentraut wrote:

> > Yeah, I didn't like this aspect when this patch was originally
> > submitted.  We want to keep the code legible for future new
> > contributors.  Having these generic-sounding but specific-in-purpose
> > wrapper functions can be pretty confusing.  Let's use mkdir() when it's
> > the appropriate function, and let's figure out a different name for
> > "make a data directory subdirectory in a secure and robust way".

> How about MakeDirectoryDefaultPerm()?  That's what I'll go with if I
> don't hear any other ideas.  The single call to MakeDirectoryPerm() will
> be reverted to mkdir() and I'll remove the function.

I'd go with MakeDirectory, documenting exactly what it does and why, and
be done with it.  If your new function satisfies future users, great; if
not, it can be patched (or not) once we know exactly what these callers
need.

You know, YAGNI.

-- 
Álvaro Herrera    https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: General purpose hashing func in pgbench

2018-01-10 Thread Fabien COELHO


Hello Ildar,


Patch needs a rebase after Teodor push for a set of pgbench functions.

Done. Congratulations on your patch finally being committed : )


Over 21 months... I hope that pgbench will have hash functions sooner:-)


Should we probably add some infrastructure for optional arguments?


You can look at the handling of "CASE" which may or may not have an
"ELSE" clause.

I'd suggest you use a new negative argument with the special meaning
for hash, and create the seed value when missing when building the
function, so as to simplify the executor code.



Added a new nargs option -3 for hash functions and moved arguments check
to parser. It's starting to look a bit odd and I'm thinking about
replacing bare numbers (-1, -2, -3) with #defined macros. E.g.:

#define PGBENCH_NARGS_VARIABLE (-1)
#define PGBENCH_NARGS_CASE (-2)
#define PGBENCH_NARGS_HASH (-3)


Yes, I'm more than fine with improving readability.


Instead of 0, I'd consider providing a random default so that the
hashing behavior is not the same from one run to the next. What do you
think?


Makes sense, since each thread also initializes its random_state with
random numbers before start. So I added a global variable 'hash_seed' and
initialize it with random() before the threads are spawned.


Hmm. I do not think that we should want a shared seed value. The seed 
should be different for each call so as to avoid undesired correlations. 
If wanted, correlation could be obtained by using an explicit identical 
seed.


ISTM that the best way to add the seed is to call random() when the second 
arg is missing in make_func. Also, this means that the executor would 
always get its two arguments, so it would simplify the code there.
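
A rough standalone sketch of that idea (all names are made up for
illustration, this is not pgbench's actual expression-building code):

#include <stdint.h>
#include <stdlib.h>

/* Stand-in for a parsed hash(value [, seed]) call. */
typedef struct
{
    int      nargs;    /* number of arguments written in the script */
    int64_t  args[2];  /* args[1] is the seed */
} HashFuncNode;

/* Run once while building the function node at parse time. */
static void
default_hash_seed(HashFuncNode *node)
{
    if (node->nargs == 1)
    {
        node->args[1] = (int64_t) random();  /* default seed for this call */
        node->nargs = 2;  /* the executor can assume both args are present */
    }
}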


--
Fabien.



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Robert Haas
On Tue, Jan 9, 2018 at 10:36 PM, Thomas Munro
 wrote:
> This looks good to me.

The addition to README.parallel is basically wrong, because workers
have been allowed to write WAL since the ParallelContext machinery.
See the
XactLastRecEnd handling in parallel.c.  Workers can, for example, do
HOT cleanups during SELECT scans, just as the leader can.  The
language here is obsolete anyway in light of commit
e9baa5e9fa147e00a2466ab2c40eb99c8a700824, but this isn't the right way
to update it.  I'll propose a separate patch for that.

The change to the ParallelContext signature in parallel.h makes an
already-overlength line even longer.  A line break seems warranted
just after the first argument, plus pgindent afterward.

I am not a fan of the leader-as-worker terminology.  The leader is not
a worker, full stop.  I think we should instead talk about whether the
leader participates (so, ii_LeaderAsWorker -> ii_LeaderParticipates,
for example, plus many comment updates).  Similarly, it seems
SortCoordinateData's nLaunched should be nParticipants, and BTLeader's
nworkertuplesorts should be nparticipanttuplesorts.

There is also the question of whether we want to respect
parallel_leader_participation in this context.  The issues which might
motivate the desire for such behavior in the context of a query do not
exist when creating a btree index, so maybe we're just making this
complicated. On the other hand, if some other type of parallel index
build does end up doing a Gather-like operation then we might regret
deciding that parallel_leader_participation doesn't apply to index
builds, so maybe it's OK the way we have it.  On the third hand, the
complexity of having the leader maybe-participate seems like it
extends to a fair number of places in the code, and getting rid of all
that complexity seems appealing.

One place where this actually causes a problem is the message changes
to index_build().  The revised ereport() violates translatability
guidelines, which require that messages not be assembled from pieces.
See 
https://www.postgresql.org/docs/devel/static/nls-programmer.html#NLS-GUIDELINES

A comment added to tuplesort.h says that work_mem should be at least
64KB, but does not give any reason.  I think one should be given, at
least briefly, so that someone looking at these comments in the future
can, for example, figure out whether the comment is still correct
after future code changes.  Or else, remove the comment.

+ * Parallel sort callers are required to coordinate multiple tuplesort states
+ * in a leader process, and one or more worker processes.  The leader process

I think the comma should be removed.  As written, it looks like we
are coordinating multiple tuplesort states in a leader process, and,
separately, we are coordinating one or more worker processes.  But in
fact we are coordinating multiple tuplesort states which are in a
group of processes that includes the leader and one or more worker
processes.

Generally, I think the comments in tuplesort.h are excellent.  I
really like the overview of how the new interfaces should be used,
although I find it slightly wonky that the leader needs two separate
Tuplesortstates if it wants to participate.

I don't understand why this patch needs to tinker with the tests in
vacuum.sql.  The comments say that "If we did not do this, errors
raised would concern running ANALYZE in parallel mode."  However, why
should parallel CREATE INDEX having any impact on ANALYZE at all?
Also, as a practical matter, if I revert those changes, 'make check'
still passes with or without force_parallel_mode=on.

I really dislike the fact that this patch invents another thing for
force_parallel_mode to do.  I invented force_parallel_mode mostly as a
way of testing that functions were correctly labeled for
parallel-safety, and I think it would be just fine if it never does
anything else.  As it is, it already does two quite separate things to
accomplish that goal: (1) forcibly run the whole plan with parallel
mode restrictions enabled, provided that the plan is not
parallel-unsafe, and (2) runs the plan in a worker, provided that the
plan is parallel-safe.  There's a subtle difference between those two
conditions, which is that not parallel-unsafe does not equal
parallel-safe; there is also parallel-restricted.  The fact that
force_parallel_mode controls two different behaviors has, I think,
already caused some confusion for prominent PostgreSQL developers and,
likely, users as well.  Making it do a third thing seems to me to be
adding to the confusion, and not only because there are no
documentation changes to match.  If we go down this road, there will
probably be more additions -- what happens when parallel VACUUM
arrives, or parallel CLUSTER, or whatever?  I don't think it will be a
good thing for PostgreSQL if we end up with force_parallel_mode=on as
a general "use parallelism even though it's stupid" flag, requiring
supporting code in many different places throughout the 

Re: General purpose hashing func in pgbench

2018-01-10 Thread Fabien COELHO



Patch needs a rebase after Teodor push for a set of pgbench functions.

Done. Congratulations on your patch finally being committed : )


I forgot: please provide a doc & some coverage tests as well!

--
Fabien.



Re: [HACKERS] [FEATURE PATCH] pg_stat_statements with plans (v02)

2018-01-10 Thread Julian Markwort

Hello hackers,

I'd like to follow up to my previous proposition of tracking (some) best 
and worst plans for different queries in the pg_stat_statements extension.


Based on the comments and suggestions made towards my last endeavour, 
I've taken the path of computing the interquartile distance (by means of 
an adapted z-test, under the assumption of normal distribution, based on 
the mean_time and stddev_time already used by the extension).


A bad plan is recorded if there is no previously recorded plan, or if 
the current execution time is greater than the maximum of the previously 
recorded bad plan's time and the query's mean+1.5*interquartile_distance.
A good plan is recorded under a similar condition; the execution time needs 
to be shorter than the minimum of the previously recorded good plan's 
time and the query's mean-1.5*interquartile_distance.


The boundaries are chosen to resemble the boundaries for whiskers in 
boxplots.
Using these boundaries, plans will be updated very seldom, as long as 
the execution times are more or less normally distributed.
Changes in the plan used for a kind of query (for example the use of an 
index) will most likely result in execution times exceeding these 
boundaries, so such changes are (very probably) recorded.
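
As a rough sketch of the recording rule above (illustrative only, not the 
patch's actual code; the names are made up and it uses the 
normal-distribution approximation interquartile_distance ~ 1.349 * stddev):

#include <stdbool.h>

static bool
should_record_bad_plan(double exec_time, double mean, double stddev,
                       bool have_bad_plan, double bad_plan_time)
{
    double  upper = mean + 1.5 * (1.349 * stddev);

    if (!have_bad_plan)
        return true;
    if (bad_plan_time > upper)
        upper = bad_plan_time;
    return exec_time > upper;
}

static bool
should_record_good_plan(double exec_time, double mean, double stddev,
                        bool have_good_plan, double good_plan_time)
{
    double  lower = mean - 1.5 * (1.349 * stddev);

    if (!have_good_plan)
        return true;
    if (good_plan_time < lower)
        lower = good_plan_time;
    return exec_time < lower;
}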


The ideal solution would be to compare the current plan with the last 
plan and only update when there is a difference between them; however, I 
think this is unreasonably complex and a rather expensive task to 
perform on the completion of every query.


The intent of this patch is to provide a quick insight into the plans 
currently used by the database for the execution of certain queries. The 
tracked plans only represent instances of queries with very good or very 
poor performance.


I've (re)submitted this patch for the next commitfest as well.

Kind regards
Julian


On 03/04/2017 02:56 PM, Julian Markwort wrote:
Alright, for the next version of this patch I'll look into standard 
deviation (an implementation of Welfords' algorithm already exists in 
pg_stat_statements).


On 3/4/17 14:18, Peter Eisentraut wrote:


The other problem is that this measures execution time, which can vary
for reasons other than plan.  I would have expected that the cost
numbers are tracked somehow.
I've already thought of tracking specific parts of the explanation, 
like the cost numbers, instead of the whole string, I'll think of 
something, but if anybody has any bright ideas in the meantime, I'd 
gladly listen to them.



There is also the issue of generic vs specific plans, which this
approach might be papering over.
Would you be so kind and elaborate a little bit on this? I'm not sure 
if I understand this correctly. This patch only tracks specific plans, 
yes. The inital idea was that there might be some edge-cases that are 
not apparent when looking at generalized plans or queries.


kind regards
Julian


diff --git a/contrib/pg_stat_statements/Makefile b/contrib/pg_stat_statements/Makefile
index 39b368b70e..4d658d0ec7 100644
--- a/contrib/pg_stat_statements/Makefile
+++ b/contrib/pg_stat_statements/Makefile
@@ -4,7 +4,7 @@ MODULE_big = pg_stat_statements
 OBJS = pg_stat_statements.o $(WIN32RES)
 
 EXTENSION = pg_stat_statements
-DATA = pg_stat_statements--1.4.sql pg_stat_statements--1.4--1.5.sql \
+DATA = pg_stat_statements--1.5.sql pg_stat_statements--1.4--1.5.sql \
 	pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.2--1.3.sql \
 	pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.0--1.1.sql \
 	pg_stat_statements--unpackaged--1.0.sql
diff --git a/contrib/pg_stat_statements/expected/pg_stat_statements.out b/contrib/pg_stat_statements/expected/pg_stat_statements.out
index 5318c3550c..3e79890d50 100644
--- a/contrib/pg_stat_statements/expected/pg_stat_statements.out
+++ b/contrib/pg_stat_statements/expected/pg_stat_statements.out
@@ -395,4 +395,40 @@ SELECT query, calls, rows FROM pg_stat_statements ORDER BY query COLLATE "C";
  SELECT pg_stat_statements_reset() | 1 |1
 (8 rows)
 
+-- test to see if any plans have been recorded.
+SELECT
+  CASE WHEN good_plan_time > 0 THEN 1 ELSE 0 END,
+  CASE WHEN bad_plan_time > 0 THEN 1 ELSE 0 END,
+  CASE WHEN good_plan_timestamp >= timestamp '1970-01-01 00:00:00' THEN 1 ELSE 0 END,
+  CASE WHEN good_plan_timestamp >= timestamp '1970-01-01 00:00:00' THEN 1 ELSE 0 END
+FROM pg_stat_statements ORDER BY query COLLATE "C";
+ case | case | case | case 
+--+--+--+--
+0 |0 |0 |0
+0 |0 |0 |0
+0 |0 |0 |0
+0 |0 |0 |0
+0 |0 |0 |0
+0 |0 |0 |0
+1 |1 |1 |1
+1 |1 |1 |1
+1 |1 |1 |1
+(9 rows)
+
+-- test if there is some text in the recorded plans.
+select substr(good_plan, 0, 11), substr(bad_plan, 0, 11) from pg_stat_statements ORDER BY query COLLATE "C";
+   substr   |   substr   
++
+| 
+| 
+| 
+  

Re: portal pinning

2018-01-10 Thread Vladimir Sitnikov
> committed

I'm afraid it causes regressions for pgjdbc.
Here's CI log: https://travis-ci.org/pgjdbc/pgjdbc/jobs/327327402

The errors are:
testMetaData[typeName = REF_CURSOR, cursorType =
2,012](org.postgresql.test.jdbc2.RefCursorTest)  Time elapsed: 0.032 sec
 <<< ERROR! org.postgresql.util.PSQLException: ERROR: cannot drop pinned
portal ""

It looks like "refcursor" expressions are somehow broken.

The test code is to execute testspg__getRefcursor procedure
that is defined as follows

CREATE OR REPLACE FUNCTION testspg__getRefcursor () RETURNS refcursor AS '
declare v_resset refcursor; begin open v_resset for select id from testrs
order by id;
return v_resset; end;' LANGUAGE plpgsql;

Would you please check that?

Vladimir


Re: [HACKERS] GSoC 2017 : Patch for predicate locking in Gist index

2018-01-10 Thread Shubham Barai
On 8 January 2018 at 23:13, Shubham Barai  wrote:

>
>
> On 8 January 2018 at 22:44, Shubham Barai 
> wrote:
>
>>
>>
>> On 5 January 2018 at 03:18, Alexander Korotkov > > wrote:
>>
>>> On Thu, Jan 4, 2018 at 7:07 PM, Andrey Borodin 
>>> wrote:
>>>
 On 29 Nov 2017, at 22:50, Shubham Barai  wrote:

  I have fixed formatting style. Please take a look at updated patch.


 Here's rebased patch. Every issue has been addressed, so I'm marking
 this patch as ready for committer.

>>>
>>> I'm sorry for concentrating on boring things, but formatting of
>>> predicate-gist.spec still doesn't look good for me.
>>>
>>> # To verify serialization failures, queries and permutations are written
 in such
 # a way that an index scan(from one transaction) and an index
 insert(from another
 # transaction) will try to access the same part(sub-tree) of the index.
 #
 # To check reduced false positives, queries and permutations are
 written in such
 # a way that an index scan(from one transaction) and an index
 insert(from another
 # transaction) will try to access different parts(sub-tree) of the
 index.

>>>
>>> No space before the open bracket (I think there should be one when the
>>> brackets contain multiple words).
>>> Also, we're trying to fit our lines to 80 characters (if it's not
>>> objectively difficult).
>>> And these are two almost same paragraphs.  I think it should be
>>> simplified.
>>>
>>> setup
 {
  create table gist_point_tbl(id int4, p point);
  create index gist_pointidx on gist_point_tbl using gist(p);
  insert into gist_point_tbl (id, p)
  select g, point(g*10, g*10) from generate_series(1, 1000) g;
 }
 setup
 {
   BEGIN ISOLATION LEVEL SERIALIZABLE;
   set enable_seqscan=off;
   set enable_bitmapscan=off;
   set enable_indexonlyscan=on;
 }
 setup {
   BEGIN ISOLATION LEVEL SERIALIZABLE;
   set enable_seqscan=off;
   set enable_bitmapscan=off;
   set enable_indexonlyscan=on;
 }
>>>
>>>
>>> I didn't get idea of using various indentation styles for same purpose.
>>>
>>> step "wx3" { insert into gist_point_tbl (id, p)
   select g, point(g*500, g*500) from
 generate_series(12, 18) g; }
>>>
>>>
>>> Indented using spaces here...
>>>
>>>
>>
>>
> I have fixed formatting issues. Please have a look at updated patch.
>
>
  The previous patch couldn't be applied cleanly because there were some
 modifications to isolation_schedule. I have updated the patch now.


Regards,
Shubham


Predicate-Locking-in-gist-index_v8.patch
Description: Binary data


Re: Why standby restores some WALs many times from archive?

2018-01-10 Thread Sergey Burladyan
I think I found what happened here.

One WAL record can be split between WAL files.

In XLogReadRecord, if the last WAL record is incomplete, it tries to get the next WAL:
/* Copy the first fragment of the record from the first page. */
memcpy(state->readRecordBuf,
   state->readBuf + RecPtr % XLOG_BLCKSZ, len);
buffer = state->readRecordBuf + len;
gotlen = len;

do
{
/* Calculate pointer to beginning of next page */
targetPagePtr += XLOG_BLCKSZ;

/* Wait for the next page to become available */
readOff = ReadPageInternal(state, targetPagePtr,
                           Min(total_len - gotlen + SizeOfXLogShortPHD,
                               XLOG_BLCKSZ));

if (readOff < 0)
goto err;

but in my case the next WAL is not yet in the archive (archive_timeout=0 on the master)
and it cleans the cache:
err:

/*
 * Invalidate the xlog page we've cached. We might read from a different
 * source after failure.
 */
state->readSegNo = 0;
state->readOff = 0;
state->readLen = 0;

PG switches to streaming while the last WAL (0001002B, for
example) is still not replayed completely. We do not use streaming, so it
switches back to archive:
LOG:  restored log file "0001002B" from archive
...
DEBUG:  could not restore file "0001002C" from archive: child 
process exited with exit code 1
DEBUG:  switched WAL source from archive to stream after failure
DEBUG:  switched WAL source from stream to archive after failure

Now it must reread the first part of the last WAL record from
0001002B, but XLogFileReadAnyTLI _always_ reads
from the archive first, even if this file is already in pg_xlog:
if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
{
fd = XLogFileRead(segno, emode, tli,
                  XLOG_FROM_ARCHIVE, true);
if (fd != -1)
{
elog(DEBUG1, "got WAL segment from archive");
if (!expectedTLEs)
expectedTLEs = tles;
return fd;
}
}

if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
{
fd = XLogFileRead(segno, emode, tli,
                  XLOG_FROM_PG_XLOG, true);
if (fd != -1)
{
if (!expectedTLEs)
expectedTLEs = tles;
return fd;
}
}

Well, I think we'll be able to cache locally the last WAL file in 
restore_command
if needed :-)

-- 
Sergey Burladyan



Re: portal pinning

2018-01-10 Thread Peter Eisentraut
On 1/10/18 13:53, Vladimir Sitnikov wrote:
>> committed
> 
> I'm afraid it causes regressions for pgjdbc.
> Here's CI log: https://travis-ci.org/pgjdbc/pgjdbc/jobs/327327402
> 
> The errors are:
> testMetaData[typeName = REF_CURSOR, cursorType =
> 2,012](org.postgresql.test.jdbc2.RefCursorTest)  Time elapsed: 0.032 sec 
>  <<< ERROR! org.postgresql.util.PSQLException: ERROR: cannot drop pinned
> portal ""   
> 
> It looks like "refcursor" expressions are somehow broken.

Hmm, it seems like we are missing some test coverage in core for that.

I guess I'll have to revert this patch and go with the first approach of
putting it directly into PL/Perl and PL/Python.

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: Dubious shortcut in ckpt_buforder_comparator()

2018-01-10 Thread Andres Freund
Hi,

On 2018-01-10 13:06:50 -0500, Tom Lane wrote:
> So I think we should get rid of that micro-optimization, which is
> probably useless anyway from a performance standpoint, and do the
> comparison honestly.  Any objections?

No, absolutely none. You're going to change it?

Greetings,

Andres Freund



Re: Why standby restores some WALs many times from archive?

2018-01-10 Thread Jeff Janes
On Sat, Dec 30, 2017 at 4:20 AM, Michael Paquier 
wrote:

> On Sat, Dec 30, 2017 at 04:30:07AM +0300, Sergey Burladyan wrote:
> > We use this scripts:
> > https://github.com/avito-tech/dba-utils/tree/master/pg_archive
> >
> > But I can reproduce problem with simple cp & mv:
> > archive_command:
> >   test ! -f /var/lib/postgresql/wals/%f && \
> >   test ! -f /var/lib/postgresql/wals/%f.tmp && \
> >   cp %p /var/lib/postgresql/wals/%f.tmp && \
> >   mv /var/lib/postgresql/wals/%f.tmp /var/lib/postgresql/wals/%f
>
> This is unsafe. PostgreSQL expects the WAL segment archived to be
> flushed to disk once the archive command has returned its result to the
> backend. Don't be surprised if you get corrupted instances or that you
> are not able to recover up to a consistent point if you need to roll in
> a backup. Note that the documentation of PostgreSQL provides a simple
> example of archive command, which is itself bad enough not to use.
>

True, but that doesn't explain the current situation, as it reproduces
without an OS level crash so a missing sync would not be relevant. (and on
some systems, mv'ing a file will force it to be synced under some
conditions, so it might be safe anyway)

I thought I'd seen something recently in the mail lists or commit log about
an off-by-one error which causes it to re-fetch the previous file rather
than the current file if the previous file ends with just the right type of
record and amount of padding.  But now I can't find it.

Cheers,

Jeff


Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Chapman Flack
On 01/10/2018 11:08 AM, Robert Haas wrote:

> I think that we really need to think about allowing clients to tell
> the server which GUCs they'd like reported, instead of having a single
> list to which everyone is bound.

+1

That already sounded like a good idea back in
https://www.postgresql.org/message-id/11465.1339163239%40sss.pgh.pa.us,
in a thread about making Connection.setReadOnly() work right in PGJDBC.

-Chap



Re: [HACKERS] Race between SELECT and ALTER TABLE NO INHERIT

2018-01-10 Thread Tom Lane
Kyotaro HORIGUCHI  writes:
> [ 0002-Lock-parent-on-ALTER-TABLE-NO-INHERIT.patch ]

I don't especially like any of the patches proposed on this thread.
The one with rechecking inheritance seems expensive, duplicative,
and complicated.  The approach of taking a lock on the parent will
create deadlocks in usage patterns that did not encounter such
deadlocks before --- in particular, in the scenarios in which this
behavior matters at all, the ALTER will deadlock against ordinary
queries that lock the parent before the child.  So I can't believe
that anyone who's hitting the problem in the field will think the
extra lock is an improvement.

I think that there might be a much simpler solution to this, which
is to just remove make_inh_translation_list's tests of attinhcount,
as per attached.  Those are really pretty redundant: since we are
matching by column name, the unique index on pg_attribute already
guarantees there is at most one matching column.  I have a feeling
that those tests are my fault and I put them in on the theory that
they could save a few strcmp executions --- but if they're causing
problems, we can certainly drop them.  In any case they'd only save
something meaningful if most of the child's columns are non-inherited,
which doesn't seem like the way to bet.

In this way, if the child gets past the initial check on whether it's
been dropped, we'll still succeed in matching its columns, even if
they are no longer marked as inherited.  For me, this works fine with
either the ALTER NO INHERIT case (c1 does get scanned, with no error)
or the DROP TABLE case (c1 doesn't get scanned).  Now, you can break
it if you really try hard: make the concurrent transaction do both
ALTER NO INHERIT and then DROP COLUMN.  But that's not a scenario
that's been complained of, so I don't want to add a ton of mechanism
to fix it.

regards, tom lane

diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 95557d7..5c4d113 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1832,7 +1832,7 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation,
 		 */
 		if (old_attno < newnatts &&
 			(att = TupleDescAttr(new_tupdesc, old_attno)) != NULL &&
-			!att->attisdropped && att->attinhcount != 0 &&
+			!att->attisdropped &&
 			strcmp(attname, NameStr(att->attname)) == 0)
 			new_attno = old_attno;
 		else
@@ -1840,7 +1840,7 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation,
 			for (new_attno = 0; new_attno < newnatts; new_attno++)
 			{
 att = TupleDescAttr(new_tupdesc, new_attno);
-if (!att->attisdropped && att->attinhcount != 0 &&
+if (!att->attisdropped &&
 	strcmp(attname, NameStr(att->attname)) == 0)
 	break;
 			}


Re: Dubious shortcut in ckpt_buforder_comparator()

2018-01-10 Thread Tom Lane
Andres Freund  writes:
> On 2018-01-10 13:06:50 -0500, Tom Lane wrote:
>> So I think we should get rid of that micro-optimization, which is
>> probably useless anyway from a performance standpoint, and do the
>> comparison honestly.  Any objections?

> No, absolutely none. You're going to change it?

Will do.

regards, tom lane



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Robert Haas
On Wed, Jan 10, 2018 at 12:36 PM, Tom Lane  wrote:
> I don't care at all if J. Random Developer's homegrown code only works
> with the PG version he's using.  The concern I have is that unwanted
> server version dependencies will sneak into widely used code, like
> psql, or libpq, or jdbc.  Or another way of putting it: Robert's proposal
> is a protocol version break, just like most stuff at this level.  Trying
> to pretend it isn't doesn't make it not one.

Your argument here sounds suspiciously like "If we add a new feature
and people use it in a stupid way then it may cause their stuff not to
work".

Everything that worked before adding an option like _pq_.report
continues to work afterward.  Granted, if they try to use the option,
it will only work on versions that support that option, but that is
true of any new feature.  Furthermore, they will easily be able to
tell based on the reported server version whether or not their request
for different behavior was accepted by the server.  Therefore, if they
write their code well, there should be no danger of a client thinking
that they are getting behavior A while actually getting behavior B.
If we suppose that they write their code poorly, then there could well
be a problem, but drivers that contain bad code are a fairly serious
problem with or without this feature.  It's not like checking the
server version to see whether it's new enough to know about
_pq_.report is rocket science.

I agree that my follow-on proposal of dropping GUC_REPORT for some
GUCs IS a protocol version break, and that may be a good reason to
reject that part of the proposal, or postpone it until we bump to 3.1
for some other reason.  Note that commit
ae65f6066dc3d19a55f4fdcd3b30003c5ad8dbed contemplates a specific
scheme for making servers that speak 3.1 and later backward-compatible
with existing clients that want 3.0, so any client that requests 3.1
can be assumed to be comfortable with all 3.1 behaviors.  Of course,
it's not like we've always been similarly careful in the past: you
yourself committed patches adding GUC_REPORT reporting to various GUCs
over the years, and if you worried about what impact that would have
on naive clients, the commit log does not record it, and we certainly
didn't bump the protocol version to account for it.

If you want an example of a much more serious protocol version break,
you don't have to look any further than the major features section of
the v10 release notes.  With SCRAM, we don't have to fall back on weak
arguments like "driver authors might use it wrong!" to see how things
get broken.  They're just wildly broken, and we don't care, because
SCRAM is a good feature. v10 also broke the replication sub-protocol
both by adding facilities for logical replication (which is arguably a
backward-compatible change) and by changing the format of hot standby
feedback messages (which isn't, unless we think clients ought to
ignore differences in the message length and just look at the initial
bytes, but I'm not sure we're particularly careful about extending
replication messages only at the end, so that sounds like a risky
strategy).

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Tom Lane
Robert Haas  writes:
> Your argument here sounds suspiciously like "If we add a new feature
> and people use it in a stupid way then it may cause their stuff not to
> work".

I think you're attacking a straw man ...

> Everything that worked before adding an option like _pq_.report
> continues to work afterward.  Granted, if they try to use the option,
> it will only work on versions that support that option, but that is
> true of any new feature.  Furthermore, they will easily be able to
> tell based on the reported server version whether or not their request
> for different behavior was accepted by the server.

My point is specifically that that reasoning fails for features that you
might try to use to determine what the server version is, or that you
might try to use before finding out what the server version is.  For
example somebody might get cute and put an attempt to set _pq_.report
into their connection request packet.  It'll work fine as long as they
don't test against old servers.

Yes, you can code correctly if you recognize that the hazard exists,
I'm just worried about people failing to recognize that.  We don't
have any infrastructure for systematic testing of newer client code
against older server code, so it's something that bears worrying IMO.

So mostly what I'm objecting to is your claim that applying this feature
to server_version_num will do anything useful.  Leaving that aside,
it might well be a worthwhile idea.

regards, tom lane



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Evgeniy Shishkin


> On Jan 10, 2018, at 21:45, Robert Haas  wrote:
> 
> The documentation for max_parallel_workers_maintenance cribs from the
> documentation for max_parallel_workers_per_gather in saying that we'll
> use fewer workers than expected "which may be inefficient". 

Can we actually call it max_parallel_maintenance_workers instead?
I mean we don't have work_mem_maintenance.




Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Chapman Flack
On 01/10/2018 03:11 PM, Robert Haas wrote:

> it will only work on versions that support that option, but that is
> true of any new feature.  Furthermore, they will easily be able to
> tell based on the reported server version whether or not their request
> for different behavior was accepted by the server.  Therefore, if they
> write their code well, there should be no danger of a client thinking
> that they are getting behavior A while actually getting behavior B.

SSL certificates support a notion of 'extension' where a certificate
can include beyond-the-standard doodads that the party on the other
end might or might not understand, and they can be marked either
'critical' ("please refuse my connection if you don't understand
this one") or not ("we'll muddle along if you don't understand
that one").

Is there a notion like that in the pq protocol now? If not, and
a protocol bump becomes necessary to meet some need, would it be
worth adding such a notion at the same time, to simplify future
evolution?

-Chap



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Robert Haas
On Wed, Jan 10, 2018 at 3:22 PM, Tom Lane  wrote:
> My point is specifically that that reasoning fails for features that you
> might try to use to determine what the server version is, or that you
> might try to use before finding out what the server version is.

OK, I didn't understand that your objection was that narrow.

> For
> example somebody might get cute and put an attempt to set _pq_.report
> into their connection request packet.  It'll work fine as long as they
> don't test against old servers.

Even though I've referred to commit
ae65f6066dc3d19a55f4fdcd3b30003c5ad8dbed twice in this email thread so
far, this comment makes it look as though you haven't read it, or even
the commit message.  The startup packet would be the only place you
*could* put it.

> Yes, you can code correctly if you recognize that the hazard exists,
> I'm just worried about people failing to recognize that.  We don't
> have any infrastructure for systematic testing of newer client code
> against older server code, so it's something that bears worrying IMO.
>
> So mostly what I'm objecting to is your claim that applying this feature
> to server_version_num will do anything useful.  Leaving that aside,
> it might well be a worthwhile idea.

Well, the reason Craig wants server_version_num to be GUC_REPORT is
that it's simpler to parse than server_version, and therefore less
error-prone.  I discovered today that Greg Sabino Mullane had it as
GUC_REPORT in the 2006 patch that originally added it.  That was
commit 04912899e792094ed00766b99b6c604cadf9edf7, which articulated the
parsing-is-simpler justification explicitly.  You forced the removal
of GUC_REPORT back then, too (commit
5086dfceba79ecd5d1eb28b8f4ed5221838ff3a6).

But if we add this feature and somebody wants to use it for
server_version_num, it's really pretty simple.  In the startup packet,
you say _pq_.report=server_version_num.  Then, you call
PQparameterStatus(conn, "server_version_num").  If you don't get a
value, you try calling PQparameterStatus(conn, "server_version") and
extracting the second word.  If that still doesn't work then you give
up.  That doesn't seem either useless or difficult to implement
correctly from here.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Robert Haas
On Wed, Jan 10, 2018 at 3:32 PM, Chapman Flack  wrote:
> Is there a notion like that in the pq protocol now? If not, and
> a protocol bump becomes necessary to meet some need, would it be
> worth adding such a notion at the same time, to simplify future
> evolution?

For the fourth time in this thread,
https://git.postgresql.org/pg/commitdiff/ae65f6066dc3d19a55f4fdcd3b30003c5ad8dbed

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Tom Lane
Robert Haas  writes:
> But if we add this feature and somebody wants to use it for
> server_version_num, it's really pretty simple.  In the startup packet,
> you say _pq_.report=server_version_num.  Then, you call
> PQparameterStatus(conn, "server_version_num").  If you don't get a
> value, you try calling PQparameterStatus(conn, "server_version") and
> extracting the second word.  If that still doesn't work then you give
> up.  That doesn't seem either useless or difficult to implement
> correctly from here.

Yeah, but what's the point?  If you have to maintain the server_version
parsing code anyway, you're not saving any complexity with this.  You're
just creating a code backwater that you probably won't test adequately.

regards, tom lane



Re: AS OF queries

2018-01-10 Thread legrand legrand
> Sorry, truncate is not compatible with AS OF. It is performed at file
> level and deletes the old version.
> So if you want to use time travel, you should not use truncate. 

As time travel doesn't support truncate, I would prefer that to be checked.
If no check is performed, AS OF queries (with a timestamp before the truncate)
would return no data even when there was some: this could be considered a
wrong result.

If a truncate is detected, an error should be raised, saying the data is no
longer available before the truncate timestamp.


> Does it mean that no explicit check is needed that table metadata was
> not checked after specified timeslice? 

Not sure, it would depend on the metadata modification type ...
adding/dropping columns seems to work,
but what about altering a column type or dropping/recreating a table?

Regards
PAscal



--
Sent from: http://www.postgresql-archive.org/PostgreSQL-hackers-f1928748.html



Re: let's make the list of reportable GUCs configurable (was Re: Add %r substitution for psql prompts to show recovery status)

2018-01-10 Thread Robert Haas
On Wed, Jan 10, 2018 at 3:55 PM, Tom Lane  wrote:
> Robert Haas  writes:
>> But if we add this feature and somebody wants to use it for
>> server_version_num, it's really pretty simple.  In the startup packet,
>> you say _pq_.report=server_version_num.  Then, you call
>> PQparameterStatus(conn, "server_version_num").  If you don't get a
>> value, you try calling PQparameterStatus(conn, "server_version") and
>> extracting the second word.  If that still doesn't work then you give
>> up.  That doesn't seem either useless or difficult to implement
>> correctly from here.
>
> Yeah, but what's the point?  If yuou have to maintain the server_version
> parsing code anyway, you're not saving any complexity with this.  You're
> just creating a code backwater that you probably won't test adequately.

Well, you obviously don't buy the idea that parsing server_version_num
might be more reliable than parsing server_version.  If you did buy
that idea, you might want to use the more-reliable technique when
possible and fall back otherwise, but I think you've made up your mind
about this.  Anyway, a proposal like this gets us out of the business
of legislating what Everyone Must Do, which I think can only be a
plus.
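
For what it's worth, a minimal sketch of that client-side fallback, assuming
a libpq connection `conn` (the `_pq_.report` startup option is the proposed
feature, PQparameterStatus is existing API, and the version-string parsing
here is simplified and ignores pre-10 patch levels):

    #include <stdio.h>
    #include <stdlib.h>
    #include <libpq-fe.h>

    /* Return the server version as a number like 100001, or 0 if unknown. */
    static int
    get_server_version_num(PGconn *conn)
    {
        const char *val;
        int         major,
                    minor = 0;

        /* Preferred path: only reported if _pq_.report=server_version_num worked. */
        val = PQparameterStatus(conn, "server_version_num");
        if (val != NULL)
            return atoi(val);

        /* Fallback: parse server_version, e.g. "10.1", "9.6.5" or "11devel". */
        val = PQparameterStatus(conn, "server_version");
        if (val == NULL || sscanf(val, "%d.%d", &major, &minor) < 1)
            return 0;               /* give up */

        return (major >= 10) ? major * 10000 + minor
                             : major * 10000 + minor * 100;
    }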

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: PATCH: Configurable file mode mask

2018-01-10 Thread Peter Eisentraut
On 1/10/18 12:37, David Steele wrote:
> How about MakeDirectoryDefaultPerm()?  That's what I'll go with if I
> don't hear any other ideas.  The single call to MakeDirectoryPerm() will
> be reverted to mkdir() and I'll remove the function.

Works for me.

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Robert Haas
On Wed, Jan 10, 2018 at 3:29 PM, Evgeniy Shishkin  wrote:
>> On Jan 10, 2018, at 21:45, Robert Haas  wrote:
>> The documentation for max_parallel_workers_maintenance cribs from the
>> documentation for max_parallel_workers_per_gather in saying that we'll
>> use fewer workers than expected "which may be inefficient".
>
> Can we actually call it max_parallel_maintenance_workers instead?
> I mean we don't have work_mem_maintenance.

Good point.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Peter Geoghegan
On Wed, Jan 10, 2018 at 10:45 AM, Robert Haas  wrote:
> The addition to README.parallel is basically wrong, because workers
> have been allowed to write WAL since the ParallelContext machinery.
> See the
> XactLastRecEnd handling in parallel.c.  Workers can, for example, do
> HOT cleanups during SELECT scans, just as the leader can.  The
> language here is obsolete anyway in light of commit
> e9baa5e9fa147e00a2466ab2c40eb99c8a700824, but this isn't the right way
> to update it.  I'll propose a separate patch for that.

WFM.

> The change to the ParallelContext signature in parallel.h makes an
> already-overlength line even longer.  A line break seems warranted
> just after the first argument, plus pgindent afterward.

Okay.

> I am not a fan of the leader-as-worker terminology.  The leader is not
> a worker, full stop.  I think we should instead talk about whether the
> leader participates (so, ii_LeaderAsWorker -> ii_LeaderParticipates,
> for example, plus many comment updates).  Similarly, it seems
> SortCoordinateData's nLaunched should be nParticipants, and BTLeader's
> nworkertuplesorts should be nparticipanttuplesorts.

Okay.

> There is also the question of whether we want to respect
> parallel_leader_participation in this context.  The issues which might
> motivate the desire for such behavior in the context of a query do not
> exist when creating a btree index, so maybe we're just making this
> complicated. On the other hand, if some other type of parallel index
> build does end up doing a Gather-like operation then we might regret
> deciding that parallel_leader_participation doesn't apply to index
> builds, so maybe it's OK the way we have it.  On the third hand, the
> complexity of having the leader maybe-participate seems like it
> extends to a fair number of places in the code, and getting rid of all
> that complexity seems appealing.

I only added support for the leader-as-worker case because I assumed
that it was important to have CREATE INDEX process allocation work
"analogously" to parallel query, even though it's clear that the two
situations are not really completely comparable when you dig deep
enough. Getting rid of the leader participating as a worker has
theoretical downsides, but real practical upsides. I am also tempted
to just get rid of it.

> One place where this actually causes a problem is the message changes
> to index_build().  The revised ereport() violates translatability
> guidelines, which require that messages not be assembled from pieces.
> See 
> https://www.postgresql.org/docs/devel/static/nls-programmer.html#NLS-GUIDELINES

Noted. Another place where a worker Tuplesortstate in the leader
process causes problems is plan_create_index_workers(), especially
because of things like force_parallel_mode and
parallel_leader_participation.

> A comment added to tuplesort.h says that work_mem should be at least
> 64KB, but does not give any reason.  I think one should be given, at
> least briefly, so that someone looking at these comments in the future
> can, for example, figure out whether the comment is still correct
> after future code changes.  Or else, remove the comment.

The reason for needing to do this is that a naive division of
work_mem/maintenance_work_mem within a caller like nbtsort.c could,
in general, result in a workMem that is as low as 0 (due to integer
truncation of the result of a division). Clearly *that* is too low. In
fact, we need at least enough memory to store the initial minimal
memtuples array, which needs to respect ALLOCSET_SEPARATE_THRESHOLD.
There is also the matter of having per-tape space for
TAPE_BUFFER_OVERHEAD when we spill to disk (note also the special case
for pass-by-value datum sorts low on memory). There have been a couple
of unavoidable OOM bugs in tuplesort over the years already.

How about I remove the comment, but have tuplesort_begin_common()
force each Tuplesortstate to have workMem that is at least 64KB
(minimum legal work_mem value) in all cases? We can just formalize the
existing assumption that workMem cannot go below 64KB, really, and it
isn't reasonably to use so little workMem within a parallel worker (it
should be prevented by plan_create_index_workers() in the real world,
where parallelism is never artificially forced).

There is no need to make this complicated by worrying about whether or
not 64KB is the true minimum (value that avoids "can't happen"
errors), IMV.
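
To be concrete, the clamp in tuplesort_begin_common() could be as small as
the following (a sketch, assuming workMem is in kilobytes as elsewhere in
tuplesort.c, and not the final comment wording):

    /*
     * Callers may divide maintenance_work_mem among workers and end up
     * passing a tiny (even zero) workMem.  Never budget less than 64kB,
     * the minimum legal work_mem value; anything smaller can't even hold
     * the initial memtuples array.
     */
    state->allowedMem = Max(workMem, 64) * (int64) 1024;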

> + * Parallel sort callers are required to coordinate multiple tuplesort states
> + * in a leader process, and one or more worker processes.  The leader process
>
> I think the comma should be removed.  As written, it looks like we
> are coordinating multiple tuplesort states in a leader process, and,
> separately, we are coordinating one or more worker processes.

Okay.

> Generally, I think the comments in tuplesort.h are excellent.

Thanks.

> I really like the overview of how the new interfaces should be used,
> although I find it slightly wonk

Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Peter Geoghegan
On Wed, Jan 10, 2018 at 1:31 PM, Robert Haas  wrote:
>> Can we actually call it max_parallel_maintenance_workers instead?
>> I mean we don't have work_mem_maintenance.
>
> Good point.

WFM.

-- 
Peter Geoghegan



Re: Fix a Oracle-compatible instr function in the documentation

2018-01-10 Thread Tom Lane
I wrote:
> Evidently, they consider that negative beg_index indicates
> the last place where the target substring can *begin*, whereas
> our code thinks it is the last place where the target can *end*.

> After a bit of fooling around with it, I produced code that agrees
> with Oracle as far as I can tell, but it wouldn't be a bad idea
> for someone else to test it some more.

I spent some more time comparing this version's behavior with
rextester's Oracle instance, and couldn't find any other
discrepancies, so I pushed it.

regards, tom lane



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Robert Haas
On Wed, Jan 10, 2018 at 5:05 PM, Peter Geoghegan  wrote:
> How about I remove the comment, but have tuplesort_begin_common()
> force each Tuplesortstate to have workMem that is at least 64KB
> (minimum legal work_mem value) in all cases? We can just formalize the
> existing assumption that workMem cannot go below 64KB, really, and it
> isn't reasonable to use so little workMem within a parallel worker (it
> should be prevented by plan_create_index_workers() in the real world,
> where parallelism is never artificially forced).

+1.  I think this doesn't even need to be documented.  You can simply
write a comment that says something like /* Always allow each worker to use
at least 64kB.  If the amount of memory allowed for the sort is very
small, this might technically cause us to exceed it, but since it's
tiny compared to the overall memory cost of running a worker in the
first place, it shouldn't matter. */

> I share your general feelings on all of this, but I really don't know
> what to do about it. Which of these alternatives is the least worst,
> all things considered?

Let's get the patch committed without any explicit way of forcing the
number of workers and then think about adding that later.

It will be good if you and Rushabh can agree on who will produce the
next version of this patch, and also if I have some idea when that
version should be expected.  On another point, we will need to agree
on how this should be credited in an eventual commit message.  I do
not agree with adding Heikki as an author unless he contributed code,
but we can credit him in some other way, like "Thanks are also due to
Heikki Linnakangas for significant improvements to X, Y, and Z that
made this patch possible."  I assume the author credit will be "Peter
Geoghegan, Rushabh Lathia" in that order, but let me know if anyone
thinks that isn't the right idea.

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Alvaro Herrera
Robert Haas wrote:

> + * To support parallel sort operations involving coordinated callers to
> + * tuplesort.c routines across multiple workers, it is necessary to
> + * concatenate each worker BufFile/tapeset into one single leader-wise
> + * logical tapeset.  Workers should have produced one final materialized
> + * tape (their entire output) when this happens in leader; there will always
> + * be the same number of runs as input tapes, and the same number of input
> + * tapes as workers.
> 
> I can't interpret the word "leader-wise".  A partition-wise join is a
> join done one partition at a time, but a leader-wise logical tape set
> is not done one leader at a time.  If there's another meaning to the
> affix -wise, I'm not familiar with it.  Don't we just mean "a single
> logical tapeset managed by the leader"?

https://www.merriam-webster.com/dictionary/-wise
-wise
adverb combining form
Definition of -wise
1 a : in the manner of <crabwise, fanwise>
b : in the position or direction of <slantwise, clockwise>
2 : with regard to : in respect of <dollarwise>

I think "one at a time" is not the right way to interpret the affix.
Rather, a "partitionwise join" is a join done "in the manner of
partitions", that is, the characteristics of the partitions are
considered when the join is done.

I'm not defending the "leader-wise" term here, though, because I can't
make sense of it, regardless of how I interpret the -wise affix.

-- 
Álvaro Herrera    https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: [HACKERS] WIP: Separate log file for extension

2018-01-10 Thread Tom Lane
Antonin Houska  writes:
> After having read the thread on your patch I think that the reason you were
> asked to evaluate performance was that your patch can possibly make syslogger
> a bottleneck. In contrast, my patch does not prevent user from disabling the
> syslogger if it (the syslogger) seems to cause performance issues.

Just to clarify that: we know that in workloads that emit lots of log
output, the log collector *already is* a bottleneck; there are reports
that some people can't use it because it's too slow.  See e.g. towards
the end of this thread:

https://www.postgresql.org/message-id/flat/CABUevExztL0GORyWM9S4tR_Ft3FmJbRaxQdxj%2BBQZjpvmRurdw%40mail.gmail.com

and particularly the referenced thread from 2011.  (I seem to recall other
reports but didn't have much luck finding them.)

I'm quite concerned by the proposed patch, and not so much about any
performance issues; what bothers me is that it adds complexity into a
portion of the system where we can ill afford it.  Bugs in the logging
mechanism compromise one's ability to have any faith in tracking down
other bugs.  The difficulty of reconfiguring the logger on the fly
is another reason to not want more configuration options for it.
On the whole, therefore, I'd just as soon not go there --- especially
seeing that there's been little field demand for such features.

regards, tom lane



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Peter Geoghegan
On Wed, Jan 10, 2018 at 2:21 PM, Robert Haas  wrote:
>> I share your general feelings on all of this, but I really don't know
>> what to do about it. Which of these alternatives is the least worst,
>> all things considered?
>
> Let's get the patch committed without any explicit way of forcing the
> number of workers and then think about adding that later.

It could be argued that you need some way of forcing low memory in
workers with any committed version. So while this sounds reasonable,
it might not be compatible with throwing out what I've done with
force_parallel_mode up-front, before you commit anything. What do you
think?

> It will be good if you and Rushabh can agree on who will produce the
> next version of this patch, and also if I have some idea when that
> version should be expected.

I'll take it.

> On another point, we will need to agree
> on how this should be credited in an eventual commit message.  I do
> not agree with adding Heikki as an author unless he contributed code,
> but we can credit him in some other way, like "Thanks are also due to
> Heikki Linnakangas for significant improvements to X, Y, and Z that
> made this patch possible."

I agree that I should have been more nuanced with this. Here's what I intended:

Heikki is not the author of any of the code in the final commit, but
he is morally a (secondary) author of the feature as a whole, and
should be credited as such within the final release notes. This is
justified by the history here, which is that he was involved with the
patch fairly early on, and did some work that was particularly
important to the feature, work that almost certainly would not otherwise
have happened. Sure, it helped the serial case too, but much less so.
That's really not why he did it.

> I assume the author credit will be "Peter
> Geoghegan, Rushabh Lathia" in that order, but let me know if anyone
> thinks that isn't the right idea.

"Peter Geoghegan, Rushabh Lathia" seems right. Thomas did write a very
small amount of the actual code, but I think it was more of a review
thing (he is already credited as a reviewer).

-- 
Peter Geoghegan



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Peter Geoghegan
On Wed, Jan 10, 2018 at 2:36 PM, Alvaro Herrera  wrote:
> I think "one at a time" is not the right way to interpret the affix.
> Rather, a "partitionwise join" is a join done "in the manner of
> partitions", that is, the characteristics of the partitions are
> considered when the join is done.
>
> I'm not defending the "leader-wise" term here, though, because I can't
> make sense of it, regardless of how I interpret the -wise affix.

I've already conceded the point, but fwiw "leader-wise" comes from the
idea of having a leader-wise space after concatenating the worker
tapes (which have their original/worker-wise space). We must apply an
offset to get from a worker-wise offset to a leader-wise offset.

This made more sense in an earlier version. I overlooked this during
recent self review.

-- 
Peter Geoghegan



Re: Fix a Oracle-compatible instr function in the documentation

2018-01-10 Thread Tatsuo Ishii
> I spent some more time comparing this version's behavior with
> rextester's Oracle instance, and couldn't find any other
> discrepancies, so I pushed it.

Great!

I agree with your commit message:

> Back-patch to all supported branches.  Although this patch only touches
> documentation, we should probably call it out as a bug fix in the next
> minor release notes, since users who have adopted the functions will
> likely want to update their versions.

Best regards,
--
Tatsuo Ishii
SRA OSS, Inc. Japan
English: http://www.sraoss.co.jp/index_en.php
Japanese:http://www.sraoss.co.jp



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Thomas Munro
On Thu, Jan 11, 2018 at 11:42 AM, Peter Geoghegan  wrote:
> "Peter Geoghegan, Rushabh Lathia" seems right. Thomas did write a very
> small amount of the actual code, but I think it was more of a review
> thing (he is already credited as a reviewer).

+1

-- 
Thomas Munro
http://www.enterprisedb.com



Re: BUG #14941: Vacuum crashes

2018-01-10 Thread Michael Paquier
On Wed, Jan 10, 2018 at 05:26:43PM +, Bossart, Nathan wrote:
> Right.  I don't have a terribly strong opinion either way.  I think the
> counter-argument is that logging skipped relations might provide
> valuable feedback to users.  If I ran a database-wide VACUUM and a
> relation was skipped due to lock contention, it might be nice to know
> that so I can handle the relation individually.

Or users may simply not want information they did not ask for. When
running a database-wide VACUUM or ANALYZE you don't know exactly which
relations it is working on, and getting information about things you
didn't know about beforehand can be considered misinformation.

Perhaps others have different opinions. Sawada-san?

> Perhaps any logging changes for VACOPT_NOWAIT could be handled in a
> separate thread.  For now, I've updated 0003 to remove the logging
> changes.

Thanks. I am marking those as ready for committer; you are providing the
patch set which offers the most consistent experience.
--
Michael




Re: [HACKERS] PATCH: psql tab completion for SELECT

2018-01-10 Thread Edmund Horner
On 11 January 2018 at 03:28, Vik Fearing  wrote:
> On 01/10/2018 06:38 AM, Edmund Horner wrote:
>> Regarding support for older versions, psql fails silently if a tab
>> completion query fails.
>
> No it doesn't, see below.
>
>> We could just let it do this, which is what
>> happens with, for example, ALTER PUBLICATION against a 9.6 server.  I
>> can't see any existing completions that check the server version --
>> but completions that don't work against older versions, like ALTER
>> PUBLICATION, also aren't useful for older versions.  SELECT is a bit
>> different as it can be useful against older versions that don't have
>> the pg_aggregate.aggcombinefn that my query uses for filtering out
>> aggregation support functions.
>
> That's a bug in my opinion, but not one that needs to be addressed by
> this patch.
>
> There are no completions that cater to the server version (yet) but all
> the \d stuff certainly does.  You can see that in action in
> src/bin/psql/describe.c as well as all over the place in pg_dump and the
> like.
>
>> There's also the small irritation that when a completion query fails,
>> it aborts the user's current transaction to provide an incentive for
>> handling older versions gracefully.
>
> That is actively hostile and not at all what I would consider "silently
> failing".  If you don't want to do the multiple versions thing, you
> should at least check if you're on 9.6+ before issuing the query.

There's also the less-critical problem that you can't complete
anything from an already-failed transaction.

Let's just talk about a separate patch that might improve this.

I can think of two options:

1. Use a separate connection for completions.  The big problem with
this is people want to complete on objects created in their current
transaction.  Maybe there's a way to use SET TRANSACTION SNAPSHOT to
access the user's transaction but this seems far too intrusive just
for a bit of tab completion.

2. Use savepoints.  In exec_query you'd have:

SAVEPOINT _psql_tab_completion;
-- run the completion query
ROLLBACK TO _psql_tab_completion;        -- just in case there was an error, but safe to do anyway
RELEASE SAVEPOINT _psql_tab_completion;  -- this may not be worth doing

It doesn't help with tab completion in already-failed transactions.
But would it be a reasonable way to make tab completion safer?  I
don't know whether savepoint creation/rollback/release has some
cumulative cost that we'd want to avoid incurring.
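
At the C level in tab-complete.c that might look roughly like this (a sketch
only: exec_query and query_buffer stand in for the existing code paths, while
PQtransactionStatus/PQexec/PQclear are ordinary libpq calls):

    PGresult   *result;
    bool        in_xact =
        (PQtransactionStatus(pset.db) == PQTRANS_INTRANS);

    /* Shield the user's open transaction from a failing completion query. */
    if (in_xact)
        PQclear(PQexec(pset.db, "SAVEPOINT _psql_tab_completion"));

    result = exec_query(query_buffer);

    if (in_xact)
    {
        /* Roll back in case the query failed, then discard the savepoint. */
        PQclear(PQexec(pset.db, "ROLLBACK TO _psql_tab_completion"));
        PQclear(PQexec(pset.db, "RELEASE SAVEPOINT _psql_tab_completion"));
    }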



Re: [HACKERS] [PATCH] Generic type subscripting

2018-01-10 Thread Tom Lane
Andres Freund  writes:
> You might not love me for this suggestion, but I'd like to see the
> renaming here split from the rest of the patch. There's a lot of diff
> that's just more or less automatic changes, making it hard to see the
> actual meaningful changes.

Yeah, I'm beginning to wonder if we should do the renaming at all.
It's useful for being sure we've found everyplace that needs to change
... but if lots of those places don't actually need more than the
name changes, maybe it's just make-work and code thrashing.


There's a set of other issues that are starting to bother me.
Perhaps it's not in this patch's charter to resolve them, but I think
we need to figure out whether that's true.  It's a bit hard to
explain clearly, but let me see how well I can state these:

* The complaint I had about the "container" terminology isn't just
terminology.  Rather, there is a bunch of knowledge in the system that
some data types can be found embedded in other types; for one example,
see find_composite_type_dependencies.  In the case of standard arrays,
it's clear that the array type does contain its element type in this
sense, and we'd better be aware of that in situations such as applying
DDL that changes the element type.  It's much less clear what it means
if you say that type X has a subscripting function that yields type Y.
I think the issue can be ignored as long as Y is not a type mutable by
any provided DDL commands, but is that OK as a permanent restriction?
If not, do we need to do anything about it in the v1 patch?  If we don't,
do we need to enforce some restriction on what Y can be for types
other than true arrays?

* There are other identifiable array-specific behaviors that people
might expect a generic subscripting feature to let them into.
For example, if we allow JSONB to be subscripted to obtain TEXT,
does that mean a polymorphic function f(x anyarray) should now match
JSONB?  It's even more fun if you decide that subscripting JSONB
should yield JSONB, which is probably a more useful definition than
TEXT really.  Then ANYARRAY and ANYELEMENT would need to be the same
type, which is likely to blow holes in the polymorphic type code,
though I've not looked closely for problems.  In the same vein, if
JSONB is subscriptable, should "x = ANY(y)" work for y of type JSONB?
I'm not actually sure that we'd want these sorts of things to happen,
even as follow-on extensions.  For instance, a polymorphic function
f(x anyarray) would very likely think it can apply array_dims(x) or
iterate from array_lower(x,1) to array_upper(x,1).  Providing it
a subscripting function won't get you far if the subscripts aren't
consecutive integers.

* There's an awful lot of places in the backend that call get_element_type
or get_base_element_type or type_is_array or type_is_array_domain, and
aren't touched by the patch as it stands.  Some of them might represent
dependencies that we need to worry about that don't fall under either of
the above categories.  So just touching the places that mess with ArrayRef
isn't getting us real far in terms of being sure we've considered
everything that needs considering.
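
To make the earlier anyarray point concrete, here is the sort of perfectly
ordinary polymorphic function (a sketch, not taken from the patch) that would
be broken if JSONB were allowed to match anyarray, because it assumes
array_lower() and consecutive integer subscripts:

    CREATE FUNCTION first_elem(x anyarray) RETURNS anyelement AS $$
        SELECT (x)[array_lower(x, 1)];
    $$ LANGUAGE sql;

    SELECT first_elem(ARRAY[3,1,2]);              -- fine for a real array: 3
    -- SELECT first_elem('{"a": 1}'::jsonb);      -- could never work as intended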

regards, tom lane



Re: [PATCH] GET DIAGNOSTICS FUNCTION_NAME

2018-01-10 Thread Yugo Nagata
On Sun, 31 Dec 2017 11:57:02 -0500
Tom Lane  wrote:

> Yugo Nagata  writes:
> > Attached is a patch to implement a feature to get the current function
> > name by GET DIAGNOSTICS in PL/pgSQL function.
> 
> While this is certainly not a very large patch, it's still code that
> we'd have to maintain forever, so I think it's appropriate to ask
> some harder questions before accepting it.
> 
> 1. I'm having a hard time visualizing an actual concrete use case for
> this --- exactly when would a function not know its own name?  Neither
> your "our client wanted it" justification nor the cited stackoverflow
> question provide anything close to an adequate rationale.  I can think of
> concrete uses for an operation like "give me the name of my immediate
> caller", but that's not what this is.

Our client's use case was mainly to output debug messages at the beginning and
end of functions using the same code. In addition, the names of cursors
declared in the function were based on the function name, and they wanted
to get the function name in order to handle those cursors.
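
For illustration, the debug-message pattern looks roughly like this, using the
proposed (not yet existing) FUNCTION_NAME diagnostics item; the function name
here is made up:

    CREATE OR REPLACE FUNCTION process_orders() RETURNS void AS $$
    DECLARE
        fname text;
    BEGIN
        GET DIAGNOSTICS fname = FUNCTION_NAME;   -- proposed item, per this patch
        RAISE DEBUG 'entering %', fname;
        -- ... function body ...
        RAISE DEBUG 'leaving %', fname;
    END;
    $$ LANGUAGE plpgsql;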

However, I don't insist on this patch, so if no one else needs this
feature, I'll withdraw it.

Regards,

> 
> 2. The specific semantics you've chosen --- in effect, regprocedureout
> results --- seem to be more because that was already available than
> anything else.  I can imagine wanting just the bare name, or the
> schema-qualified name, or even the numeric OID (if we're in the
> business of introspection, being able to look up the function's own
> pg_proc entry might be useful).  I'm not proposing that we offer
> all those variants, certainly, but without concrete use cases it's
> pretty hard to be sure we picked the most useful behavior.
> 
> 3. In connection with #2, I'm dubious that FUNCTION_NAME is le mot
> juste, because that would seem to imply that it is just the name,
> which it isn't.  If we stick with the regprocedureout semantics
> I'd be inclined to propose FUNCTION_SIGNATURE.
> 
>   regards, tom lane
> 


-- 
Yugo Nagata 



Re: BUG #14941: Vacuum crashes

2018-01-10 Thread Masahiko Sawada
On Thu, Jan 11, 2018 at 8:14 AM, Michael Paquier
 wrote:
> On Wed, Jan 10, 2018 at 05:26:43PM +, Bossart, Nathan wrote:
>> Right.  I don't have a terribly strong opinion either way.  I think the
>> counter-argument is that logging skipped relations might provide
>> valuable feedback to users.  If I ran a database-wide VACUUM and a
>> relation was skipped due to lock contention, it might be nice to know
>> that so I can handle the relation individually.
>
> Or users may not care about getting information they don't care
> about. When running a database-wide VACUUM or ANALYZE you don't know
> what is exactly the list of relations this is working on. Getting
> information about things you don't know beforehand can be considered as
> misinformation.
>
> Perhaps others have different opinions. Sawada-san?

Hmm, I agree that the v4 patch is simpler and more consistent with the
current behavior. The logging is not necessary if a relation has been
dropped, but I think it is necessary if a relation exists but a
database-wide VACUUM or ANALYZE could not take the lock on it. I can
imagine a use case where a user doesn't rely on autovacuum and does
manual vacuums in a maintenance window (per week or per day). In that
case the log message about skipped relations would be useful. The user
might not be interested in the exact list of relations being processed,
but might well be interested in the relations that were skipped.

IIUC, since autovacuum always passes a list of VacuumRelation entries
(of length 1, with a non-NULL RangeVar) to vacuum(), it certainly emits
that log if it cannot take the lock on the relation. But with the v4
patch, that log message is not emitted only when the user does a
database-wide VACUUM or ANALYZE and the lock cannot be taken. On the
other hand, the downside of emitting that log even when relations ==
NULL is that we would also emit it for toast tables and child tables,
which would annoy users with unnecessary information. To deal with
that, if NOWAIT is specified we could fill in the RangeVar in
get_all_vacuum_rels and make vacuum_rel not emit the "relation no
longer exists" log. That way we would emit the log for database-wide
VACUUM/ANALYZE while not emitting it for toast tables and child tables.
But I'm not sure this direction (the v3 patch plus such tricks) is
worth the effort. I'd like to hear opinions.

Regards,

--
Masahiko Sawada
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center



Re: refactor subscription tests to use PostgresNode's wait_for_catchup

2018-01-10 Thread Peter Eisentraut
On 1/8/18 23:47, Michael Paquier wrote:
>> @@ -1505,7 +1515,7 @@ sub wait_for_catchup
>>. $target_lsn . " on "
>>. $self->name . "\n";
>>  my $query =
>> -qq[SELECT '$target_lsn' <= ${mode}_lsn FROM pg_catalog.pg_stat_replication 
>> WHERE application_name = '$standby_name';];
>> +qq[SELECT $lsn_expr <= ${mode}_lsn FROM pg_catalog.pg_stat_replication 
>> WHERE application_name = '$standby_name';];
>>  $self->poll_query_until('postgres', $query)
>>or die "timed out waiting for catchup, current location is "
>>. ($self->safe_psql('postgres', $query) || '(unknown)');
> 
> This log is wrong from the beginning. Here $query returns a boolean
> status and not a location. I think that when the poll dies because of a
> timeout you should do a lookup at ${mode}_lsn from pg_stat_replication
> when application_name matching $standby_name. Could you fix that as
> well?

Should we just remove it?  Apparently, it was never functional to begin
with.  Otherwise, we'd have to write a second query to return the value
to print.  wait_for_slot_catchup has the same issue.  Seems like a lot
of overhead for something that has never been used.
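
If we did keep it, the second query would be something along these lines (a
sketch reusing the surrounding wait_for_catchup variables):

	my $lsn_query =
	    qq[SELECT ${mode}_lsn FROM pg_catalog.pg_stat_replication ]
	  . qq[WHERE application_name = '$standby_name';];
	$self->poll_query_until('postgres', $query)
	  or die "timed out waiting for catchup, current location is "
	  . ($self->safe_psql('postgres', $lsn_query) || '(unknown)');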

> Could you also update promote_standby in RewindTest.pm? Your refactoring
> to use pg_current_wal_lsn() if a target_lsn is not possible makes this
> move possible. Using the generic APIs gives better logs as well.

Right.

-- 
Peter Eisentraut  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services



Re: refactor subscription tests to use PostgresNode's wait_for_catchup

2018-01-10 Thread Michael Paquier
On Wed, Jan 10, 2018 at 09:45:56PM -0500, Peter Eisentraut wrote:
> On 1/8/18 23:47, Michael Paquier wrote:
> Should we just remove it?  Apparently, it was never functional to begin
> with.  Otherwise, we'd have to write a second query to return the value
> to print.  wait_for_slot_catchup has the same issue.  Seems like a lot
> of overhead for something that has never been used.

Fine for me to remove it.
--
Michael




Re: pl/perl extension fails on Windows

2018-01-10 Thread Noah Misch
On Sun, Dec 10, 2017 at 11:46:08AM -0800, Noah Misch wrote:
> On Sun, Dec 10, 2017 at 12:36:13PM +, Christian Ullrich wrote:
> > * Noah Misch wrote:
> > > On Wed, Nov 29, 2017 at 11:45:35PM -0500, Tom Lane wrote:
> > >> Oh, OK.  In that case, we need to get some representatives of these
> > >> more modern builds into the buildfarm while we're at it.
> > > 
> > > Yep.  Among machines already in the buildfarm, the one running member
> > > woodlouse is the best candidate for this.  Its owner could install
> > > http://strawberryperl.com/download/5.26.1.1/strawberry-perl-5.26.1.1-32bit.msi
> > > and setup another animal on the same machine that builds 32-bit and 
> > > enables
> > > Perl.  Christian, are you interested in doing this?
> > 
> > Ready to go, waiting for animal assignment. For now, I can confirm that it 
> > works, that is, the buildfarm --test run is successful.
> 
> Thanks!

Did the animal assignment come through?  I don't see such an animal reporting.



Re: [HACKERS] UPDATE of partition key

2018-01-10 Thread David Rowley
Thanks for making those changes.

On 11 January 2018 at 04:00, Amit Khandekar  wrote:
> Yes, I understand that there won't be any update scan plans. But, with
> the modifications done in ExecInitModifyTable(), I wanted to run that
> code with this scenario where there are no partitions, to make sure it
> does not behave weirdly or crash. Any suggestions for comments, given
> this perspective ? For now, I have made the comment this way:
>
> -- Check that partition-key UPDATE works sanely on a partitioned table
> that does not have any child partitions.

Sounds good.

>> 18. Why two RESET SESSION AUTHORIZATIONs?
>>
>> reset session authorization;
>> drop trigger trig_d_1_15 ON part_d_1_15;
>> drop function func_d_1_15();
>> -- Policy expression contains SubPlan
>> reset session authorization;
>
> The second reset is actually in a different paragraph. The reason it's
> there is to ensure we have reset it regardless of the earlier cleanup.

hmm, I was reviewing the .out file, which does not have the empty
lines. Still seems a bit surplus.

> Attached v35 patch. Thanks.

Thanks. I'll try to look at it soon.

-- 
 David Rowley   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services



Re: [HACKERS] UPDATE of partition key

2018-01-10 Thread Amit Khandekar
On 11 January 2018 at 10:44, David Rowley  wrote:
>>> 18. Why two RESET SESSION AUTHORIZATIONs?
>>>
>>> reset session authorization;
>>> drop trigger trig_d_1_15 ON part_d_1_15;
>>> drop function func_d_1_15();
>>> -- Policy expression contains SubPlan
>>> reset session authorization;
>>
>> The second reset is actually in a different paragraph. The reason it's
>> there is to ensure we have reset it regardless of the earlier cleanup.
>
> hmm, I was reviewing the .out file, which does not have the empty
> lines. Still seems a bit surplus.

I believe the output file does not have the blank lines present in the
.sql file. I was referring to the paragraph in the *.sql* file.

-- 
Thanks,
-Amit Khandekar
EnterpriseDB Corporation
The Postgres Database Company



Re: [HACKERS] Parallel tuplesort (for parallel B-Tree index creation)

2018-01-10 Thread Rushabh Lathia
On Thu, Jan 11, 2018 at 3:35 AM, Peter Geoghegan  wrote:

> On Wed, Jan 10, 2018 at 1:31 PM, Robert Haas 
> wrote:
> >> Can we actually call it max_parallel_maintenance_workers instead?
> >> I mean we don't have work_mem_maintenance.
> >
> > Good point.
>
> WFM.
>
>
This is a good point. I agree with max_parallel_maintenance_workers.


> --
> Peter Geoghegan
>



-- 
Rushabh Lathia


Re: [HACKERS] Proposal: Local indexes for partitioned table

2018-01-10 Thread David Rowley
On 11 January 2018 at 04:37, Alvaro Herrera  wrote:
> In prior incarnations of the patch, I had an if-test to prevent
> attaching invalid indexes, but I decided to remove it at some point --
> mainly thinking of attaching a partition for which a CREATE INDEX
> CONCURRENTLY was running which already had the index as invalid and was
> later expected to become valid.  I suppose that doesn't really work
> anyway because of locking considerations (you can't attach a partition
> in which CIC is concurrently running, can you).  I'll think some more
> about this case and post an updated version later.

I guess CIC will need to check if the index has a parent index when
setting indisvalid = true, and do likewise to the parent index if all
other siblings are valid.

-- 
 David Rowley   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services



numeric regression test passes, but why?

2018-01-10 Thread Chapman Flack
I see there are some tests in src/test/regress:

sql/numeric.sql
expected/numeric.out

They pass. I see "numeric ... ok" in a make check.

I do not doubt they are being run, because if I edit numeric.sql
and fudge some digits, say around

-- cases that used to error out
select 0.12 ^ (-25);
select 0.5678 ^ (-85);

and I change the -25 to -26, numeric then fails in my next make check.

All that seems just as it should be. Why, then, if I try to duplicate
those exact tests in an interactive session, would this happen:

postgres=# select 0.12 ^ (-25);
ERROR:  division by zero
STATEMENT:  select 0.12 ^ (-25);

postgres=# select 0.5678 ^ (-85);
ERROR:  division by zero
STATEMENT:  select 0.5678 ^ (-85);

... they error out, that is, do exactly the thing the tests are there
to make sure they do not.

Is there some special GUC setting in effect during the make check
that would be different in my ordinary session? What else could
be different? This is making me question my sanity.

-Chap



Re: [HACKERS] Planning counters in pg_stat_statements

2018-01-10 Thread Haribabu Kommi
On Tue, Nov 7, 2017 at 4:10 PM, Thomas Munro 
wrote:

> Hi hackers,
>
> I have often wanted $SUBJECT and was happy to find that Fujii-san had
> posted a patch five years ago[1].  The reception then seemed positive.
> So here is a refurbished and (hopefully) improved version of his patch
> with a new column for the replan count.  Thoughts?
>

Thanks for the patch.

-DATA = pg_stat_statements--1.4.sql pg_stat_statements--1.4--1.5.sql \
+DATA = pg_stat_statements--1.5.sql pg_stat_statements--1.4--1.5.sql \

The current version of pg_stat_statements is already 1.5, so this patch
should increase it to 1.6.

+/* contrib/pg_stat_statements/pg_stat_statements--1.5.sql */

Add the contents of the pg_stat_statements--1.4--1.5.sql file here;
otherwise, when the user installs this as a new extension, they will lose
those changes.

+OUT plans int8,

Adding this column is good for finding out how many times the plan has been
generated for the same query. But I am thinking the "plans" column name may
be confusing?

+OUT plan_time float8,

I am fine with the current name too; just checking, how about
"planning_time", similar to EXPLAIN?


+ 
+  plan_time
+  double precision
+  
+  Total time spent planning, in milliseconds
+ 

How about "Total time spent in the statement planning"?

And also it needs an update for the total_time column as
"Total time spent in the statement execution".

+
 /* 
  * PlannedStmt node
  *

Extra change.


Regards,
Hari Babu
Fujitsu Australia


RE: pl/perl extension fails on Windows

2018-01-10 Thread Christian Ullrich
* From: Noah Misch [mailto:n...@leadboat.com]

> > > Ready to go, waiting for animal assignment. For now, I can
> confirm that it works, that is, the buildfarm --test run is
> successful.

> Did the animal assignment come through?  I don't see such an animal
> reporting.

No, not yet. Sorry, I lost track of it, or I would have mentioned it again 
earlier.

-- 
Christian


pg_get_functiondef forgets about most GUC_LIST_INPUT GUCs

2018-01-10 Thread Michael Paquier
Hi all,

While reviewing another patch related to the use of pg_strcasecmp in the
backend, I have noticed this bit in ruleutils.c:

/*
 * Some GUC variable names are 'LIST' type and hence must not
 * be quoted.
 */
if (pg_strcasecmp(configitem, "DateStyle") == 0
|| pg_strcasecmp(configitem, "search_path") == 0)
appendStringInfoString(&buf, pos);
else
simple_quote_literal(&buf, pos);

However this fails to consider all GUCs marked as GUC_LIST_INPUT, like
the recent wal_consistency_checking. guc.c already holds a find_option()
which can be used to retrieve the flags of a parameter. What about using
that and filtering by GUC_LIST_INPUT? This requires exposing the
function, and I am not sure what people think about that.
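
For illustration, assuming find_option() (or a thin wrapper around it) were
exported from guc.c, the ruleutils.c bit could become something like:

    struct config_generic *record;

    /*
     * Treat any GUC flagged as GUC_LIST_INPUT as a list, instead of
     * hard-coding DateStyle and search_path.
     */
    record = find_option(configitem, false, WARNING);
    if (record && (record->flags & GUC_LIST_INPUT))
        appendStringInfoString(&buf, pos);
    else
        simple_quote_literal(&buf, pos);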

Thoughts?
--
Michael




Re: [HACKERS] Restricting maximum keep segments by repslots

2018-01-10 Thread Kyotaro HORIGUCHI
Hello. Thank you for the comment.

(And sorry for the absense.)

At Fri, 22 Dec 2017 15:04:20 +0300, Sergei Kornilov  wrote in 
<337571513944...@web55j.yandex.ru>
> Hello
> I think limiting the WAL kept by replication slots is useful in some cases.
> But at first I was confused by the proposed terminology:
> secured/insecured/broken/unknown state.

I'm not confident about the terminology. Suggestions for wording that
makes more sense are welcome.

> patch -p1 gives some "Stripping trailing CRs from patch"
> messages for me, but applied to current HEAD and builds. After

Hmm. I wonder why I get that complaint so often. (Is it rather
common, or caused by the MIME format of my mail?)  I'd say with some
confidence that it is because you retrieved the patch file with a
Windows mailer.

> little testing I understood the difference in the
> secured/insecured/broken terminology. Secured means a guarantee to
> keep WAL, insecure - WAL may be deleted with the next checkpoint,
> broken - WAL already deleted.

Right. I'm sorry that I haven't written that down clearly anywhere and
made you confirm it yourself. I added documentation as the fourth
patch.

> I think we may split "secure" into "streaming"
> and... hmm... "waiting"? "keeping"? - according to the active flag, for
> a clearer and more readable "status" field.

streaming / keeping and lost? (and unknown)  Also, "status" surely
has a somewhat obscure meaning. Would wal_status (or
(wal_)availability) and min_keep_lsn make more sense?

The additional fields in pg_replication_slots have been changed
as follows in the attached patch.

  confirmed_flush_lsn:
+ wal_status : (streaming | keeping | lost | unknown)
+ min_keep_lsn   : 
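
For instance, with this patch applied one could check a slot's retention
state like so (a sketch; column names per this version of the patch):

  SELECT slot_name, active, wal_status, min_keep_lsn
  FROM pg_replication_slots;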


The changes of documentation are seen in the following html files.

doc/src/sgml/html/warm-standby.html#STREAMING-REPLICATION-SLOTS
doc/src/sgml/html/runtime-config-replication.html#GUC-MAX-SLOT-WAL-KEEP-SIZE
doc/src/sgml/html/view-pg-replication-slots.html


One annoyance is that the min_keep_lsn always has the same value
among all slots.


regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 95e2a5eec2cfd9cbcafbac8617bd5ccdecbed6d2 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi 
Date: Thu, 21 Dec 2017 21:20:20 +0900
Subject: [PATCH 1/4] Add WAL relief vent for replication slots

Adds a capability to limit the number of segments kept by replication
slots by a GUC variable.
---
 src/backend/access/transam/xlog.c | 114 +-
 src/backend/utils/misc/guc.c  |  11 +++
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/xlog.h |   1 +
 4 files changed, 107 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e42b828..bdb7156 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -105,6 +105,7 @@ int			wal_level = WAL_LEVEL_MINIMAL;
 int			CommitDelay = 0;	/* precommit delay in microseconds */
 int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 int			wal_retrieve_retry_interval = 5000;
+int			max_slot_wal_keep_size_mb = 0;
 
 #ifdef WAL_DEBUG
 bool		XLOG_DEBUG = false;
@@ -861,6 +862,7 @@ static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 static void LocalSetXLogInsertAllowed(void);
 static void CreateEndOfRecoveryRecord(void);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
+static XLogSegNo GetMinKeepSegment(XLogRecPtr currpos, XLogRecPtr minSlotPtr);
 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 
@@ -9348,6 +9350,74 @@ CreateRestartPoint(int flags)
 }
 
 /*
+ * Returns the minimum segment number the next checkpoint must leave, considering
+ * wal_keep_segments, replication slots and max_slot_wal_keep_size.
+ */
+static XLogSegNo
+GetMinKeepSegment(XLogRecPtr currpos, XLogRecPtr minSlotPtr)
+{
+	uint64 keepSegs;
+	XLogSegNo currSeg;
+	XLogSegNo tailSeg;
+	uint64 slotlimitbytes;
+	uint64 slotlimitfragment;
+	uint64 currposoff;
+	XLogRecPtr slotpos = minSlotPtr;
+	XLogSegNo	slotSeg;
+
+	Assert(wal_keep_segments >= 0);
+	Assert(max_slot_wal_keep_size_mb >= 0);
+
+	XLByteToSeg(currpos, currSeg, wal_segment_size);
+	XLByteToSeg(slotpos, slotSeg, wal_segment_size);
+
+	/*
+	 * wal_keep_segments keeps more segments than slot, slotpos is no longer
+	 * useful. Don't perform subtraction to keep values positive.
+	 */
+	if (slotpos != InvalidXLogRecPtr && currSeg <= slotSeg + wal_keep_segments)
+		slotpos = InvalidXLogRecPtr;
+
+	/* slots aren't useful, consider only wal_keep_segments */
+	if (slotpos == InvalidXLogRecPtr)
+	{
+		/* avoid underflow, don't go below 1 */
+		if (currSeg <= wal_keep_segments)
+			return 1;
+
+		return currSeg - wal_keep_segments;
+	}
+
+	/* just return slotSeg if we don't put a limit */
+	if (max_slot_wal_keep_size_mb == 0)
+		return slotSeg;
+
+	/*
+	 * Slot limit is defined and slot gives the oldest segment to keep,
+	 * cal

Re: numeric regression test passes, but why?

2018-01-10 Thread Tom Lane
Chapman Flack  writes:
> I see there are some tests in src/test/regress:
> [ that don't work for me ]
> Is there some special GUC setting in effect during the make check
> that would be different in my ordinary session? What else could
> be different? This is making me question my sanity.

Hm, it won't help your sanity to know that those cases pass fine
for me, interactively, on a couple of different machines:

$ psql postgres
psql (11devel)
Type "help" for help.

postgres=#  select 0.12 ^ (-25);
 ?column?  
---
 104825960103961013959336.4983657883169110
(1 row)

postgres=# select 0.5678 ^ (-85);
?column?

 782333637740774446257.7719390061997396
(1 row)

You sure you're using a stock build of Postgres?  No handmade
versions of operator ^ lying around?
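One quick way to check (a sketch; \do ^ in psql shows similar information):

    SELECT oprname, oprleft::regtype, oprright::regtype, oprcode
    FROM pg_operator
    WHERE oprname = '^' AND oprleft = 'numeric'::regtype;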

regards, tom lane



Re: Incorrect comment for expand_single_inheritance_child

2018-01-10 Thread Etsuro Fujita

(2018/01/09 23:46), Robert Haas wrote:

On Mon, Jan 8, 2018 at 10:37 PM, Etsuro Fujita
  wrote:

Attached is a patch for that.


Committed.


Thank you!

Best regards,
Etsuro Fujita



Re: TAP test module - PostgresClient

2018-01-10 Thread Kyotaro HORIGUCHI
Thank you for the discussion.

# I didn't notice that the license has been changed.

At Sat, 30 Dec 2017 14:35:27 -0500, Andrew Dunstan 
 wrote in 
<4ab7546e-dd48-c985-2b26-e98d58920...@2ndquadrant.com>
> 
> 
> On 12/30/2017 10:45 AM, Tom Lane wrote:
> > Andrew Dunstan  writes:
> >> As for out-dating, if we used DBD::PgPP we'd not be in great danger
> >> there - it doesn't appear to have changed for many years - latest
> >> version is dated 2010. If we were to use it we'd have a dependency on
> >> DBI, but that in itself doesn't seem a great burden.
> > [ blowing the dust off my old red fedora... ]  Actually, there's a
> > different problem with this proposal: you can bet that DBD::Pg has got a
> > build dependency on Postgres.  If Postgres starts to depend on DBD::Pg
> > then we've created circular-dependency hell for packagers.  
> 
> The Pure Perl driver has no such dependency, since it doesn't require
> libpq. But ...
> 
> > I much prefer the other line of thought about doing whatever we need
> > to do to make psql workable for the desired type of tests. 
> 
> ... agreed ...

The module is intended to perform multiple operations interactively
on a single session, or within a transaction, while running a test. We
need something persistent to keep that session open, and
PostgresClient is that something for TAP tests. If we wanted psql to
have such a feature, it would be something like a "psql server" or a
"reconnectable session" in the frontend protocol. Both seem like too
much, or likely to lead to something dangerous.

> >  Or just
> > write a bespoke testing tool.
> >
> > 
> 
> ... that's pretty much where we came in.

Agreed. And we can add any PostgreSQL-specific or test-specific
features to this.

regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center




Re: postgres_fdw: perform UPDATE/DELETE .. RETURNING on a join directly

2018-01-10 Thread Etsuro Fujita
(2017/12/27 20:55), Etsuro Fujita wrote:
> Attached is an updated version of the patch.

I revised code/comments a little bit.  PFA new version.

Best regards,
Etsuro Fujita
*** a/contrib/postgres_fdw/deparse.c
--- b/contrib/postgres_fdw/deparse.c
***
*** 132,138  static void deparseTargetList(StringInfo buf,
    Bitmapset *attrs_used,
    bool qualify_col,
    List **retrieved_attrs);
! static void deparseExplicitTargetList(List *tlist, List **retrieved_attrs,
  		  deparse_expr_cxt *context);
  static void deparseSubqueryTargetList(deparse_expr_cxt *context);
  static void deparseReturningList(StringInfo buf, PlannerInfo *root,
--- 132,140 
    Bitmapset *attrs_used,
    bool qualify_col,
    List **retrieved_attrs);
! static void deparseExplicitTargetList(List *tlist,
! 		  bool is_returning,
! 		  List **retrieved_attrs,
  		  deparse_expr_cxt *context);
  static void deparseSubqueryTargetList(deparse_expr_cxt *context);
  static void deparseReturningList(StringInfo buf, PlannerInfo *root,
***
*** 168,178  static void deparseLockingClause(deparse_expr_cxt *context);
  static void appendOrderByClause(List *pathkeys, deparse_expr_cxt *context);
  static void appendConditions(List *exprs, deparse_expr_cxt *context);
  static void deparseFromExprForRel(StringInfo buf, PlannerInfo *root,
! 	  RelOptInfo *joinrel, bool use_alias, List **params_list);
  static void deparseFromExpr(List *quals, deparse_expr_cxt *context);
  static void deparseRangeTblRef(StringInfo buf, PlannerInfo *root,
     RelOptInfo *foreignrel, bool make_subquery,
!    List **params_list);
  static void deparseAggref(Aggref *node, deparse_expr_cxt *context);
  static void appendGroupByClause(List *tlist, deparse_expr_cxt *context);
  static void appendAggOrderBy(List *orderList, List *targetList,
--- 170,182 
  static void appendOrderByClause(List *pathkeys, deparse_expr_cxt *context);
  static void appendConditions(List *exprs, deparse_expr_cxt *context);
  static void deparseFromExprForRel(StringInfo buf, PlannerInfo *root,
! 	  RelOptInfo *foreignrel, bool use_alias,
! 	  Index ignore_rel, List **ignore_conds,
! 	  List **params_list);
  static void deparseFromExpr(List *quals, deparse_expr_cxt *context);
  static void deparseRangeTblRef(StringInfo buf, PlannerInfo *root,
     RelOptInfo *foreignrel, bool make_subquery,
!    Index ignore_rel, List **ignore_conds, List **params_list);
  static void deparseAggref(Aggref *node, deparse_expr_cxt *context);
  static void appendGroupByClause(List *tlist, deparse_expr_cxt *context);
  static void appendAggOrderBy(List *orderList, List *targetList,
***
*** 1028,1034  deparseSelectSql(List *tlist, bool is_subquery, List **retrieved_attrs,
  		 * For a join or upper relation the input tlist gives the list of
  		 * columns required to be fetched from the foreign server.
  		 */
! 		deparseExplicitTargetList(tlist, retrieved_attrs, context);
  	}
  	else
  	{
--- 1032,1038 
  		 * For a join or upper relation the input tlist gives the list of
  		 * columns required to be fetched from the foreign server.
  		 */
! 		deparseExplicitTargetList(tlist, false, retrieved_attrs, context);
  	}
  	else
  	{
***
*** 1071,1077  deparseFromExpr(List *quals, deparse_expr_cxt *context)
  	appendStringInfoString(buf, " FROM ");
  	deparseFromExprForRel(buf, context->root, scanrel,
  		  (bms_num_members(scanrel->relids) > 1),
! 		  context->params_list);
  
  	/* Construct WHERE clause */
  	if (quals != NIL)
--- 1075,1081 
  	appendStringInfoString(buf, " FROM ");
  	deparseFromExprForRel(buf, context->root, scanrel,
  		  (bms_num_members(scanrel->relids) > 1),
! 		  (Index) 0, NULL, context->params_list);
  
  	/* Construct WHERE clause */
  	if (quals != NIL)
***
*** 1340,1348  get_jointype_name(JoinType jointype)
   *
   * retrieved_attrs is the list of continuously increasing integers starting
   * from 1. It has same number of entries as tlist.
   */
  static void
! deparseExplicitTargetList(List *tlist, List **retrieved_attrs,
  		  deparse_expr_cxt *context)
  {
  	ListCell   *lc;
--- 1344,1357 
   *
   * retrieved_attrs is the list of continuously increasing integers starting
   * from 1. It has same number of entries as tlist.
+  *
+  * This is used for both SELECT and RETURNING targetlists; the is_returning
+  * parameter is true only for a RETURNING targetlist.
   */
  static void
! deparseExplicitTargetList(List *tlist,
! 		  bool is_returning,
! 		  List **retrieved_attrs,
  		  deparse_expr_cxt *context)
  {
  	ListCell   *lc;
***
*** 1357,1369  deparseExplicitTargetList(List *tlist, List **retrieved_attrs,
  
  		if (i > 0)
  			appendStringInfoString(buf, ", ");
  		deparseExpr((Expr *) tle->expr, context);
  
  		*retrieved_attrs = lappend_int(*retrieved_attrs, i + 1);
  		i