Re: SPI_connect, SPI_connect_ext return type
Stepan Neretin writes:
>> This combines portions of Stepan's two patches with some additional
>> work (mostly, that he'd missed fixing any of contrib/).

> Thank you for the feedback! I believe the patch looks satisfactory.
> Should we await a review? The changes seem straightforward to me.

I too think it's good to go. If no one complains or asks for more time
to review, I will push it Monday or so.

			regards, tom lane
Re: Sort functions with specialized comparators
Hi! I rebase, clean and some refactor my patches.

Best regards, Stepan Neretin.

From f88cbb80e478d5ac3f23945b4ba0ee881f0d9cd4 Mon Sep 17 00:00:00 2001
From: "Andrey M. Borodin"
Date: Sun, 8 Sep 2024 15:43:39 +0700
Subject: [PATCH v2 01/10] Use specialized sort facilities

---
 contrib/intarray/_int.h      | 12
 contrib/intarray/_int_gist.c |  2 +-
 contrib/intarray/_int_op.c   | 19 +--
 contrib/intarray/_int_tool.c | 12
 src/include/port.h           |  2 ++
 src/port/qsort.c             | 35 +++
 6 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/contrib/intarray/_int.h b/contrib/intarray/_int.h
index 0352cbd368..5225c9090a 100644
--- a/contrib/intarray/_int.h
+++ b/contrib/intarray/_int.h
@@ -176,16 +176,4 @@ bool execconsistent(QUERYTYPE *query, ArrayType *array, bool calcnot);
 bool gin_bool_consistent(QUERYTYPE *query, bool *check);
 bool query_has_required_values(QUERYTYPE *query);
 
-int	compASC(const void *a, const void *b);
-int	compDESC(const void *a, const void *b);
-
-/* sort, either ascending or descending */
-#define QSORT(a, direction) \
-	do { \
-		int		_nelems_ = ARRNELEMS(a); \
-		if (_nelems_ > 1) \
-			qsort((void*) ARRPTR(a), _nelems_, sizeof(int32), \
-				  (direction) ? compASC : compDESC ); \
-	} while(0)
-
 #endif							/* ___INT_H__ */
diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c
index a09b7fa812..d39e40c66a 100644
--- a/contrib/intarray/_int_gist.c
+++ b/contrib/intarray/_int_gist.c
@@ -150,7 +150,7 @@ g_int_union(PG_FUNCTION_ARGS)
 			ptr += nel;
 		}
 
-		QSORT(res, 1);
+		sort_int32_asc(ARRPTR(res), ARRNELEMS(res));
 		res = _int_unique(res);
 		*size = VARSIZE(res);
 		PG_RETURN_POINTER(res);
diff --git a/contrib/intarray/_int_op.c b/contrib/intarray/_int_op.c
index 5b164f6788..34d3aa183f 100644
--- a/contrib/intarray/_int_op.c
+++ b/contrib/intarray/_int_op.c
@@ -198,7 +198,6 @@ sort(PG_FUNCTION_ARGS)
 	text	   *dirstr = (fcinfo->nargs == 2) ? PG_GETARG_TEXT_PP(1) : NULL;
 	int32		dc = (dirstr) ? VARSIZE_ANY_EXHDR(dirstr) : 0;
 	char	   *d = (dirstr) ? VARDATA_ANY(dirstr) : NULL;
-	int			dir = -1;
 
 	CHECKARRVALID(a);
 	if (ARRNELEMS(a) < 2)
@@ -208,18 +207,18 @@ sort(PG_FUNCTION_ARGS)
 		 && (d[0] == 'A' || d[0] == 'a')
 		 && (d[1] == 'S' || d[1] == 's')
 		 && (d[2] == 'C' || d[2] == 'c')))
-		dir = 1;
+		sort_int32_asc(ARRPTR(a), ARRNELEMS(a));
 	else if (dc == 4
 			 && (d[0] == 'D' || d[0] == 'd')
 			 && (d[1] == 'E' || d[1] == 'e')
 			 && (d[2] == 'S' || d[2] == 's')
 			 && (d[3] == 'C' || d[3] == 'c'))
-		dir = 0;
-	if (dir == -1)
+		sort_int32_desc(ARRPTR(a), ARRNELEMS(a));
+	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("second parameter must be \"ASC\" or \"DESC\"")));
-	QSORT(a, dir);
+
 	PG_RETURN_POINTER(a);
 }
 
@@ -229,7 +228,7 @@ sort_asc(PG_FUNCTION_ARGS)
 	ArrayType  *a = PG_GETARG_ARRAYTYPE_P_COPY(0);
 
 	CHECKARRVALID(a);
-	QSORT(a, 1);
+	sort_int32_asc(ARRPTR(a), ARRNELEMS(a));
 	PG_RETURN_POINTER(a);
 }
 
@@ -239,7 +238,7 @@ sort_desc(PG_FUNCTION_ARGS)
 	ArrayType  *a = PG_GETARG_ARRAYTYPE_P_COPY(0);
 
 	CHECKARRVALID(a);
-	QSORT(a, 0);
+	sort_int32_desc(ARRPTR(a), ARRNELEMS(a));
 	PG_RETURN_POINTER(a);
 }
 
@@ -381,7 +380,7 @@ intset_union_elem(PG_FUNCTION_ARGS)
 	result = intarray_add_elem(a, PG_GETARG_INT32(1));
 	PG_FREE_IF_COPY(a, 0);
-	QSORT(result, 1);
+	sort_int32_asc(ARRPTR(result), ARRNELEMS(result));
 	PG_RETURN_POINTER(_int_unique(result));
 }
 
@@ -403,10 +402,10 @@ intset_subtract(PG_FUNCTION_ARGS)
 	CHECKARRVALID(a);
 	CHECKARRVALID(b);
-	QSORT(a, 1);
+	sort_int32_asc(ARRPTR(a), ARRNELEMS(a));
 	a = _int_unique(a);
 	ca = ARRNELEMS(a);
-	QSORT(b, 1);
+	sort_int32_asc(ARRPTR(b), ARRNELEMS(b));
 	b = _int_unique(b);
 	cb = ARRNELEMS(b);
 	result = new_intArrayType(ca);
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index c85280c842..e83c6aadc6 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -393,15 +393,3 @@ int_to_intset(int32 elem)
 	aa[0] = elem;
 	return result;
 }
-
-int
-compASC(const void *a, const void *b)
-{
-	return pg_cmp_s32(*(const int32 *) a, *(const int32 *) b);
-}
-
-int
-compDESC(const void *a, const void *b)
-{
-	return pg_cmp_s32(*(const int32 *) b, *(const int32 *) a);
-}
diff --git a/src/include/port.h b/src/include/port.h
index ba9ab0d34f..b82f969fc5 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -443,6 +443,8 @@ extern char *strsep(char **stringp, const char *delim);
 extern void pg_qsort(void *base, size_t nel, size_t elsize,
 					 int (*cmp) (const void *, const void *));
 extern int	pg_qsort_strcmp(const void *a, const void *b);
+extern void sort_int32_asc(int32 *base, size_t nel);
+extern void sort_int32_desc(int32 *base, size_t nel);
 
 #define qsort(a,b,c,d) pg_qsort(a,b,c,d)
 
diff --git a/src/port/qsort.c b/src/port/qsort.c
index 7879e6cd56..5175c8a6dd 100644
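
(The body of the qsort.c hunk is truncated above. As a reading aid, here is a
hedged sketch of what the two helpers could look like if built on the existing
src/include/lib/sort_template.h machinery; the ST_* macros and pg_cmp_s32()
are real PostgreSQL APIs, but whether the patch actually defines the functions
this way is an assumption.)

/* Sketch only: a plausible src/port/qsort.c addition, not the patch's
 * actual contents.  sort_template.h generates a full qsort for the given
 * element type and comparator, then #undefs its parameter macros, so it
 * can be included twice with different settings. */
#include "c.h"
#include "common/int.h"			/* pg_cmp_s32() */

#define ST_SORT sort_int32_asc
#define ST_ELEMENT_TYPE int32
#define ST_COMPARE(a, b) pg_cmp_s32(*(a), *(b))
#define ST_SCOPE
#define ST_DEFINE
#include "lib/sort_template.h"

#define ST_SORT sort_int32_desc
#define ST_ELEMENT_TYPE int32
#define ST_COMPARE(a, b) pg_cmp_s32(*(b), *(a))
#define ST_SCOPE
#define ST_DEFINE
#include "lib/sort_template.h"

Call sites then read sort_int32_asc(ARRPTR(a), ARRNELEMS(a)), as in the
hunks above.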
Deadlock due to locking order violation while inserting into a leaf relation
Basically, when we are inserting into a leaf relation (or a lower-level
relation of a partitioned relation), we acquire the lock on the leaf
relation at parsing time, whereas the parent's lock is acquired during
generate_partition_qual(). Now, if we concurrently try to drop the
partition root, the drop will acquire the locks in the reverse order,
i.e. parent first and then child, so this can create a deadlock. The
example below reproduces this case.

Setup:
CREATE TABLE test(a int, b int) partition by range(a);
CREATE TABLE test1 partition of test for values from (1) to (10);

Test:
--Session1:
INSERT INTO test1 VALUES (1, 4);
-- let session 1 lock the relation test1 and make it wait before it
-- locks test (put a breakpoint in ExecInitModifyTable)

--Session2:
-- try to drop the top table, which will try to take AccessExclusive
-- lock on all partitions
DROP TABLE test;

--Session3:
-- look at pg_locks: we can see that session2 has locked the root table
-- test(16384) and is waiting on test1(16387), as session1 is holding
-- that lock

 locktype | database | relation |  pid  |        mode         | granted
----------+----------+----------+-------+---------------------+---------
 relation |        5 |    16387 | 30368 | RowExclusiveLock    | t
 relation |        5 |    16387 | 30410 | AccessExclusiveLock | f
 relation |        5 |    16384 | 30410 | AccessExclusiveLock | t
(11 rows)

--Session1: now, as soon as you continue in gdb in session 1, it will
hit the deadlock

ERROR:  40P01: deadlock detected
DETAIL:  Process 30368 waits for AccessShareLock on relation 16384 of
database 5; blocked by process 30410.
Process 30410 waits for AccessExclusiveLock on relation 16387 of
database 5; blocked by process 30368.
HINT:  See server log for query details.
LOCATION:  DeadLockReport, deadlock.c:1135

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com
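
(Not part of the report above, just to make the ordering rule concrete: a
hedged sketch of the usual cure for this class of deadlock, taking locks
root-first along the partition tree. get_partition_ancestors(),
LockRelationOid(), and the list API are real PostgreSQL functions; the
wrapper itself and where it would be called from are invented for
illustration, not a proposed patch.)

#include "postgres.h"
#include "catalog/partition.h"		/* get_partition_ancestors() */
#include "nodes/pg_list.h"
#include "storage/lmgr.h"			/* LockRelationOid() */

/*
 * Illustrative only: give every code path the same lock order, root
 * first, leaf last.  get_partition_ancestors() returns the nearest
 * ancestor first, so walk the list backwards to start at the root.
 */
static void
lock_partition_tree_top_down(Oid leafrelid, LOCKMODE lockmode)
{
	List	   *ancestors = get_partition_ancestors(leafrelid);

	for (int i = list_length(ancestors) - 1; i >= 0; i--)
		LockRelationOid(list_nth_oid(ancestors, i), lockmode);

	LockRelationOid(leafrelid, lockmode);	/* finally, the leaf itself */
	list_free(ancestors);
}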
Re: Sort functions with specialized comparators
On Sun, 8 Sept 2024 at 20:51, Stepan Neretin wrote:
> Hi! I rebase, clean and some refactor my patches.

I'm unsure what exactly is going on with this thread. It started with
Andrey proposing a patch to speed up intarray sorting and now it's
turned into you proposing 10 patches which implement a series of sort
specialisation functions without any justification as to why the change
is useful.

If you want to have a performance patch accepted, then you'll need to
show your test case and the performance results before and after. What
this patch series looks like to me is that you've just searched the
code base for qsort and just implemented a specialised qsort version
without any regard as to whether the change is useful or not.

For example, looking at v2-0006, you've added a specialisation to sort
the columns which are specified in the CREATE STATISTICS command. This
seems highly unlikely to be useful. The number of elements in this
array is limited by STATS_MAX_DIMENSIONS, which is 8. Are you claiming
the sort specialisation you've added makes a meaningful performance
improvement to sorting an 8 element array?

It looks to me like you've just derailed Andrey's proposal. I suggest
you validate which ones of these patches you can demonstrate produce a
meaningful performance improvement, ditch the remainder, and then start
your own thread showing your test case and results.

David
Re: [PoC] Add CANONICAL option to xmlserialize
The following review has been posted through the commitfest application:
make installcheck-world:  tested, failed
Implements feature:       tested, failed
Spec compliant:           tested, failed
Documentation:            tested, failed

LGTM

The new status of this patch is: Ready for Committer
Re: Introduce XID age and inactive timeout based replication slot invalidation
Hi,

Thanks for reviewing.

On Mon, Sep 2, 2024 at 1:37 PM Peter Smith wrote:
>
> Commit message.
>
> 1.
> Because such synced slots are typically considered not
> active (for them to be later considered as inactive) as they don't
> perform logical decoding to produce the changes.
>
> This sentence is bad grammar. The docs have the same wording, so
> please see my doc review comment #4 suggestion below.

+1

> 2.
> +
> +        Invalidates replication slots that are inactive for longer than
> +        specified amount of time. If this value is specified without units,
> +        it is taken as seconds. A value of zero (which is default) disables
> +        the timeout mechanism. This parameter can only be set in
> +        the postgresql.conf file or on the server
> +        command line.
> +
> +
>
> nit - This is OK as-is, but OTOH why not make the wording consistent
> with the previous GUC description? (e.g. see my v43 [1] #2 review
> comment)

+1.

> 3.
> +
> +        This invalidation check happens either when the slot is acquired
> +        for use or during checkpoint. The time since the slot has become
> +        inactive is known from its
> +        inactive_since value using which the
> +        timeout is measured.
> +
> +
>
> I felt this is slightly misleading because slot acquiring has nothing
> to do with setting the slot invalidation anymore. Furthermore, the 2nd
> sentence is bad grammar.
>
> nit - IMO something simple like the following rewording can address
> both of those points:
>
> Slot invalidation due to inactivity timeout occurs during checkpoint.
> The duration of slot inactivity is calculated using the slot's
> inactive_since field value.

+1.

> 4.
> +        Because such synced slots are typically considered not active
> +        (for them to be later considered as inactive) as they don't perform
> +        logical decoding to produce the changes.
>
> That sentence has bad grammar.
>
> nit - suggest a much simpler replacement:
> Synced slots are always considered to be inactive because they don't
> perform logical decoding to produce changes.

+1.

> 5.
> +#define IsInactiveTimeoutSlotInvalidationApplicable(s) \
>
> 5a.
> I felt this would be better implemented as an inline function. Then it
> can be commented on properly to explain the parts of the condition.
> e.g. the large comment currently in InvalidatePossiblyObsoleteSlot()
> would be more appropriate in this function.

+1.

> 5b.
> The name is very long. Can't it be something shorter/simpler like:
> 'IsSlotATimeoutCandidate()'
>
> ~~~

Missing "inactive" in the above suggested name. Used
SlotInactiveTimeoutCheckAllowed, similar to XLogInsertAllowed.

> 6. ReplicationSlotAcquire
>
> -ReplicationSlotAcquire(const char *name, bool nowait)
> +ReplicationSlotAcquire(const char *name, bool nowait,
> +                       bool check_for_invalidation)
>
> nit - Previously this new parameter really did mean to "check" for
> [and set the slot] invalidation. But now I suggest renaming it to
> 'error_if_invalid' to properly reflect the new usage. And also in the
> slot.h.

+1.

> 7.
> + /*
> +  * Error out if the slot has been invalidated previously. Because there's
> +  * no use in acquiring the invalidated slot.
> +  */
>
> nit - The comment is contrary to the code. If there was no reason to
> skip this error, then you would not have the new parameter allowing
> you to skip this error. I suggest just repeating the same comment as
> in the function header.

+1.

> 8. ReportSlotInvalidation
>
> nit - Added some blank lines for consistency.

+1.

> 9. InvalidatePossiblyObsoleteSlot
>
> 9a.
> Consistency is good (commit message, docs and code comments for this),
> but the added sentence has bad grammar. Please see the docs review
> comment #4 above for some alternate phrasing.

+1.

> 9b.
> Now that this logic is moved into a macro (I suggested it should be an
> inline function) IMO this comment does not belong here anymore because
> it is commenting code that you cannot see. Instead, this comment (or
> something like it) should be as comments within the new function.
>
> ==
> src/include/replication/slot.h

+1.

> 10.
> +extern void ReplicationSlotAcquire(const char *name, bool nowait,
> +                                   bool check_for_invalidation);
>
> Change the new param name as described in the earlier review comment.

+1.

> Please refer to the attached file which implements some of the nits
> mentioned above.

Merged the diff into v45. Thanks.

On Tue, Sep 3, 2024 at 12:26 PM Peter Smith wrote:
>
> TEST CASE #1
>
> 1.
> +# Wait for the inactive replication slot to be invalidated.
>
> Is that comment correct? IIUC the synced slot should *already* be
> invalidated from the primary, so here we are not really "waiting" for
> it to be invalidated; Instead, we are just "confirming" that the
> synchronized slot is already invalidated with the correct reason as
> expected.

Modified the comment.

> 2.
> +# Synced slot mustn't get invalidated on the stan
Re: Introduce XID age and inactive timeout based replication slot invalidation
Hi,

Thanks for reviewing.

On Tue, Sep 3, 2024 at 3:01 PM shveta malik wrote:
>
> 1)
> I see that ReplicationSlotAlter() will error out if the slot is
> invalidated due to timeout. I have not tested it myself, but do you
> know if slot-alter errors out for other invalidation causes as well?
> Just wanted to confirm that the behaviour is consistent for all
> invalidation causes.

Will respond to Amit's comment soon.

> 2)
> When a slot is invalidated, and we try to use that slot, it gives this msg:
>
> ERROR: can no longer get changes from replication slot "mysubnew1_2"
> DETAIL: The slot became invalid because it was inactive since
> 2024-09-03 14:23:34.094067+05:30, which is more than 600 seconds ago.
> HINT: You might need to increase "replication_slot_inactive_timeout.".
>
> Isn't HINT misleading? Even if we increase it now, the slot can not be
> reused again.
>
> Below is one side effect if inactive_since keeps on changing:
>
> postgres=# SELECT * FROM pg_replication_slot_advance('mysubnew1_1',
> pg_current_wal_lsn());
> ERROR: can no longer get changes from replication slot "mysubnew1_1"
> DETAIL: The slot became invalid because it was inactive since
> 2024-09-04 10:03:56.68053+05:30, which is more than 10 seconds ago.
> HINT: You might need to increase "replication_slot_inactive_timeout.".
>
> postgres=# select now();
>               now
> ---------------------------------
>  2024-09-04 10:04:00.26564+05:30
>
> 'DETAIL' gives wrong information, we are not past 10-seconds. This is
> because inactive_since got updated even in the ERROR scenario.
>
> ERROR: can no longer get changes from replication slot "mysubnew1_1"
> DETAIL: The slot became invalid because it was inactive since
> 2024-09-04 10:06:38.980939+05:30, which is more than 129600 seconds
> ago.
> postgres=# select now();
>               now
> ----------------------------------
>  2024-09-04 10:07:35.201894+05:30
>
> I feel we should change this message itself.

Removed the hint and corrected the detail message as follows:

errmsg("can no longer get changes from replication slot \"%s\"",
       NameStr(s->data.name)),
errdetail("This slot has been invalidated because it was inactive for
longer than the amount of time specified by \"%s\".",
          "replication_slot_inactive_timeout.")));

> 3)
> When the slot is invalidated, the 'inactive_since' still keeps on
> changing when there is a subscriber trying to start replication
> continuously. I think ReplicationSlotAcquire() keeps on failing and
> thus Release keeps on setting it again and again. Shouldn't we stop
> setting/changing 'inactive_since' once the slot is invalidated
> already, otherwise it will be misleading.
>
> postgres=# select failover,synced,inactive_since,invalidation_reason
> from pg_replication_slots;
>
> failover | synced | inactive_since      | invalidation_reason
> ---------+--------+---------------------+---------------------
> t        | f      | 2024-09-03 14:23:.. | inactive_timeout
>
> after sometime:
> failover | synced | inactive_since      | invalidation_reason
> ---------+--------+---------------------+---------------------
> t        | f      | 2024-09-03 14:26:.. | inactive_timeout

Changed it to not update inactive_since for slots invalidated due to
inactive timeout.

> 4)
> src/sgml/config.sgml:
>
> 4a)
> + A value of zero (which is default) disables the timeout mechanism.
>
> Better will be:
> A value of zero (which is default) disables the inactive timeout
> invalidation mechanism.

Changed.

> 4b)
> 'synced' and inactive_since should point to pg_replication_slots:
>
> example:
> linkend="view-pg-replication-slots">pg_replication_slots.synced

Modified.

> 5)
> src/sgml/system-views.sgml:
> + ..the slot has been inactive for longer than the duration specified
> by replication_slot_inactive_timeout parameter.
>
> Better to have:
> ..the slot has been inactive for a time longer than the duration
> specified by the replication_slot_inactive_timeout parameter.

Changed it to the following, to be consistent with config.sgml:
inactive_timeout means that the slot has been inactive for longer than
the amount of time specified by the parameter.

Please find the v45 patch posted upthread at
https://www.postgresql.org/message-id/CALj2ACWXQT3_HY40ceqKf1DadjLQP6b1r%3D0sZRh-xhAOd-b0pA%40mail.gmail.com
for the changes.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Re: Sort functions with specialized comparators
Hi, why do you think that I rejected Andrey's offer? I included his
patch first in my own. Yes, patch 2-0006 is the only patch to which I
have not attached any statistics and it looks really dubious. But the
rest seem useful. Above, I attached a speed graph for one of the
patches and tps (pgbench).

What do you think is the format in which to make benchmarks for my
patches?

Best regards, Stepan Neretin.
Re: Add RESPECT/IGNORE NULLS and FROM FIRST/LAST options
On 9/7/24 22:25, Oliver Ford wrote:
> On Sat, May 6, 2023 at 9:41 AM Oliver Ford wrote:
>> On Sat, 6 May 2023, 04:57 Tatsuo Ishii, wrote:
>>> Attached is the patch to implement this (on top of your patch).
>>>
>>> test=# SELECT row_number() RESPECT NULLS OVER () FROM (SELECT 1) AS s;
>>> ERROR: window function row_number cannot have RESPECT NULLS or IGNORE NULLS
>>
>> The last time this was discussed
>> (https://www.postgresql.org/message-id/1037735.1610402426%40sss.pgh.pa.us)
>> it was suggested to make the feature generalizable, beyond what the
>> standard says it should be limited to. With it generalizable, there
>> would need to be extra checks for custom functions, such as if they
>> allow multiple column arguments (which I'll add in v2 of the patch if
>> the design's accepted). So I think we need a consensus on whether to
>> stick to limiting it to several specific functions, or making it
>> generalized yet agreeing the rules to limit it (such as no agg
>> functions, and no functions with multiple column arguments).
>
> Reviving this thread, I've attached a rebased patch with code, docs,
> and tests and added it to the November commitfest.

Excellent! One of these days we'll get this in. :-)

I have a problem with this test, though:

SELECT sum(orbit) RESPECT NULLS OVER () FROM planets; -- succeeds

Why should that succeed? Especially since aggregates such as SUM() will
ignore nulls! The error message on its partner seems to confirm this:

SELECT sum(orbit) IGNORE NULLS OVER () FROM planets; -- fails
ERROR: aggregate functions do not accept RESPECT/IGNORE NULLS

I believe they should both fail.
--
Vik Fearing
Re: [PoC] Add CANONICAL option to xmlserialize
On Sun, 2024-09-08 at 11:43 +, Oliver Ford wrote:
> The following review has been posted through the commitfest application:
> make installcheck-world:  tested, failed
> Implements feature:       tested, failed
> Spec compliant:           tested, failed
> Documentation:            tested, failed
>
> LGTM
>
> The new status of this patch is: Ready for Committer

Huh?  Do you mean "tested, passes"?

Yours,
Laurenz Albe
Re: [PoC] Add CANONICAL option to xmlserialize
On Sun, Sep 8, 2024 at 2:44 PM Laurenz Albe wrote:
>
> On Sun, 2024-09-08 at 11:43 +, Oliver Ford wrote:
> > The following review has been posted through the commitfest application:
> > make installcheck-world:  tested, failed
> > Implements feature:       tested, failed
> > Spec compliant:           tested, failed
> > Documentation:            tested, failed
> >
> > LGTM
> >
> > The new status of this patch is: Ready for Committer
>
> Huh?  Do you mean "tested, passes"?
>
> Yours,
> Laurenz Albe

Whoops, yes all tests and docs pass!
Re: Yet another way for pg_ctl stop to fail on Windows
07.09.2024 21:11, Noah Misch wrote:
>>> Noah, what do you think of handling this error in line with handling of
>>> ERROR_BROKEN_PIPE and ERROR_BAD_PIPE (which was done in 0ea1f2a3a)?
>>>
>>> I tried the following change:
>>>     switch (GetLastError())
>>>     {
>>>         case ERROR_BROKEN_PIPE:
>>>         case ERROR_BAD_PIPE:
>>> +       case ERROR_PIPE_BUSY:
>>> and saw no issues.
>>
>> That would be a strict improvement over returning EINVAL like we do
>> today. We do use PIPE_UNLIMITED_INSTANCES, so I expect the causes of
>> ERROR_PIPE_BUSY are process exit and ENOMEM-like situations. While that
>> change is the best thing if the process is exiting, it could silently
>> drop the signal in ENOMEM-like situations. Consider the following
>> alternative. If sig==0, just return 0 like you propose, because the
>> process isn't completely gone. Otherwise, sleep and retry the signal,
>> like pgwin32_open_handle() retries after certain errors. What do you
>> think of that?

Thank you for your attention to the issue!

I agree with your approach. It looks like Microsoft recommends to loop
on ERROR_PIPE_BUSY: [1] (they say "Calling CallNamedPipe is equivalent
to calling the CreateFile ..." at [2]). So if we aim to not only fix
"pg_ctl stop", but to make pgkill() robust, it's the way to go, IMHO.
I'm not sure about an infinite loop they show, I'd vote for a loop
with the same characteristics as in pgwin32_open_handle().

I've managed to trigger ERROR_PIPE_BUSY with "pg_ctl reload", when
running 20 TAP tests (see attached) in parallel (with 20 threads
executing $node->reload() in each), and with the kill() call inside
do_reload() repeated x100 (see the modification patch attached too):
...
# Running: pg_ctl -D .../099_test_pgkill\data/t_099_test_pgkill_node_data/pgdata reload
### Reloading node "node"
# Running: pg_ctl -D .../099_test_pgkill\data/t_099_test_pgkill_node_data/pgdata reload
[13:41:46.850](2.400s) # 18
server signaled
server signaled
server signaled
server signaled
server signaled
server signaled
server signaled
server signaled
!!!pgkill| GetLastError(): 231
pg_ctl: could not send reload signal (PID: 3912), iteration: 81, res: -1, errno: 22: Invalid argument
server signaled
[13:41:52.594](5.744s) # 19
...

[1] https://learn.microsoft.com/en-us/windows/win32/ipc/named-pipe-client
[2] https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-callnamedpipea

Best regards,
Alexander

diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index e7e878c22f..cab6eb9472 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1167,12 +1174,17 @@ do_reload(void)
 		exit(1);
 	}
 
-	if (kill(pid, sig) != 0)
+	for (int i = 1; i <= 100; i++)
+	{
+		int		res, en;
+
+	if ((res = kill(pid, sig)) != 0)
 	{
-		write_stderr(_("%s: could not send reload signal (PID: %d): %m\n"),
-					 progname, (int) pid);
+		en = errno;
+		write_stderr(_("%s: could not send reload signal (PID: %d), iteration: %d, res: %d, errno: %d: %m\n"),
+					 progname, (int) pid, i, res, en);
 		exit(1);
 	}
+	}
 
 	print_msg(_("server signaled\n"));
 }
diff --git a/src/port/kill.c b/src/port/kill.c
index 412c2f19c1..b3d2c00796 100644
--- a/src/port/kill.c
+++ b/src/port/kill.c
@@ -70,6 +70,7 @@ pgkill(int pid, int sig)
 		return 0;
 	}
 
+	int			le = GetLastError();
 	switch (GetLastError())
 	{
 		case ERROR_BROKEN_PIPE:
@@ -89,6 +90,7 @@ pgkill(int pid, int sig)
 			errno = EPERM;
 			return -1;
 		default:
+			fprintf(stderr, "!!!pgkill| GetLastError(): %d\n", le);
 			errno = EINVAL;		/* unexpected */
 			return -1;
 	}

099_test_pgkill.pl
Description: Perl program
Re: [PoC] Add CANONICAL option to xmlserialize
Hi Oliver

On 08.09.24 15:56, Oliver Ford wrote:
> Whoops, yes all tests and docs pass!

Thanks for the review!

Best, Jim
Re: Yet another way for pg_ctl stop to fail on Windows
On Sun, Sep 08, 2024 at 06:00:00PM +0300, Alexander Lakhin wrote:
> 07.09.2024 21:11, Noah Misch wrote:
>>> Noah, what do you think of handling this error in line with handling of
>>> ERROR_BROKEN_PIPE and ERROR_BAD_PIPE (which was done in 0ea1f2a3a)?
>>>
>>> I tried the following change:
>>>     switch (GetLastError())
>>>     {
>>>         case ERROR_BROKEN_PIPE:
>>>         case ERROR_BAD_PIPE:
>>> +       case ERROR_PIPE_BUSY:
>>> and saw no issues.
>>
>> That would be a strict improvement over returning EINVAL like we do
>> today. We do use PIPE_UNLIMITED_INSTANCES, so I expect the causes of
>> ERROR_PIPE_BUSY are process exit and ENOMEM-like situations. While that
>> change is the best thing if the process is exiting, it could silently
>> drop the signal in ENOMEM-like situations. Consider the following
>> alternative. If sig==0, just return 0 like you propose, because the
>> process isn't completely gone. Otherwise, sleep and retry the signal,
>> like pgwin32_open_handle() retries after certain errors. What do you
>> think of that?
>
> I agree with your approach. It looks like Microsoft recommends to loop
> on ERROR_PIPE_BUSY: [1] (they say "Calling CallNamedPipe is equivalent
> to calling the CreateFile ..." at [2]).

I see Microsoft suggests WaitNamedPipeA() as opposed to just polling.
WaitNamedPipeA() should be more responsive. Given how rare this has
been, it likely doesn't matter whether we use WaitNamedPipeA() or
polling. I'd lean toward whichever makes the code simpler, probably
polling.

> So if we aim to not only fix "pg_ctl stop", but to make pgkill() robust,
> it's the way to go, IMHO. I'm not sure about an infinite loop they show,
> I'd vote for a loop with the same characteristics as in
> pgwin32_open_handle().

I agree with bounding the total time of each kill(), like
pgwin32_open_handle() does for open().

> [1] https://learn.microsoft.com/en-us/windows/win32/ipc/named-pipe-client
> [2] https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-callnamedpipea
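
(To make the agreed direction concrete, a hedged sketch of such a bounded
retry. The helper pgkill_send_signal() is an invented stand-in for
pgkill()'s existing CallNamedPipe()-based send, and the 30 x 100 ms budget
merely imitates pgwin32_open_handle(); none of this is a proposed patch.)

#include <windows.h>
#include <errno.h>

/* hypothetical: wraps the existing CallNamedPipe() signal send */
extern bool pgkill_send_signal(int pid, int sig);

static int
pgkill_with_retry(int pid, int sig)
{
	for (int retries = 0;; retries++)
	{
		if (pgkill_send_signal(pid, sig))
			return 0;

		if (GetLastError() != ERROR_PIPE_BUSY)
			break;				/* fall through to ordinary error mapping */

		if (sig == 0)
			return 0;			/* pipe exists, so the process does too */

		if (retries >= 30)
		{
			errno = EINVAL;
			return -1;			/* give up after roughly 3 seconds */
		}
		Sleep(100);				/* then poll again */
	}

	errno = EINVAL;				/* placeholder for the existing mapping */
	return -1;
}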
libpq: Process buffered SSL read bytes to support records >8kB on async API
Dear hackers,

I'm the maintainer of ruby-pg, the Ruby interface to the PostgreSQL
database. This binding uses the asynchronous API of libpq by default to
facilitate the Ruby IO wait and scheduling mechanisms. This works well
with the vanilla PostgreSQL server, but it leads to starvation with
other types of servers using the PostgreSQL wire protocol 3. This is
because the current functioning of the libpq async interface depends on
a maximum size of SSL records of 8kB.

The following servers were reported to starve with ruby-pg:

* AWS RDS Aurora Serverless [1]
* YugabyteDB [2]
* CockroachDB [3]

They block infinitely on certain message sizes sent from the backend to
the libpq frontend. It is best described in [4]. A repro docker
composition is provided by YugabyteDB at [2].

To fix this issue the attached patch calls pqReadData() repeatedly in
PQconsumeInput() until there is no buffered SSL data left to be read.
Another solution could be to process buffered SSL read bytes in
PQisBusy() instead of PQconsumeInput().

The synchronous libpq API isn't affected, since it supports arbitrary
SSL record sizes already. That's why I think that the asynchronous API
should also support bigger SSL record sizes.

Regards, Lars

[1] https://github.com/ged/ruby-pg/issues/325
[2] https://github.com/ged/ruby-pg/issues/588
[3] https://github.com/ged/ruby-pg/issues/583
[4] https://github.com/ged/ruby-pg/issues/325#issuecomment-737561270

From ab793829a4ce473f1cc2bbc0e2a6f6753553255d Mon Sep 17 00:00:00 2001
From: Lars Kanis
Date: Sun, 8 Sep 2024 13:59:05 +0200
Subject: [PATCH] libpq: Process buffered SSL read bytes to support records
 >8kB on async API

The async API of libpq doesn't support SSL record sizes >8kB so far.
This size isn't exceeded by vanilla PostgreSQL, but by other products
using the postgres wire protocol 3. PQconsumeInput() reads all data
readable from the socket, so that the read condition is cleared. But it
doesn't process all the data that is pending on the SSL layer. Also a
subsequent call to PQisBusy() doesn't process it, so that the client is
triggered to wait for more readable data on the socket. But this never
arrives, so that the connection blocks infinitely.

To fix this issue call pqReadData() repeatedly until there is no
buffered SSL data left to be read.

The synchronous libpq API isn't affected, since it supports arbitrary
SSL record sizes already.
---
 src/interfaces/libpq/fe-exec.c | 13
 1 file changed, 13 insertions(+)

diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 0d224a852..637894ee1 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -2006,6 +2006,19 @@ PQconsumeInput(PGconn *conn)
 	if (pqReadData(conn) < 0)
 		return 0;
 
+#ifdef USE_SSL
+	/*
+	 * Ensure all buffered read bytes in the SSL library are processed,
+	 * which might be not the case, if the SSL record size exceeds 8k.
+	 * Otherwise parseInput can't process the data.
+	 */
+	while (conn->ssl_in_use && pgtls_read_pending(conn))
+	{
+		if (pqReadData(conn) < 0)
+			return 0;
+	}
+#endif
+
 	/* Parsing of the data waits till later. */
 	return 1;
 }
-- 
2.43.0
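
(For context, a minimal sketch of the standard nonblocking read loop an
async libpq client runs; all libpq calls below are real API, while the
harness around them is invented for illustration. If PQconsumeInput()
leaves decrypted-but-unparsed bytes inside the SSL library, PQisBusy()
stays true while select() never fires again, which is the starvation
described above.)

#include <stdio.h>
#include <sys/select.h>
#include <libpq-fe.h>

static void
drain_results(PGconn *conn)
{
	while (PQisBusy(conn))
	{
		int			sock = PQsocket(conn);
		fd_set		input_mask;

		FD_ZERO(&input_mask);
		FD_SET(sock, &input_mask);

		/* wait for the socket to become readable */
		if (select(sock + 1, &input_mask, NULL, NULL, NULL) < 0)
			return;

		if (!PQconsumeInput(conn))	/* read what the kernel has buffered */
			return;
	}

	for (PGresult *res = PQgetResult(conn); res != NULL;
		 res = PQgetResult(conn))
	{
		/* process each result */
		printf("status: %s\n", PQresStatus(PQresultStatus(res)));
		PQclear(res);
	}
}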
Re: CI, macports, darwin version problems
On Sat, Aug 3, 2024 at 12:07 AM Joe Conway wrote:
> I tried making this run like a service using launchctl, but that was
> giving the permissions errors. I finally gave up trying to figure it
> out and just accepted that I need to manually start the script
> whenever I reboot the mac.

It seems to be unhappy recently:

Persistent worker failed to start the task: tart isolation failed:
failed to create VM cloned from "ghcr.io/cirruslabs/macos-runner:sonoma":
tart command returned non-zero exit code: "tart/VMStorageOCI.swift:5:
Fatal error: 'try!' expression unexpectedly raised an error: Error
Domain=NSCocoaErrorDomain Code=512 \"The file “ci-run” couldn’t be
saved in the folder “Users”.\" UserInfo={NSFilePath=/Users/ci-run,
NSUnderlyingError=0x619f0720 {Error Domain=NSPOSIXErrorDomain Code=20
\"Not a directory\"}}"
Re: Sort functions with specialized comparators
On Mon, 9 Sept 2024 at 01:00, Stepan Neretin wrote:
> Hi, why do you think that I rejected Andrey's offer? I included his
> patch first in my own. Yes, patch 2-0006 is the only patch to which I
> have not attached any statistics and it looks really dubious. But the
> rest seem useful. Above, I attached a speed graph for one of the
> patches and tps (pgbench)

The difference with your patches and Andrey's patch is that he included
a benchmark which is targeted to the code he changed and his results
show a speed-up. What it appears that you've done is made an assortment
of changes and picked the least effort thing that tests performance of
something. You by chance saw a performance increase so assumed it was
due to your changes.

> What do you think is the format in which to make benchmarks for my patches?

You'll need a benchmark that exercises the code you've changed to some
degree where it has a positive impact on performance. As far as I can
see, you've not done that yet.

Just to give you the benefit of the doubt, I applied all 10 v2 patches
and adjusted the call sites to add a NOTICE to include the size of the
array being sorted. Here is the result of running your benchmark:

$ pgbench -t1000 -d postgres
pgbench (18devel)
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 3
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 2
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 2
NOTICE:  RelationGetStatExtList 0
starting vacuum...NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetIndexList 0
end.
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
transaction type:
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
number of transactions per client: 1000
number of transactions actually processed: 1000/1000
number of failed transactions: 0 (0.000%)
latency average = 0.915 ms
initial connection time = 23.443 ms
tps = 1092.326732 (without initial connection time)

Note that -t1000 shows the same number of notices as -t1. So, it seems
everything you've changed that runs in your benchmark is
RelationGetIndexList() and RelationGetStatExtList(). In one of the
calls to RelationGetIndexList() we're sorting up to a maximum of 3
elements.

Just to be clear, I'm not stating that I think all of your changes are
useless. If you want these patches accepted, then you're going to need
to prove they're useful and you've not done that. Also, unless Andrey
is happy for you to tag onto the work he's doing, I'd suggest another
thread for that work. I don't think there's any good reason for that
work to delay Andrey's work.

David
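
(The instrumentation described above is presumably a one-line elog at each
adjusted call site, something like the following guess; the exact placement
isn't shown in the thread. list_length(), elog(), list_sort(), and
list_oid_cmp are real PostgreSQL APIs.)

/* Hypothetical debug aid, not part of any posted patch: report how many
 * OIDs RelationGetIndexList() is about to sort. */
elog(NOTICE, "RelationGetIndexList %d", list_length(result));
list_sort(result, list_oid_cmp);	/* existing sort of the index OID list */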
Test improvements and minor code fixes for formatting.c.
Over in the thread about teaching to_number() to handle Roman numerals
[1], it was noted that we currently have precisely zero test coverage
for to_char()'s existing Roman-numeral code, except in the timestamp
code path, which shares nothing with the numeric code path.

In looking at this, I found that there's also no test coverage for the
EEEE, V, or PL format codes. Also, the possibility of overflow while
converting an input value to int in order to pass it to int_to_roman
was ignored. Attached is a patch that adds more test coverage and
cleans up the Roman-numeral code a little bit.

BTW, I also discovered that there is a little bit of support for a "B"
format code: we can parse it, but then we ignore it. And it's not
documented. Oracle defines this code as a flag that:

	Returns blanks for the integer part of a fixed-point number when the
	integer part is zero (regardless of zeros in the format model).

It doesn't seem super hard to implement that, but given the complete
lack of complaints about it being missing, maybe we should just rip out
the incomplete support instead?

			regards, tom lane

[1] https://www.postgresql.org/message-id/flat/CAMWA6ybh4M1VQqpmnu2tfSwO%2B3gAPeA8YKnMHVADeB%3DXDEvT_A%40mail.gmail.com

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 461fc3f437..e2d45989d8 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8739,6 +8739,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         n is the number of digits following
         V. V with
         to_number divides in a similar manner.
+        The V can be thought of as marking the position
+        of an implicit decimal point in the input or output string.
         to_char and to_number
         do not support the use of V combined with a
         decimal point
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68fa89418f..116d79ae11 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -5189,6 +5189,11 @@ NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree)
 }
 
 
+/*
+ * Convert integer to Roman numerals
+ * Result is upper-case and not blank-padded (NUM_processor converts as needed)
+ * If input is out-of-range, produce '###'
+ */
 static char *
 int_to_roman(int number)
 {
@@ -5201,32 +5206,42 @@ int_to_roman(int number)
 	result = (char *) palloc(16);
 	*result = '\0';
 
+	/*
+	 * This range limit is the same as in Oracle(TM). The difficulty with
+	 * handling 4000 or more is that we'd need to use more than 3 "M"'s, and
+	 * more than 3 of the same digit isn't considered a valid Roman string.
+	 */
 	if (number > 3999 || number < 1)
 	{
 		fill_str(result, '#', 15);
 		return result;
 	}
+
+	/* Convert to decimal, then examine each digit */
 	len = snprintf(numstr, sizeof(numstr), "%d", number);
+	Assert(len > 0 && len <= 4);
 
 	for (p = numstr; *p != '\0'; p++, --len)
 	{
 		num = *p - ('0' + 1);
 		if (num < 0)
-			continue;
-
-		if (len > 3)
+			continue;			/* ignore zeroes */
+		/* switch on current column position */
+		switch (len)
 		{
-			while (num-- != -1)
-				strcat(result, "M");
-		}
-		else
-		{
-			if (len == 3)
+			case 4:
+				while (num-- >= 0)
+					strcat(result, "M");
+				break;
+			case 3:
 				strcat(result, rm100[num]);
-			else if (len == 2)
+				break;
+			case 2:
 				strcat(result, rm10[num]);
-			else if (len == 1)
+				break;
+			case 1:
 				strcat(result, rm1[num]);
+				break;
 		}
 	}
 	return result;
@@ -6367,7 +6382,6 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	char	   *numstr,
 			   *orgnum,
 			   *p;
-	Numeric		x;
 
 	NUM_TOCHAR_prepare;
 
@@ -6376,12 +6390,15 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		x = DatumGetNumeric(DirectFunctionCall2(numeric_round,
-												NumericGetDatum(value),
-												Int32GetDatum(0)));
-		numstr =
-			int_to_roman(DatumGetInt32(DirectFunctionCall1(numeric_int4,
-														   NumericGetDatum(x))));
+		int32		intvalue;
+		bool		err;
+
+		/* Round and convert to int */
+		intvalue = numeric_int4_opt_error(value, &err);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (err)
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
 	}
 	else if (IS_EEEE(&Num))
 	{
@@ -6421,6 +6438,7 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	{
 		int			numstr_pre_len;
 		Numeric		val = value;
+		Numeric		x;
 
 		if (IS_MULTI(&Num))
 		{
@@ -6589,12 +6607,18 @@ int8_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	/*
-	 * On DateType depend part (int32)
+	 * On DateType depend part (int64)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		/* Currently don't support int8 conversion to roman... */
-		numstr = int_to_roman(DatumGetInt32(DirectFunctionCall1(int84, Int64GetDatum(value))));
+		int32		intvalue;
+
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (value <= PG_INT32_MAX && value >= PG_INT32_MIN)
Re: [PATCH] Add roman support for to_number function
I wrote:
> * Further to Aleksander's point about lack of test coverage for
> the to_char direction, I see from
> https://coverage.postgresql.org/src/backend/utils/adt/formatting.c.gcov.html
> that almost none of the existing roman-number code paths are covered
> today. While it's not strictly within the charter of this patch
> to improve that, maybe we should try to do so --- at the very
> least it'd raise formatting.c's coverage score a few notches.

For the archives' sake: I created a patch and a separate discussion
thread [1] for that.

			regards, tom lane

[1] https://www.postgresql.org/message-id/flat/2956175.1725831...@sss.pgh.pa.us
Re: Test improvements and minor code fixes for formatting.c.
I wrote:
> [ v1-formatting-test-improvements.patch ]

Meh ... cfbot points out I did the float-to-int conversions wrong.
v2 attached.

			regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 461fc3f437..e2d45989d8 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8739,6 +8739,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         n is the number of digits following
         V. V with
         to_number divides in a similar manner.
+        The V can be thought of as marking the position
+        of an implicit decimal point in the input or output string.
         to_char and to_number
         do not support the use of V combined with a
         decimal point
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68fa89418f..85a7dd4561 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -5189,6 +5189,11 @@ NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree)
 }
 
 
+/*
+ * Convert integer to Roman numerals
+ * Result is upper-case and not blank-padded (NUM_processor converts as needed)
+ * If input is out-of-range, produce '###'
+ */
 static char *
 int_to_roman(int number)
 {
@@ -5201,32 +5206,42 @@ int_to_roman(int number)
 	result = (char *) palloc(16);
 	*result = '\0';
 
+	/*
+	 * This range limit is the same as in Oracle(TM). The difficulty with
+	 * handling 4000 or more is that we'd need to use more than 3 "M"'s, and
+	 * more than 3 of the same digit isn't considered a valid Roman string.
+	 */
 	if (number > 3999 || number < 1)
 	{
 		fill_str(result, '#', 15);
 		return result;
 	}
+
+	/* Convert to decimal, then examine each digit */
 	len = snprintf(numstr, sizeof(numstr), "%d", number);
+	Assert(len > 0 && len <= 4);
 
 	for (p = numstr; *p != '\0'; p++, --len)
 	{
 		num = *p - ('0' + 1);
 		if (num < 0)
-			continue;
-
-		if (len > 3)
+			continue;			/* ignore zeroes */
+		/* switch on current column position */
+		switch (len)
 		{
-			while (num-- != -1)
-				strcat(result, "M");
-		}
-		else
-		{
-			if (len == 3)
+			case 4:
+				while (num-- >= 0)
+					strcat(result, "M");
+				break;
+			case 3:
 				strcat(result, rm100[num]);
-			else if (len == 2)
+				break;
+			case 2:
 				strcat(result, rm10[num]);
-			else if (len == 1)
+				break;
+			case 1:
 				strcat(result, rm1[num]);
+				break;
 		}
 	}
 	return result;
@@ -6367,7 +6382,6 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	char	   *numstr,
 			   *orgnum,
 			   *p;
-	Numeric		x;
 
 	NUM_TOCHAR_prepare;
 
@@ -6376,12 +6390,15 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		x = DatumGetNumeric(DirectFunctionCall2(numeric_round,
-												NumericGetDatum(value),
-												Int32GetDatum(0)));
-		numstr =
-			int_to_roman(DatumGetInt32(DirectFunctionCall1(numeric_int4,
-														   NumericGetDatum(x))));
+		int32		intvalue;
+		bool		err;
+
+		/* Round and convert to int */
+		intvalue = numeric_int4_opt_error(value, &err);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (err)
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
 	}
 	else if (IS_EEEE(&Num))
 	{
@@ -6421,6 +6438,7 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	{
 		int			numstr_pre_len;
 		Numeric		val = value;
+		Numeric		x;
 
 		if (IS_MULTI(&Num))
 		{
@@ -6589,12 +6607,18 @@ int8_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	/*
-	 * On DateType depend part (int32)
+	 * On DateType depend part (int64)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		/* Currently don't support int8 conversion to roman... */
-		numstr = int_to_roman(DatumGetInt32(DirectFunctionCall1(int84, Int64GetDatum(value))));
+		int32		intvalue;
+
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (value <= PG_INT32_MAX && value >= PG_INT32_MIN)
+			intvalue = (int32) value;
+		else
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
 	}
 	else if (IS_EEEE(&Num))
 	{
@@ -6695,7 +6719,18 @@ float4_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	if (IS_ROMAN(&Num))
-		numstr = int_to_roman((int) rint(value));
+	{
+		int32		intvalue;
+
+		/* See notes in ftoi4() */
+		value = rint(value);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (!isnan(value) && FLOAT4_FITS_IN_INT32(value))
+			intvalue = (int32) value;
+		else
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
+	}
 	else if (IS_EEEE(&Num))
 	{
 		if (isnan(value) || isinf(value))
@@ -6797,7 +6832,18 @@ float8_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	if (IS_ROMAN(&Num))
-		numstr = int_to_roman((int) rint(value));
+	{
+		int32		intvalue;
+
+		/* See notes in dtoi4() */
+		value = rint(value);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (!isnan(value) && FLOAT8_FITS_IN_INT32(value))
+			intvalue = (int32) value;
+		else
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalu
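
(As a sanity check on the rewritten digit-column logic, here is a hedged
standalone re-creation of it; the digit tables and test harness are written
from scratch for illustration, and this is not the formatting.c code, which
also handles palloc and NUM_processor concerns.)

#include <assert.h>
#include <stdio.h>
#include <string.h>

static const char *const rm1[]   = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"};
static const char *const rm10[]  = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC"};
static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM"};

/* Standalone analogue of the patched int_to_roman(): walk the decimal
 * digits left to right and emit one Roman "column" per digit. */
static void
int_to_roman(int number, char *result)
{
	char		numstr[12];
	int			len;

	result[0] = '\0';
	if (number > 3999 || number < 1)
	{
		strcpy(result, "###############");	/* out of range, as in the patch */
		return;
	}

	len = snprintf(numstr, sizeof(numstr), "%d", number);
	assert(len > 0 && len <= 4);

	for (char *p = numstr; *p != '\0'; p++, --len)
	{
		int			num = *p - ('0' + 1);

		if (num < 0)
			continue;			/* ignore zeroes */
		switch (len)			/* current column position */
		{
			case 4:
				while (num-- >= 0)
					strcat(result, "M");
				break;
			case 3:
				strcat(result, rm100[num]);
				break;
			case 2:
				strcat(result, rm10[num]);
				break;
			case 1:
				strcat(result, rm1[num]);
				break;
		}
	}
}

int
main(void)
{
	char		buf[32];

	int_to_roman(1994, buf);	/* expect MCMXCIV */
	printf("1994 -> %s\n", buf);
	int_to_roman(3999, buf);	/* expect MMMCMXCIX, the upper bound */
	printf("3999 -> %s\n", buf);
	int_to_roman(4000, buf);	/* out of range: expect all '#' */
	printf("4000 -> %s\n", buf);
	return 0;
}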
Re: PATCH: Issue with set_indexsafe_procflags in ReindexRelationConcurrently
On Fri, Sep 06, 2024 at 01:27:12PM +0200, Michail Nikolaev wrote:
> While working on [1], I have found a small issue with correctness of
> set_indexsafe_procflags usage in ReindexRelationConcurrently introduced
> in [2].
>
> > idx->safe = (indexRel->rd_indexprs == NIL && indexRel->rd_indpred == NIL);
>
> It is always true because there are no RelationGetIndexExpressions
> and RelationGetIndexPredicate before that check.
>
> Two patches with reproducer + fix are attached.
>
> The issue is simple, but I'll register this in commitfest just in case.

Ugh. It means that we've always considered as "safe" concurrent
rebuilds of indexes that have expressions or predicates, but they're
not safe at all. Other concurrent jobs should wait for them. Adding
these two calls as you are suggesting is probably a good idea anyway to
force a correct setup of the flag. Will see about that.

I don't understand why an isolation test is required here if we were to
add a validity test, because you can cause a failure in the REINDEX
with a set of SQLs in a single session. I'm OK to add a test, perhaps
with a NOTICE set when the safe flag is true. Anyway, what you have is
more than enough to prove your point. Thanks for that.
--
Michael
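
(For readers not opening the attachments, the suggested fix is presumably
of this shape; a sketch under that assumption, not the actual attached
patch. rd_indexprs and rd_indpred are lazily-built relcache caches, so
they must be forced to populate before being tested.)

/* Sketch of the fix inside ReindexRelationConcurrently(); surrounding
 * code abbreviated.  RelationGetIndexExpressions() and
 * RelationGetIndexPredicate() fill the rd_indexprs/rd_indpred caches as
 * a side effect, making the NIL tests below meaningful. */
(void) RelationGetIndexExpressions(indexRel);
(void) RelationGetIndexPredicate(indexRel);
idx->safe = (indexRel->rd_indexprs == NIL &&
			 indexRel->rd_indpred == NIL);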
Re: Jargon and acronyms on this mailing list
On Sat, 31 Aug 2024 at 04:02, Greg Sabino Mullane wrote:
> I normally wouldn't mention my blog entries here, but this one was
> about the hackers mailing list, so wanted to let people know about it
> in case you don't follow Planet Postgres. I scanned the last year's
> worth of posts and gathered the most used acronyms and jargon. The
> most commonly used acronym was IMO (in my opinion), followed by FWIW
> (for what it's worth), and IIUC (if I understand correctly). The
> complete list can be found in the post below, I'll refrain from
> copying everything here.
>
> https://www.crunchydata.com/blog/understanding-the-postgres-hackers-mailing-list

I think this is useful. Thanks for posting.

I think HEAD is commonly misused to mean master instead of the latest
commit of the current branch. I see the buildfarm even does that.
Thanks for getting that right in your blog post.

David
Re: Jargon and acronyms on this mailing list
> I normally wouldn't mention my blog entries here, but this one was
> about the hackers mailing list, so wanted to let people know about it
> in case you don't follow Planet Postgres. I scanned the last year's
> worth of posts and gathered the most used acronyms and jargon. The
> most commonly used acronym was IMO (in my opinion), followed by FWIW
> (for what it's worth), and IIUC (if I understand correctly). The
> complete list can be found in the post below, I'll refrain from
> copying everything here.
>
> https://www.crunchydata.com/blog/understanding-the-postgres-hackers-mailing-list

Thank you for the excellent article. I think it is very useful for
non-native English speakers like me.

Best regards,
--
Tatsuo Ishii
SRA OSS K.K.
English: http://www.sraoss.co.jp/index_en/
Japanese: http://www.sraoss.co.jp
Re: ANALYZE ONLY
On Sun, 1 Sept 2024 at 13:32, Michael Harris wrote:
> v3 of the patch is attached, and I will submit it to the commitfest.

I think this patch is pretty good. I only have a few things I'd point out:

1. The change to ddl.sgml: technically you're removing the caveat, but
I think the choice you've made to mention the updated behaviour is
likely a good idea still. So I agree with this change, but just wanted
to mention it as someone else might think otherwise.

2. I think there's some mixing of tense in the changes to analyze.sgml:

"If ONLY was not specified, it will also recurse into each partition
and update its statistics."

You've written "was" (past tense), but then the existing text uses
"will" (future tense). I guess if the point in time is after parse and
before work has been done, then that's correct, but I think using "is"
instead of "was" is better.

3. In vacuum.sgml:

"If ONLY is not specified, the table and all its descendant tables or
partitions (if any) are vacuumed"

Maybe "are also vacuumed" instead of "are vacuumed" is more clear? It's
certainly true for inheritance parents, but I guess you could argue
that's wrong for partitioned tables.

4. A very minor detail, but I think in vacuum.c the WARNING you've
added should use RelationGetRelationName(). We seem to be very
inconsistent with using that macro and I see it's not used just above
for the lock warning, which I imagine you copied.

Aside from those, that just leaves me with the behavioural change. I
noted Tom was ok with the change in behaviour for ANALYZE (mentioned in
[1]). Tom, wondering if you feel the same for VACUUM too? If we're
doing this, I think we'd need to be quite clear about it in the release
notes.

David

[1] https://postgr.es/m/1919201.1724289...@sss.pgh.pa.us
Re: not null constraints, again
On Sat, Aug 31, 2024 at 11:59 AM Alvaro Herrera wrote:
>
> Hello
>
> Here I present another attempt at making not-null constraints be
> catalogued. This is largely based on the code reverted at 9ce04b50e120,
> except that we now have a not-null constraint automatically created for
> every column of a primary key, and such constraint cannot be removed
> while the PK exists. Thanks to this, a lot of rather ugly code is gone,
> both in pg_dump and in backend -- in particular the handling of NO
> INHERIT, which was needed for pg_dump.
>
> Noteworthy psql difference: because there are now even more not-null
> constraints than before, the \d+ display would be far too noisy if we
> just let it grow. So here I've made it omit any constraints that
> underlie the primary key. This should be OK since you can't do much
> with those constraints while the PK is still there. If you drop the PK,
> the next \d+ will show those constraints.

hi. my brief review.

create table t1(a int, b int, c int not null, primary key(a, b));
\d+ t1
ERROR:  operator is not unique: smallint[] <@ smallint[]
LINE 8: coalesce(NOT ARRAY[at.attnum] <@ (SELECT conkey FROM pg_cata...
                                      ^
HINT:  Could not choose a best candidate operator. You might need to
add explicit type casts.

the regression test still passed, i have no idea why. anyway, the
following changes make the above ERROR disappear. also seems more lean.

printfPQExpBuffer(&buf,
/* FIXME the coalesce trick looks silly. What's a better way? */
"SELECT co.conname, at.attname, co.connoinherit, co.conislocal,\n"
"co.coninhcount <> 0\n"
"FROM pg_catalog.pg_constraint co JOIN\n"
"pg_catalog.pg_attribute at ON\n"
"(at.attrelid = co.conrelid AND at.attnum = co.conkey[1])\n"
"WHERE co.contype = 'n' AND\n"
"co.conrelid = '%s'::pg_catalog.regclass AND\n"
"coalesce(NOT ARRAY[at.attnum] <@ (SELECT conkey FROM pg_catalog.pg_constraint\n"
" WHERE contype = 'p' AND conrelid = '%s'::regclass), true)\n"
"ORDER BY at.attnum",
oid, oid);

change to

printfPQExpBuffer(&buf,
"SELECT co.conname, at.attname, co.connoinherit, co.conislocal,\n"
"co.coninhcount <> 0\n"
"FROM pg_catalog.pg_constraint co JOIN\n"
"pg_catalog.pg_attribute at ON\n"
"(at.attrelid = co.conrelid AND at.attnum = co.conkey[1])\n"
"WHERE co.contype = 'n' AND\n"
"co.conrelid = '%s'::pg_catalog.regclass AND\n"
"NOT EXISTS (SELECT 1 FROM pg_catalog.pg_constraint co1 where co1.contype = 'p'\n"
"AND at.attnum = any(co1.conkey) AND co1.conrelid = '%s'::pg_catalog.regclass)\n"
"ORDER BY at.attnum",
oid, oid);

steal idea from https://stackoverflow.com/a/75614278/15603477

create type comptype as (r float8, i float8);
create domain dcomptype1 as comptype not null no inherit;
with cte as (
    SELECT oid, conrelid::regclass, conname
    FROM pg_catalog.pg_constraint
    where contypid in ('dcomptype1'::regtype))
select pg_get_constraintdef(oid) from cte;

current output is NOT NULL, but it's not the same as:

CREATE TABLE ATACC1 (a int, not null a no inherit);
with cte as (
    SELECT oid, conrelid::regclass, conname
    FROM pg_catalog.pg_constraint
    where conrelid in ('ATACC1'::regclass))
select pg_get_constraintdef(oid) from cte;

NOT NULL a NO INHERIT

i'm not really sure of the meaning of "no inherit" in "create domain
dcomptype1 as comptype not null no inherit;"

bold idea: print out the constraint name: violates not-null constraint
\"%s\" for the following code:

ereport(ERROR,
        (errcode(ERRCODE_NOT_NULL_VIOLATION),
         errmsg("null value in column \"%s\" of relation \"%s\" violates not-null constraint",
                NameStr(att->attname),
                RelationGetRelationName(orig_rel)),
         val_desc ? errdetail("Failing row contains %s.", val_desc) : 0,
         errtablecol(orig_rel, attrChk)));

in extractNotNullColumn we can Assert(colnum > 0);

create table t3(a int, b int, c int, not null a, not null c, not null b, not null tableoid);
this should not be allowed?

foreach(lc, RelationGetNotNullConstraints(RelationGetRelid(relation), false))
{
    AlterTableCmd *atsubcmd;

    atsubcmd = makeNode(AlterTableCmd);
    atsubcmd->subtype = AT_AddConstraint;
    atsubcmd->def = (Node *) lfirst_node(Constraint, lc);
    atsubcmds = lappend(atsubcmds, atsubcmd);
}

forgive me for being hypocritical. I
Re: CI, macports, darwin version problems
On 9/8/24 16:55, Thomas Munro wrote:
> On Sat, Aug 3, 2024 at 12:07 AM Joe Conway wrote:
>> I tried making this run like a service using launchctl, but that was
>> giving the permissions errors. I finally gave up trying to figure it
>> out and just accepted that I need to manually start the script
>> whenever I reboot the mac.
>
> It seems to be unhappy recently:
>
> Persistent worker failed to start the task: tart isolation failed:
> failed to create VM cloned from "ghcr.io/cirruslabs/macos-runner:sonoma":
> tart command returned non-zero exit code: "tart/VMStorageOCI.swift:5:
> Fatal error: 'try!' expression unexpectedly raised an error: Error
> Domain=NSCocoaErrorDomain Code=512 \"The file “ci-run” couldn’t be
> saved in the folder “Users”.\" UserInfo={NSFilePath=/Users/ci-run,
> NSUnderlyingError=0x619f0720 {Error Domain=NSPOSIXErrorDomain Code=20
> \"Not a directory\"}}"

Seems the mounted drive got unmounted somehow ¯\_(ツ)_/¯

Please check it out and let me know if it is working properly now.

--
Joe Conway
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Fix possible memory leak in rescanLatestTimeLine()
In rescanLatestTimeLine(), if a new target timeline is found,
expectedTLEs is replaced with newExpectedTLEs, which is newly allocated
by readTimeLineHistory(), and the old expectedTLEs is released using
list_free_deep(). However, if the current timeline is not part of the
history of the new timeline, the function returns without using
newExpectedTLEs, nor releasing it. I wonder if this is a memory leak,
and think it is better to release it, although the effect may not be
much. I've attached the patch.

Regards,
Yugo Nagata

--
Yugo Nagata

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 178491f6f5..ec518d35e8 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -4157,6 +4157,7 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
 				(errmsg("new timeline %u is not a child of database system timeline %u",
 						newtarget, replayTLI)));
+		list_free_deep(newExpectedTLEs);
 		return false;
 	}
 
@@ -4172,6 +4173,7 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
 						newtarget, replayTLI,
 						LSN_FORMAT_ARGS(replayLSN))));
+		list_free_deep(newExpectedTLEs);
 		return false;
 	}
Re: DOCS - pg_replication_slot . Fix the 'inactive_since' description
Saying "The time..." is fine, but the suggestions given seem backwards to me: - The time this slot was inactivated - The time when the slot became inactive. - The time when the slot was deactivated. e.g. It is not like light switch. So, there is no moment when the active slot "became inactive" or "was deactivated". Rather, the 'inactive_since' timestamp field is simply: - The time the slot was last active. - The last time the slot was active. == Kind Regards, Peter Smith. Fujitsu Australia
Re: Add callbacks for fixed-numbered stats flush in pgstats
On Wed, Sep 04, 2024 at 06:37:01PM +0900, Kyotaro Horiguchi wrote:
> I doubt that overprotection is always better, but in this case, it's
> not overprotection at all. The flush callbacks are called
> unconditionally once we decide to flush anything. Sorry for the noise.

Yes, it's intended to be a fastpath, with checks happening if we have
no data pending for any variable-numbered stats.
--
Michael
Re: DOCS - pg_replication_slot . Fix the 'inactive_since' description
On Sun, Sep 8, 2024, 18:55 Peter Smith wrote:
> Saying "The time..." is fine, but the suggestions given seem backwards
> to me:
> - The time this slot was inactivated
> - The time when the slot became inactive.
> - The time when the slot was deactivated.
>
> e.g. It is not like light switch. So, there is no moment when the
> active slot "became inactive" or "was deactivated".

While this is plausible, the existing wording and the name of the field
definitely fail to convey this.

> Rather, the 'inactive_since' timestamp field is simply:
> - The time the slot was last active.
> - The last time the slot was active.

I see your point, but that wording is also quite confusing when an
active slot returns null for this field.

At this point I'm confused enough to need whatever wording is taken to
be supported by someone explaining the code that interacts with this
field. I suppose I'm expecting something like: The time the last
activity finished, or null if an activity is in-progress.

David J.
Re: Add callbacks for fixed-numbered stats flush in pgstats
On Wed, Sep 04, 2024 at 06:27:43AM +, Bertrand Drouvot wrote:
> Agree. The idea was to add an additional parameter (say "check_only") to the
> flush_fixed_cb. If this parameter is set to true then the flush_fixed_cb would
> do nothing (no flush at all) but return a boolean that would indicate if there
> are pending stats. In case it returns false, then we could avoid the clock
> check.

Hmm. It's the same thing in terms of logic, but I am not really convinced that it is a good idea to mix a code path for a sanity check with a code path dealing with the action, particularly considering that there is the nowait option to consider in the flush callback, which would require one to reason about four possible states rather than two. That's more error-prone for extension developers, IMO. In the end, I have used my original suggestions for the callbacks. If there are voices in favor of something different, we still have a good chunk of the development and beta cycle for that.
--
Michael

signature.asc Description: PGP signature
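For illustration, here is a hypothetical sketch of the two callback shapes being weighed; the typedef names are invented for this example and are not the actual pgstats API:

/*
 * Option discussed above: one callback doing double duty via a
 * check_only flag.  Combined with nowait, callback authors must reason
 * about four (check_only x nowait) states.
 */
typedef bool (*flush_fixed_cb_combined) (bool check_only, bool nowait);

/*
 * Alternative kept in the end, per the message above: two
 * single-purpose callbacks.  The pending check stays a cheap,
 * side-effect-free fastpath; the flush callback only flushes.
 */
typedef bool (*have_fixed_pending_cb) (void);	/* any stats to flush? */
typedef bool (*flush_fixed_cb) (bool nowait);	/* perform the flush */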
Re: PostgreSQL 17 release announcement draft
On 9/7/24 6:58 PM, David Rowley wrote: On Sun, 8 Sept 2024 at 06:44, Jonathan S. Katz wrote: I've attached the latest copy. Is "This release expands on functionality both for managing data in partitions" still relevant given the MERGE/SPLIT PARTITION was reverted [1]? AFAICT yes, as identity columns and exclusion constraints can now be used. It looks like there are a few items with the optimizer (which looking at the release notes, you'd be aware of :) but unsure if those should be added. Timing wise, we're rapidly approaching the Sep 9 12:00 UTC cutoff; if there's no additional feedback, once I wake up I'll do one more pass over the text and then freeze it. Thanks, Jonathan OpenPGP_signature.asc Description: OpenPGP digital signature
Re: DOCS - pg_replication_slot . Fix the 'inactive_since' description
On Mon, Sep 9, 2024 at 12:20 PM David G. Johnston wrote: > > > > On Sun, Sep 8, 2024, 18:55 Peter Smith wrote: >> >> Saying "The time..." is fine, but the suggestions given seem backwards to me: >> - The time this slot was inactivated >> - The time when the slot became inactive. >> - The time when the slot was deactivated. >> >> e.g. It is not like light switch. So, there is no moment when the >> active slot "became inactive" or "was deactivated". > > > While this is plausible the existing wording and the name of the field > definitely fail to convey this. > >> >> Rather, the 'inactive_since' timestamp field is simply: >> - The time the slot was last active. >> - The last time the slot was active. > > > I see your point but that wording is also quite confusing when an active slot > returns null for this field. > > At this point I'm confused enough to need whatever wording is taken to be > supported by someone explaining the code that interacts with this field. > Me too. I created this thread primarily to get the description changed to clarify this field represents a moment in time, rather than a duration. So I will be happy with any wording that addresses that. > I suppose I'm expecting something like: The time the last activity finished, > or null if an activity is in-progress. == Kind Regards, Peter Smith. Fujitsu Australia
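For anyone following along, the field in question can be inspected directly; a quick illustrative query (the view and columns are in core, the interpretation is what this thread is debating):

SELECT slot_name, active, inactive_since
FROM pg_replication_slots;

-- inactive_since is NULL while the slot is active, which is why wordings
-- like "the last time the slot was active" read oddly for active slots.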
Re: Remove emode argument from XLogFileRead/XLogFileReadAnyTLI
On Fri, Sep 06, 2024 at 08:10:43PM +0900, Yugo Nagata wrote:
> Since 1bb2558046c, XLogFileRead() doesn't use the emode argument.
> Also, since abf5c5c9a4f, XLogFileReadAnyTLI() is called just once
> and emode is always DEBUG2. So, I think we can remove the emode
> argument from these functions. I've attached the patch.

It's true that the last relevant caller of XLogFileReadAnyTLI() that required an emode is abf5c5c9a4f1; as you say, that's also what I am tracking down. Any objections to that?
--
Michael

signature.asc Description: PGP signature
Re: Jargon and acronyms on this mailing list
David Rowley writes: > I think HEAD is commonly misused to mean master instead of the latest > commit of the current branch. I see the buildfarm even does that. > Thanks for getting that right in your blog post. IIRC, HEAD *was* the technically correct term back when we were using CVS. Old habits die hard. regards, tom lane
Re: Jargon and acronyms on this mailing list
On Sun, Sep 08, 2024 at 11:35:36PM -0400, Tom Lane wrote:
> David Rowley writes:
>> I think HEAD is commonly misused to mean master instead of the latest
>> commit of the current branch. I see the buildfarm even does that.
>> Thanks for getting that right in your blog post.
>
> IIRC, HEAD *was* the technically correct term back when we were
> using CVS. Old habits die hard.

Even if it's a new habit from a new technology. I began getting involved with upstream the year the move from CVS to git happened, and just inherited this habit from everybody else ;)
--
Michael

signature.asc Description: PGP signature
Re: Introduce XID age and inactive timeout based replication slot invalidation
On Thu, Sep 5, 2024 at 9:30 AM Amit Kapila wrote:
>
> On Wed, Sep 4, 2024 at 9:17 AM shveta malik wrote:
> >
> > On Tue, Sep 3, 2024 at 3:01 PM shveta malik wrote:
> > >
> > > 1)
> > > I see that ReplicationSlotAlter() will error out if the slot is
> > > invalidated due to timeout. I have not tested it myself, but do you
> > > know if slot-alter errors out for other invalidation causes as well?
> > > Just wanted to confirm that the behaviour is consistent for all
> > > invalidation causes.
> >
> > I was able to test this and, as anticipated, the behaviour is different.
> > When a slot is invalidated due to, say, 'wal_removed', I am still able
> > to do 'alter' of that slot. Please see:
> >
> > Pub:
> >   slot_name  | failover | synced |          inactive_since          | invalidation_reason
> > -------------+----------+--------+----------------------------------+---------------------
> >  mysubnew1_1 | t        | f      | 2024-09-04 08:58:12.802278+05:30 | wal_removed
> >
> > Sub:
> > newdb1=# alter subscription mysubnew1_1 disable;
> > ALTER SUBSCRIPTION
> >
> > newdb1=# alter subscription mysubnew1_1 set (failover=false);
> > ALTER SUBSCRIPTION
> >
> > Pub: (failover altered)
> >   slot_name  | failover | synced |          inactive_since          | invalidation_reason
> > -------------+----------+--------+----------------------------------+---------------------
> >  mysubnew1_1 | f        | f      | 2024-09-04 08:58:47.824471+05:30 | wal_removed
> >
> > while when the invalidation_reason is 'inactive_timeout', it fails:
> >
> > Pub:
> >   slot_name  | failover | synced |          inactive_since          | invalidation_reason
> > -------------+----------+--------+----------------------------------+---------------------
> >  mysubnew1_1 | t        | f      | 2024-09-03 14:30:57.532206+05:30 | inactive_timeout
> >
> > Sub:
> > newdb1=# alter subscription mysubnew1_1 disable;
> > ALTER SUBSCRIPTION
> >
> > newdb1=# alter subscription mysubnew1_1 set (failover=false);
> > ERROR: could not alter replication slot "mysubnew1_1": ERROR: can no
> > longer get changes from replication slot "mysubnew1_1"
> > DETAIL: The slot became invalid because it was inactive since
> > 2024-09-04 08:54:20.308996+05:30, which is more than 0 seconds ago.
> > HINT: You might need to increase "replication_slot_inactive_timeout.".
> >
> > I think the behaviour should be the same.
>
> We should not allow the invalid replication slot to be altered
> irrespective of the reason unless there is any benefit.

Okay, then I think we need to change the existing behaviour of the other invalidation causes which still allow alter-slot.

thanks
Shveta
Re: On disable_cost
On Fri, Sep 6, 2024 at 5:27 PM Richard Guo wrote:
> On Fri, Sep 6, 2024 at 5:00 PM Alexander Lakhin wrote:
> > static void
> > label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
> > {
> > ...
> > cost_sort(&sort_path, root, NIL,
> >           lefttree->total_cost,
> >           plan->plan.disabled_nodes,
> >           lefttree->plan_rows,
> >           lefttree->plan_width,
> >           0.0,
> >           work_mem,
> >           limit_tuples);
> >
> > Given the cost_sort() declaration:
> > void
> > cost_sort(Path *path, PlannerInfo *root,
> >           List *pathkeys, int input_disabled_nodes,
> >           Cost input_cost, double tuples, int width,
> >           Cost comparison_cost, int sort_mem,
> >           double limit_tuples)
> >
> > Aren't the input_disabled_nodes and input_cost arguments swapped in the
> > above call?

Nice catch! I checked the other callers of cost_sort, and they are all good. Fixed.

Thanks
Richard
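A minimal, self-contained sketch of why this class of mistake slips past the compiler: when adjacent parameters have arithmetic types (here int and Cost, i.e. double), swapped arguments are silently converted rather than rejected. The names below are illustrative, not PostgreSQL's:

#include <stdio.h>

typedef double Cost;

/* Mirrors the shape of cost_sort(): an int parameter followed by a Cost. */
static void
cost_sketch(int input_disabled_nodes, Cost input_cost)
{
	printf("disabled_nodes=%d input_cost=%g\n",
		   input_disabled_nodes, input_cost);
}

int
main(void)
{
	Cost	total_cost = 1234.56;	/* e.g. lefttree->total_cost */
	int		disabled_nodes = 1;		/* e.g. plan->plan.disabled_nodes */

	/*
	 * Swapped call: compiles cleanly under default warning options (only
	 * -Wconversion would flag it), but truncates 1234.56 to 1234 and
	 * promotes 1 to 1.0.
	 */
	cost_sketch(total_cost, disabled_nodes);

	/* Correct argument order. */
	cost_sketch(disabled_nodes, total_cost);
	return 0;
}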
Re: Sort functions with specialized comparators
> On 9 Sep 2024, at 02:31, David Rowley wrote:
>
> Also, unless Andrey is happy for you to tag onto the work he's doing,
> I'd suggest another thread for that work. I don't think there's any
> good reason for that work to delay Andrey's work.

Stepan asked for a mentoring project, so I handed him this patch set. We are working together, but the main goal is integrating Stepan into the dev process. Well, the summer was really hot and we somehow were not advancing the project… So your thread bump is very timely! Many thanks for your input about benchmarks! We will focus on measuring the impact of the changes. I totally share your concerns about optimization of sorts that are not used frequently.

Best regards, Andrey Borodin.
Re: Introduce XID age and inactive timeout based replication slot invalidation
Hi, On Mon, Sep 9, 2024 at 9:17 AM shveta malik wrote: > > > We should not allow the invalid replication slot to be altered > > irrespective of the reason unless there is any benefit. > > Okay, then I think we need to change the existing behaviour of the > other invalidation causes which still allow alter-slot. +1. Perhaps, track it in a separate thread? -- Bharath Rupireddy PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
Re: Introduce XID age and inactive timeout based replication slot invalidation
On Mon, Sep 9, 2024 at 10:26 AM Bharath Rupireddy wrote: > > Hi, > > On Mon, Sep 9, 2024 at 9:17 AM shveta malik wrote: > > > > > We should not allow the invalid replication slot to be altered > > > irrespective of the reason unless there is any benefit. > > > > Okay, then I think we need to change the existing behaviour of the > > other invalidation causes which still allow alter-slot. > > +1. Perhaps, track it in a separate thread? I think so. It does not come under the scope of this thread. thanks Shveta
Re: long-standing data loss bug in initial sync of logical replication
On Mon, 2 Sept 2024 at 10:12, Amit Kapila wrote:
>
> On Fri, Aug 30, 2024 at 3:06 PM Shlok Kyal wrote:
> >
> > Next I am planning to test solely on the logical decoding side and
> > will share the results.
>
> Thanks, the next set of proposed tests makes sense to me. It will also
> be useful to generate some worst-case scenarios where the number of
> invalidations is more to see the distribution cost in such cases. For
> example, Truncate/Drop a table with 100 or 1000 partitions.
>
> --
> With Regards,
> Amit Kapila.

Hi,

I did some performance testing solely on the logical decoding side and found some performance degradation, for the following test case:
1. Created a publisher on a single table, say 'tab_conc1'.
2. Created a second publisher on a single table, say 'tp'.
3. Two sessions are running in parallel, say S1 and S2.
4. Begin a transaction in S1.
5. Now in a loop (this loop runs 'count' times):
   S1: Insert a row in table 'tab_conc1'
   S2: BEGIN; ALTER PUBLICATION DROP/ADD tp; COMMIT
6. COMMIT the transaction in S1.
7. Run 'pg_logical_slot_get_binary_changes' to get the decoding changes.

Observation:
With the fix, a new entry is added in decoding. During debugging I found that this entry only appears when we do an INSERT in session S1 after an ALTER PUBLICATION in the other session in parallel (i.e., due to the invalidation). I also observed that this new entry is related to sending the replica identity, attributes, etc., as the function 'logicalrep_write_rel' is called.

Performance:
We see a performance degradation because we are sending new entries during logical decoding. Results are an average of 5 runs.

count | Head (sec) | Fix (sec) | Degradation (%)
------+------------+-----------+----------------
1     | 1.298      | 1.574     | 21.26348228
5     | 22.892     | 24.997    | 9.195352088
10    | 88.602     | 93.759    | 5.820410374

I have also attached the test script here; a rough SQL sketch of the loop follows below.

Thanks and Regards,
Shlok Kyal

test2.pl Description: Binary data
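A rough, illustrative SQL transcript of the two-session loop above (publication, table, and slot names are assumptions; the real driver is the attached test2.pl):

-- One-time setup:
CREATE PUBLICATION pub1 FOR TABLE tab_conc1;
CREATE PUBLICATION pub2 FOR TABLE tp;
SELECT pg_create_logical_replication_slot('slot1', 'pgoutput');

-- Session S1:
BEGIN;
INSERT INTO tab_conc1 VALUES (1);

-- Session S2, one loop iteration (repeated 'count' times, interleaved
-- with S1's inserts):
BEGIN;
ALTER PUBLICATION pub2 DROP TABLE tp;
COMMIT;
BEGIN;
ALTER PUBLICATION pub2 ADD TABLE tp;
COMMIT;

-- Session S1, after the loop:
COMMIT;

-- Measure the decoding cost:
SELECT count(*)
FROM pg_logical_slot_get_binary_changes('slot1', NULL, NULL,
       'proto_version', '4', 'publication_names', 'pub1');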
Re: long-standing data loss bug in initial sync of logical replication
On Mon, 2 Sept 2024 at 10:12, Amit Kapila wrote:
>
> On Fri, Aug 30, 2024 at 3:06 PM Shlok Kyal wrote:
> >
> > Next I am planning to test solely on the logical decoding side and
> > will share the results.
>
> Thanks, the next set of proposed tests makes sense to me. It will also
> be useful to generate some worst-case scenarios where the number of
> invalidations is more to see the distribution cost in such cases. For
> example, Truncate/Drop a table with 100 or 1000 partitions.
>
> --
> With Regards,
> Amit Kapila.

I also did testing with a partitioned table, to cover the worst-case scenario where the number of invalidations distributed is large. The test case:
1. Created a publisher on a single table, say 'tconc_1'.
2. Created a second publisher on a partitioned table, say 'tp'.
3. Created 'tcount' partitions for the table 'tp'.
4. Two sessions are running in parallel, say S1 and S2.
5. Begin a transaction in S1.
6. S1: Insert a row in table 'tconc_1'
   S2: BEGIN; TRUNCATE TABLE tp; COMMIT;
   (With the patch, this adds 'tcount * 3' invalidation messages to the transaction in session S1.)
   S1: Insert a row in table 'tconc_1'
7. COMMIT the transaction in S1.
8. Run 'pg_logical_slot_get_binary_changes' to get the decoding changes.

Performance:
We see a degradation in performance. Results are an average of 5 runs.

count of partitions | Head (sec) | Fix (sec) | Degradation (%)
--------------------+------------+-----------+----------------
1000                | 0.114      | 0.118     | 3.50877193
5000                | 0.502      | 0.522     | 3.984063745
10000               | 1.012      | 1.024     | 1.185770751

I have also attached the test script here, and will do further testing.

Thanks and Regards,
Shlok Kyal

test3.pl Description: Binary data
Re: Introduce XID age and inactive timeout based replication slot invalidation
On Sun, Sep 8, 2024 at 5:25 PM Bharath Rupireddy wrote:
>
> Please find the v45 patch. Addressed above and Shveta's review comments [1].

Thanks for the patch. Please find my comments:

1) src/sgml/config.sgml:
+ Synced slots are always considered to be inactive because they don't perform logical decoding to produce changes.

It is better to avoid such a statement, as internally we use logical decoding to advance the restart LSN; see 'LogicalSlotAdvanceAndCheckSnapState' called from slotsync.c.

2) src/sgml/config.sgml:
+ disables the inactive timeout invalidation mechanism
+ Slot invalidation due to inactivity timeout occurs during checkpoint.

Use either 'inactive' in both places or 'inactivity'.

3) slot.c:
+static bool InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
+                                           ReplicationSlot *s,
+                                           XLogRecPtr oldestLSN,
+                                           Oid dboid,
+                                           TransactionId snapshotConflictHorizon,
+                                           bool *invalidated);
+static inline bool SlotInactiveTimeoutCheckAllowed(ReplicationSlot *s);

I think we do not need the above two declarations. The code compiles fine without them, as each use comes after its definition.

4)
+ /*
+  * An error is raised if error_if_invalid is true and the slot has been
+  * invalidated previously.
+  */
+ if (error_if_invalid && s->data.invalidated == RS_INVAL_INACTIVE_TIMEOUT)

The comment is generic while the 'if' condition is specific to one invalidation cause. Even though I feel it could be made a generic test for all invalidation causes, that is not in the scope of this thread and needs more testing/analysis. For the time being, we can make the comment specific to the concerned invalidation cause. The header of the function will also need the same change.

5) SlotInactiveTimeoutCheckAllowed():
+ * Check if inactive timeout invalidation mechanism is disabled or slot is
+ * currently being used or server is in recovery mode or slot on standby is
+ * currently being synced from the primary.

These comments say the exact opposite of what we are checking in the code. Since the function name has 'Allowed' in it, we should write comments that say what allows the check instead of what disallows it.

6)
+ * Synced slots are always considered to be inactive because they don't
+ * perform logical decoding to produce changes.
+ */
+static inline bool
+SlotInactiveTimeoutCheckAllowed(ReplicationSlot *s)

Perhaps we should avoid mentioning logical decoding here. When slots are synced, they are performing decoding and their inactive_since is changing continuously. A better way to make this statement would be: We want to ensure that the slots being synchronized are not invalidated, as they need to be preserved for future use when the standby server is promoted to the primary. This is necessary for resuming logical replication from the new primary server.

7) InvalidatePossiblyObsoleteSlot():
We are calling SlotInactiveTimeoutCheckAllowed() twice in this function; we should optimize. At the first call site, we could simply fetch the timestamp when the cause is RS_INVAL_INACTIVE_TIMEOUT, without checking SlotInactiveTimeoutCheckAllowed(), as IMO this is not a performance-critical section. Or, if we retain the check at the first site, then at the second site we can avoid calling it again based on whether 'now' is set or not (see the sketch below).

thanks
Shveta
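To make suggestion 7 concrete, a hedged sketch follows; the identifiers are those quoted from the patch above, while the surrounding control flow is assumed rather than taken from the actual v45 code:

/* First site: fetching the timestamp unconditionally for this cause is
 * cheap enough here; skip the SlotInactiveTimeoutCheckAllowed() call. */
if (cause == RS_INVAL_INACTIVE_TIMEOUT)
	now = GetCurrentTimestamp();

/* ... */

/* Second site: test 'now' instead of calling
 * SlotInactiveTimeoutCheckAllowed() a second time. */
if (now > 0 &&
	TimestampDifferenceExceeds(s->inactive_since, now,
							   replication_slot_inactive_timeout * 1000))
	invalidation_cause = RS_INVAL_INACTIVE_TIMEOUT;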
Re: race condition in pg_class
On Sat, Sep 7, 2024 at 12:25 AM Noah Misch wrote: > > On Fri, Sep 06, 2024 at 03:22:48PM +0530, Nitin Motiani wrote: > > Thanks. I have no other comments. > > https://commitfest.postgresql.org/49/5090/ remains in status="Needs review". > When someone moves it to status="Ready for Committer", I will commit > inplace090, inplace110, and inplace120 patches. If one of you is comfortable > with that, please modify the status. Done.
Re: Virtual generated columns
On 04.09.24 12:33, Dean Rasheed wrote:
>> I left the 0001 patch alone for now and put the new rewriting
>> implementation into 0002. (Unfortunately, the diff is kind of useless
>> for visual inspection.) Let me know if this matches what you had in
>> mind, please. Also, is this the right place in fireRIRrules()?
>
> Yes, that's what I had in mind except that it has to be called from the
> second loop in fireRIRrules(), after any RLS policies have been added,
> because it's possible for a RLS policy expression to refer to virtual
> generated columns. It's OK to do it in the same loop that expands RLS
> policies, because such policies can only refer to columns of the same
> relation, so once the RLS policies have been expanded for a given
> relation, nothing else should get added to the query that can refer to
> columns of that relation, at that query level, so at that point it
> should be safe to expand virtual generated columns.

If I move the code like that, then the postgres_fdw test fails. So there is some additional interaction there that I need to study.
Re: Virtual generated columns
On 05.09.24 10:27, jian he wrote:
> On Wed, Sep 4, 2024 at 4:40 PM Peter Eisentraut wrote:
>> Here is an implementation of this. It's much nicer! It also appears to
>> fix all the additional test cases that have been presented. (I haven't
>> integrated them into the patch set yet.) I left the 0001 patch alone
>> for now and put the new rewriting implementation into 0002.
>> (Unfortunately, the diff is kind of useless for visual inspection.)
>> Let me know if this matches what you had in mind, please. Also, is this
>> the right place in fireRIRrules()?
>
> hi. some minor issues.
>
> in get_dependent_generated_columns we can change
>
> /* skip if not generated column */
> if (!TupleDescAttr(tupdesc, defval->adnum - 1)->attgenerated)
>     continue;
>
> to
>
> /* skip if not generated stored column */
> if (!(TupleDescAttr(tupdesc, defval->adnum - 1)->attgenerated == ATTRIBUTE_GENERATED_STORED))
>     continue;

I need to study more what to do with this function. I'm not completely sure whether this should apply only to stored generated columns.

> in ExecInitStoredGenerated, when "if ((tupdesc->constr &&
> tupdesc->constr->has_generated_stored))" is true, then after we finish
> the "for (int i = 0; i < natts; i++)" loop we can
> "Assert(ri_NumGeneratedNeeded > 0)", so we can ensure that once the
> has_generated_stored flag is true, we have at least one stored
> generated attribute.

This is technically correct, but this code isn't touched by this patch, so I don't think it belongs here.

> similarly, in expand_generated_columns_internal we can also add
> "Assert(list_length(tlist) > 0);" above
>
> node = ReplaceVarsFromTargetList(node, rt_index, 0, rte, tlist,
>                                  REPLACEVARS_CHANGE_VARNO, rt_index, NULL);

Ok, I'll add that.

> @@ -2290,7 +2291,9 @@ ExecBuildSlotValueDescription(Oid reloid,
>    if (table_perm || column_perm)
>    {
> -     if (slot->tts_isnull[i])
> +     if (att->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
> +         val = "virtual";
> +     else if (slot->tts_isnull[i])
>           val = "null";
>       else
>       {
>           Oid foutoid;
>           bool typisvarlena;
>           getTypeOutputInfo(att->atttypid, &foutoid, &typisvarlena);
>           val = OidOutputFunctionCall(foutoid, slot->tts_values[i]);
>       }
>
> we can add an Assert here, if I understand it correctly, like
>
> if (att->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
> {
>     Assert(slot->tts_isnull[i]);
>     val = "virtual";
> }

Also technically correct, but I don't see what benefit this would bring. The code guarded by that assert would not make use of the thing being asserted.
Re: Commit Timestamp and LSN Inversion issue
On Wed, Sep 4, 2024 at 12:23 PM shveta malik wrote:
>
> Hello hackers,
> (Cc people involved in the earlier discussion)
>
> I would like to discuss the $Subject.
>
> While discussing Logical Replication's Conflict Detection and
> Resolution (CDR) design in [1], it came to our notice that the
> commit LSN and timestamp may not correlate perfectly, i.e., commits may
> happen with LSN1 < LSN2 but with Ts1 > Ts2. This issue may arise
> because, during the commit process, the timestamp (xactStopTimestamp)
> is captured slightly earlier than when space is reserved in the WAL.
>
> ~~
>
> Reproducibility of the conflict-resolution problem due to the timestamp inversion
>
> It was suggested that timestamp inversion *may* impact the time-based
> resolutions such as last_update_wins (targeted to be implemented in
> [1]), as we may end up making wrong decisions if timestamps and LSNs
> are not correctly ordered. And thus we tried some tests but failed to
> find any practical scenario where it could be a problem.
>
> Basically, the proposed conflict resolution is a row-level resolution,
> and to cause the row value to be inconsistent, we need to modify the
> same row in concurrent transactions and commit the changes
> concurrently. But this doesn't seem possible because concurrent
> updates on the same row are disallowed (e.g., the later update will be
> blocked due to the row lock). See [2] for the details.
>
> We gave some thought to multi-table cases as well, e.g., update table
> A with a foreign key and update the table B that table A refers to. But
> the update on table A will block the update on table B as well, so we
> could not reproduce data divergence due to the LSN/timestamp mismatch
> issue there.
>
> ~~
>
> Idea proposed to fix the timestamp inversion issue
>
> There was a suggestion in [3] to acquire the timestamp while reserving
> the space (because that happens in LSN order). The clock would need to
> be monotonic (easy enough with CLOCK_MONOTONIC), but also cheap. The
> main reason why it is done outside the critical section is that
> gettimeofday() may be quite expensive. There's a concept of a hybrid
> clock, combining "time" and a logical counter, which might be useful
> independently of CDR.
>
> On further analysis of this idea, we found that CLOCK_MONOTONIC can be
> passed only to clock_gettime(), which has more precision than
> gettimeofday() and thus is theoretically equally or more expensive (we
> plan to test it and post the results). It does not look like a good
> idea to call either of these while holding a spinlock to reserve the
> WAL position. As for the suggested "hybrid clock" solution, it might
> not help here because the logical counter is only used to order the
> transactions with the same timestamp. The problem here is how to get
> the timestamp along with the WAL position reservation
> (ReserveXLogInsertLocation).

Here are the tests done to compare clock_gettime() and gettimeofday() performance.

Machine details: Intel(R) Xeon(R) CPU E7-4890 v2 @ 2.80GHz; CPU(s): 120; 800GB RAM

Three functions were tested across three different call volumes (1 million, 100 million, and 1 billion):
1) clock_gettime() with CLOCK_REALTIME
2) clock_gettime() with CLOCK_MONOTONIC
3) gettimeofday()

--> clock_gettime() with CLOCK_MONOTONIC sometimes shows slightly better performance, but not consistently. The difference in time taken by all three functions is minimal, with averages varying by no more than ~2.5%.
Overall, the performance between CLOCK_MONOTONIC and gettimeofday() is essentially the same. Below are the test results (each test was run twice for consistency):

1) For 1 million calls:
 1a) clock_gettime() with CLOCK_REALTIME:  Run 1: 0.01770 seconds, Run 2: 0.01772 seconds, Average: 0.01771 seconds.
 1b) clock_gettime() with CLOCK_MONOTONIC: Run 1: 0.01753 seconds, Run 2: 0.01748 seconds, Average: 0.01750 seconds.
 1c) gettimeofday():                       Run 1: 0.01742 seconds, Run 2: 0.01777 seconds, Average: 0.01760 seconds.

2) For 100 million calls:
 2a) clock_gettime() with CLOCK_REALTIME:  Run 1: 1.76649 seconds, Run 2: 1.76602 seconds, Average: 1.76625 seconds.
 2b) clock_gettime() with CLOCK_MONOTONIC: Run 1: 1.72768 seconds, Run 2: 1.72988 seconds, Average: 1.72878 seconds.
 2c) gettimeofday():                       Run 1: 1.72436 seconds, Run 2: 1.72174 seconds, Average: 1.72305 seconds.

3) For 1 billion calls:
 3a) clock_gettime() with CLOCK_REALTIME:  Run 1: 17.63859 seconds, Run 2: 17.65529 seconds, Average: 17.64694 seconds.
 3b) clock_gettime() with CLOCK_MONOTONIC: Run 1: 17.15109 seconds, Run 2: 17.27406 seconds, Average: 17.21257 seconds.
 3c) gettimeofday():                       Run 1: 17.21368 seconds, Run 2: 17.22983 seconds, Average: 17.22175 seconds.

Attached are the scripts used for the tests.

--
Thanks,
Nisha
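The scripts themselves are attachments and not reproduced here, so the following is a minimal standalone sketch, under assumptions (single-threaded, one tight loop per clock source, loop overhead not subtracted), of the kind of micro-benchmark described above:

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

#define NCALLS 100000000UL		/* 100 million, as in test 2 above */

static double
elapsed(const struct timespec *a, const struct timespec *b)
{
	return (b->tv_sec - a->tv_sec) + (b->tv_nsec - a->tv_nsec) / 1e9;
}

int
main(void)
{
	struct timespec start, end, ts;
	struct timeval tv;
	unsigned long i;

	/* 1) clock_gettime() with CLOCK_REALTIME */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < NCALLS; i++)
		clock_gettime(CLOCK_REALTIME, &ts);
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("clock_gettime(CLOCK_REALTIME):  %.5f s\n", elapsed(&start, &end));

	/* 2) clock_gettime() with CLOCK_MONOTONIC */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < NCALLS; i++)
		clock_gettime(CLOCK_MONOTONIC, &ts);
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("clock_gettime(CLOCK_MONOTONIC): %.5f s\n", elapsed(&start, &end));

	/* 3) gettimeofday() */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < NCALLS; i++)
		gettimeofday(&tv, NULL);
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("gettimeofday():                 %.5f s\n", elapsed(&start, &end));

	return 0;
}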
Re: Invalid Assert while validating REPLICA IDENTITY?
On Fri, Sep 6, 2024 at 4:48 PM vignesh C wrote:
>
> On Mon, 2 Sept 2024 at 18:22, Dilip Kumar wrote:
> >
> > On Mon, Sep 2, 2024 at 3:32 PM Amit Kapila wrote:
> > >
> > > On Mon, Sep 2, 2024 at 11:21 AM Dilip Kumar wrote:
> > > >
> > > > While working on some other code I noticed that in
> > > > FindReplTupleInLocalRel() there is an assert [1] that seems to be
> > > > passing IndexRelation to GetRelationIdentityOrPK() whereas it should
> > > > be passing the normal relation.
> > >
> > > Agreed. But this should lead to an assertion failure. Did you try testing it?
> >
> > No, I did not test this particular case. It affected me through some
> > other code I was adding, where I got an Index Relation as input to the
> > RelationGetIndexList() function, and my local changes were impacted by
> > that. I will write a test for this stand-alone case so that it hits
> > the assert. Thanks for looking into this.
>
> The FindReplTupleInLocalRel function can be triggered by both update
> and delete operations, but this only occurs if the relation has been
> marked as updatable by the logicalrep_rel_mark_updatable function. If
> the relation is marked as non-updatable, an error will be thrown by
> check_relation_updatable. Given this, if a relation is updatable, the
> IsIndexUsableForReplicaIdentityFull condition might always evaluate to
> true due to the previous checks in logicalrep_rel_mark_updatable.
> Therefore, it's possible that we might not encounter the Assert
> statement, as IsIndexUsableForReplicaIdentityFull may consistently be
> true.
> Thoughts?

With that, it seems that the first Assert condition is useless, isn't it?

--
Regards, Dilip Kumar EnterpriseDB: http://www.enterprisedb.com
Re: Allow logical failover slots to wait on synchronous replication
On Fri, Aug 30, 2024 at 12:56 AM John H wrote:
>
> Hi Shveta,
>
> Thanks for reviewing it so quickly.
>
> On Thu, Aug 29, 2024 at 2:35 AM shveta malik wrote:
> >
> > Thanks for the patch. Few comments and queries:
> >
> > 1)
> > + static XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE];
> >
> > We shall name it as 'lsns' as there are multiple.
>
> This follows the same naming convention in walsender_private.h
>
> > 2)
> >
> > + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
> > + {
> > + lsn[i] = InvalidXLogRecPtr;
> > + }
> >
> > Can we do it like below, similar to what you have done at another place:
> > memset(lsn, InvalidXLogRecPtr, sizeof(lsn));
>
> I don't think memset works in this case? Well, I think it *technically*
> works, but I'm not sure that's something worth optimizing.
> If I understand correctly, memset takes in a char for the value and
> not an XLogRecPtr (uint64).
>
> So something like: memset(lsn, 0, sizeof(lsn))
>
> InvalidXLogRecPtr is defined as 0, so I think it works, but there's an
> implicit dependency here for correctness.
>
> > 3)
> > + if (!initialized)
> > + {
> > + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
> > + {
> > + lsn[i] = InvalidXLogRecPtr;
> > + }
> > + }
> >
> > I do not see 'initialized' set to TRUE anywhere. Can you please
> > elaborate the intent here?
>
> You're right, I thought I fixed this. Will update.
>
> > 4)
> > + int mode = SyncRepWaitMode;
> > + int i;
> > +
> > + if (!initialized)
> > + {
> > + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
> > + {
> > + lsn[i] = InvalidXLogRecPtr;
> > + }
> > + }
> > + if (mode == SYNC_REP_NO_WAIT)
> > + return true;
> >
> > I do not understand this code well. As Amit also pointed out, 'mode'
> > may change. When we initialize 'mode', let's say SyncRepWaitMode is
> > SYNC_REP_NO_WAIT, but by the time we check 'if (mode ==
> > SYNC_REP_NO_WAIT)', SyncRepWaitMode has changed to, say,
> > SYNC_REP_WAIT_FLUSH; if so, then we will wrongly return true from
> > here. Is that a possibility? ProcessConfigFile() is in the caller, and
> > thus we may end up using the wrong mode.
>
> Yes, it's possible for mode to change. In my comment to Amit in the other
> thread, I think we have to store mode, base our execution of this logic
> on it, and ignore SyncRepWaitMode changing due to ProcessConfigFile/SIGHUP
> for one iteration.
>
> We can store the value of mode later, so something like:
>
> > if (SyncRepWaitMode == SYNC_REP_NO_WAIT)
> > return true;
> > mode = SyncRepWaitMode
> > if (lsn[mode] >= wait_for_lsn)
> > return true;
>
> But it's the same issue, which is that when you check lsn[mode],
> SyncRepWaitMode could have changed to something else, so you always have
> to initialize the value and will always have this discrepancy.
>
> I'm skeptical end users are changing SyncRepWaitMode in their database
> clusters, as it has implications for their durability, and I would assume
> they run with the same durability guarantees.

I was trying to have a look at the patch again; it doesn't apply on HEAD and needs a rebase. Regarding 'mode = SyncRepWaitMode': FWIW, SyncRepWaitForLSN() also does it in a similar way. It gets the mode in a local variable initially and uses it later. See [1]. So isn't there a chance there too that 'SyncRepWaitMode' can change in between?

[1]:
mode = SyncRepWaitMode;
...
if (!WalSndCtl->sync_standbys_defined || lsn <= WalSndCtl->lsn[mode])
{
	LWLockRelease(SyncRepLock);
	return;
}

thanks
Shveta
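Since the thread keeps circling around it, here is a tiny self-contained sketch of the memset point; NUM_SYNC_REP_WAIT_MODE is hard-coded only to keep the example standalone, and the equivalence to the loop holds only because InvalidXLogRecPtr is defined as 0:

#include <string.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;
#define InvalidXLogRecPtr	((XLogRecPtr) 0)
#define NUM_SYNC_REP_WAIT_MODE	3	/* assumed here for illustration */

static XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE];

/* Explicit loop: correct no matter how the sentinel is defined. */
static void
init_with_loop(void)
{
	int			i;

	for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
		lsn[i] = InvalidXLogRecPtr;
}

/*
 * memset() fills bytes, not XLogRecPtr values, so passing
 * InvalidXLogRecPtr as the fill byte would be wrong in general.
 * Zero-filling happens to produce the right result only because
 * InvalidXLogRecPtr is 0 -- the implicit dependency noted above.
 */
static void
init_with_memset(void)
{
	memset(lsn, 0, sizeof(lsn));
}

int
main(void)
{
	init_with_loop();
	init_with_memset();
	return 0;
}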
Re: per backend I/O statistics
Hi,

On Fri, Sep 06, 2024 at 03:03:17PM +, Bertrand Drouvot wrote:
> - If we agree on the current proposal then I'll do some refactoring in
>   pg_stat_get_backend_io(), pg_stat_get_my_io() and pg_stat_get_io() to avoid
>   duplicated code (it's not done yet to ease the review).

Please find attached v3, a mandatory rebase due to fc415edf8c.

Regards,

-- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com

From 335949f1c408a99aed74c0231d5e436b5f319746 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot
Date: Thu, 22 Aug 2024 15:16:50 +
Subject: [PATCH v3 1/2] per backend I/O statistics

While pg_stat_io provides cluster-wide I/O statistics, this commit adds a new pg_my_stat_io view to display "my" backend I/O statistics. The KIND_IO stats are still "fixed amount" ones, as the maximum number of backends is fixed.

The statistics snapshot is made for the global stats (the aggregated ones) and for my backend stats. The snapshot is not built in each backend to copy all the other backends' stats, as 1/ there is no use case (there is no need to get other backends' I/O statistics while taking care of the stats_fetch_consistency) and 2/ that could be memory expensive depending on the number of max connections.

A subsequent commit will add a new pg_stat_get_backend_io() function to be able to retrieve the I/O statistics for a given backend pid (each execution re-fetches counters from shared memory because, as stated above, there are no snapshots being created in each backend to copy the other backends' stats).

Bump catalog version needs to be done.
---
 doc/src/sgml/config.sgml                  |   4 +-
 doc/src/sgml/monitoring.sgml              |  28 ++
 src/backend/catalog/system_views.sql      |  22 +
 src/backend/utils/activity/pgstat.c       |  11 ++-
 src/backend/utils/activity/pgstat_io.c    |  91 ++---
 src/backend/utils/activity/pgstat_shmem.c |   4 +-
 src/backend/utils/adt/pgstatfuncs.c       | 114 +-
 src/include/catalog/pg_proc.dat           |   9 ++
 src/include/pgstat.h                      |  14 ++-
 src/include/utils/pgstat_internal.h       |  14 ++-
 src/test/regress/expected/rules.out       |  19
 src/test/regress/expected/stats.out       |  44 -
 src/test/regress/sql/stats.sql            |  25 -
 13 files changed, 366 insertions(+), 33 deletions(-)
 8.5% doc/src/sgml/ 29.4% src/backend/utils/activity/ 23.9% src/backend/utils/adt/ 4.4% src/include/catalog/ 3.1% src/include/utils/ 3.1% src/include/ 14.4% src/test/regress/expected/ 10.0% src/test/regress/sql/

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0aec11f443..2a59b97093 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8331,7 +8331,9 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
 displayed in pg_stat_database, -pg_stat_io, in the output of +pg_stat_io, + +pg_my_stat_io, in the output of when the BUFFERS option is used, in the output of when the VERBOSE option is used, by autovacuum

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 933de6fe07..b28ca4e0ca 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -488,6 +488,16 @@ postgres 27093 0.0 0.0 30096 2752 ?Ss 11:34 0:00 postgres: ser
 + + pg_my_stat_iopg_my_stat_io + + One row for each combination of context and target object containing + my backend I/O statistics. + See + pg_my_stat_io for details.
+ + + pg_stat_replication_slotspg_stat_replication_slots One row per replication slot, showing statistics about the @@ -2948,6 +2958,24 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + + pg_my_stat_io + + + pg_my_stat_io + + + + The pg_my_stat_io view will contain one row for each + combination of target I/O object and I/O context, showing + my backend I/O statistics. Combinations which do not make sense are + omitted. The fields are exactly the same as the ones in the pg_stat_io + view. + + diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 7fd5d256a1..b4939b7bc6 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1168,6 +1168,28 @@ SELECT b.stats_reset FROM pg_stat_get_io() b; +CREATE VIEW pg_my_stat_io AS +SELECT + b.backend_type, + b.object, + b.context, + b.reads, + b.read_time, + b.writes, + b.write_time, + b.writebacks, + b.writeback_time, + b.extends, + b.extend_time, + b.op_bytes, + b.hits, + b.evictions, + b.reuses, + b.fsyncs, + b.fsync_time, + b.stats_rese
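For orientation while reviewing, a hypothetical usage example of the view added by this patch (pg_my_stat_io exists only with the v3 patch applied; the column names are taken from its system_views.sql hunk):

-- Show this backend's non-zero I/O counters:
SELECT backend_type, object, context, reads, writes, extends, hits
FROM pg_my_stat_io
WHERE reads <> 0 OR writes <> 0 OR extends <> 0;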
Re: Partitioned tables and [un]loggedness
On Tue, Sep 03, 2024 at 10:33:02AM -0500, Nathan Bossart wrote:
> This works correctly for CREATE TABLE, but ALTER TABLE still succeeds.
> Interestingly, it doesn't seem to actually change relpersistence for
> partitioned tables. I think we might need to adjust
> ATPrepChangePersistence().

Adjusting ATPrepChangePersistence() looks incorrect to me, seeing that we have all the machinery in ATSimplePermissions() at our disposal, and that this routine decides that ATT_TABLE should map to both RELKIND_RELATION and RELKIND_PARTITIONED_TABLE. How about inventing a new ATT_PARTITIONED_TABLE and making a clean split between both relkinds? I'd guess that blocking both SET LOGGED and UNLOGGED for partitioned tables is the best move, even if it is possible to block only one or the other, of course.
--
Michael

signature.asc Description: PGP signature
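To make the idea concrete, here is a small standalone model of the proposed split, assuming the ATT_* bitmask convention used by ATSimplePermissions(); the flag value and the call-site policy are illustrative, not a committed design:

#include <stdio.h>

/* Modeled after tablecmds.c's target-type bitmask; values assumed. */
#define ATT_TABLE             0x0001
#define ATT_PARTITIONED_TABLE 0x0100	/* the proposed new flag */

#define RELKIND_RELATION          'r'
#define RELKIND_PARTITIONED_TABLE 'p'

/* With the split, a partitioned table no longer maps to ATT_TABLE. */
static int
target_for_relkind(char relkind)
{
	switch (relkind)
	{
		case RELKIND_RELATION:
			return ATT_TABLE;
		case RELKIND_PARTITIONED_TABLE:
			return ATT_PARTITIONED_TABLE;
		default:
			return 0;
	}
}

int
main(void)
{
	/* SET [UN]LOGGED would pass only ATT_TABLE... */
	int		set_logged_allowed = ATT_TABLE;
	/* ...while subcommands supporting both relkinds would pass both. */
	int		generic_allowed = ATT_TABLE | ATT_PARTITIONED_TABLE;
	int		target = target_for_relkind(RELKIND_PARTITIONED_TABLE);

	printf("SET LOGGED on partitioned table: %s\n",
		   (target & set_logged_allowed) ? "allowed" : "rejected");
	printf("generic ALTER on partitioned table: %s\n",
		   (target & generic_allowed) ? "allowed" : "rejected");
	return 0;
}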