Re: SPI_connect, SPI_connect_ext return type
Stepan Neretin writes:
>> This combines portions of Stepan's two patches with some additional
>> work (mostly, that he'd missed fixing any of contrib/).

> Thank you for the feedback! I believe the patch looks satisfactory.
> Should we await a review? The changes seem straightforward to me.

I too think it's good to go. If no one complains or asks for more time
to review, I will push it Monday or so.

			regards, tom lane
Re: Sort functions with specialized comparators
Hi! I rebase, clean and some refactor my patches.

Best regards, Stepan Neretin.

From f88cbb80e478d5ac3f23945b4ba0ee881f0d9cd4 Mon Sep 17 00:00:00 2001
From: "Andrey M. Borodin"
Date: Sun, 8 Sep 2024 15:43:39 +0700
Subject: [PATCH v2 01/10] Use specialized sort facilities

---
 contrib/intarray/_int.h      | 12
 contrib/intarray/_int_gist.c |  2 +-
 contrib/intarray/_int_op.c   | 19 +--
 contrib/intarray/_int_tool.c | 12
 src/include/port.h           |  2 ++
 src/port/qsort.c             | 35 +++
 6 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/contrib/intarray/_int.h b/contrib/intarray/_int.h
index 0352cbd368..5225c9090a 100644
--- a/contrib/intarray/_int.h
+++ b/contrib/intarray/_int.h
@@ -176,16 +176,4 @@ bool execconsistent(QUERYTYPE *query, ArrayType *array, bool calcnot);
 bool gin_bool_consistent(QUERYTYPE *query, bool *check);
 bool query_has_required_values(QUERYTYPE *query);
 
-int	compASC(const void *a, const void *b);
-int	compDESC(const void *a, const void *b);
-
-/* sort, either ascending or descending */
-#define QSORT(a, direction) \
-	do { \
-		int		_nelems_ = ARRNELEMS(a); \
-		if (_nelems_ > 1) \
-			qsort((void*) ARRPTR(a), _nelems_, sizeof(int32), \
-				  (direction) ? compASC : compDESC ); \
-	} while(0)
-
 #endif							/* ___INT_H__ */
diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c
index a09b7fa812..d39e40c66a 100644
--- a/contrib/intarray/_int_gist.c
+++ b/contrib/intarray/_int_gist.c
@@ -150,7 +150,7 @@ g_int_union(PG_FUNCTION_ARGS)
 			ptr += nel;
 		}
 
-		QSORT(res, 1);
+		sort_int32_asc(ARRPTR(res), ARRNELEMS(res));
 		res = _int_unique(res);
 		*size = VARSIZE(res);
 		PG_RETURN_POINTER(res);
diff --git a/contrib/intarray/_int_op.c b/contrib/intarray/_int_op.c
index 5b164f6788..34d3aa183f 100644
--- a/contrib/intarray/_int_op.c
+++ b/contrib/intarray/_int_op.c
@@ -198,7 +198,6 @@ sort(PG_FUNCTION_ARGS)
 	text	   *dirstr = (fcinfo->nargs == 2) ? PG_GETARG_TEXT_PP(1) : NULL;
 	int32		dc = (dirstr) ? VARSIZE_ANY_EXHDR(dirstr) : 0;
 	char	   *d = (dirstr) ? VARDATA_ANY(dirstr) : NULL;
-	int			dir = -1;
 
 	CHECKARRVALID(a);
 	if (ARRNELEMS(a) < 2)
@@ -208,18 +207,18 @@ sort(PG_FUNCTION_ARGS)
 		 && (d[0] == 'A' || d[0] == 'a')
 		 && (d[1] == 'S' || d[1] == 's')
 		 && (d[2] == 'C' || d[2] == 'c')))
-		dir = 1;
+		sort_int32_asc(ARRPTR(a), ARRNELEMS(a));
 	else if (dc == 4
 			 && (d[0] == 'D' || d[0] == 'd')
 			 && (d[1] == 'E' || d[1] == 'e')
 			 && (d[2] == 'S' || d[2] == 's')
 			 && (d[3] == 'C' || d[3] == 'c'))
-		dir = 0;
-	if (dir == -1)
+		sort_int32_desc(ARRPTR(a), ARRNELEMS(a));
+	else
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("second parameter must be \"ASC\" or \"DESC\"")));
-	QSORT(a, dir);
+
 	PG_RETURN_POINTER(a);
 }
 
@@ -229,7 +228,7 @@ sort_asc(PG_FUNCTION_ARGS)
 	ArrayType  *a = PG_GETARG_ARRAYTYPE_P_COPY(0);
 
 	CHECKARRVALID(a);
-	QSORT(a, 1);
+	sort_int32_asc(ARRPTR(a), ARRNELEMS(a));
 	PG_RETURN_POINTER(a);
 }
 
@@ -239,7 +238,7 @@ sort_desc(PG_FUNCTION_ARGS)
 	ArrayType  *a = PG_GETARG_ARRAYTYPE_P_COPY(0);
 
 	CHECKARRVALID(a);
-	QSORT(a, 0);
+	sort_int32_desc(ARRPTR(a), ARRNELEMS(a));
 	PG_RETURN_POINTER(a);
 }
 
@@ -381,7 +380,7 @@ intset_union_elem(PG_FUNCTION_ARGS)
 	result = intarray_add_elem(a, PG_GETARG_INT32(1));
 	PG_FREE_IF_COPY(a, 0);
-	QSORT(result, 1);
+	sort_int32_asc(ARRPTR(result), ARRNELEMS(result));
 	PG_RETURN_POINTER(_int_unique(result));
 }
 
@@ -403,10 +402,10 @@ intset_subtract(PG_FUNCTION_ARGS)
 	CHECKARRVALID(a);
 	CHECKARRVALID(b);
-	QSORT(a, 1);
+	sort_int32_asc(ARRPTR(a), ARRNELEMS(a));
 	a = _int_unique(a);
 	ca = ARRNELEMS(a);
-	QSORT(b, 1);
+	sort_int32_asc(ARRPTR(b), ARRNELEMS(b));
 	b = _int_unique(b);
 	cb = ARRNELEMS(b);
 	result = new_intArrayType(ca);
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index c85280c842..e83c6aadc6 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -393,15 +393,3 @@ int_to_intset(int32 elem)
 	aa[0] = elem;
 	return result;
 }
-
-int
-compASC(const void *a, const void *b)
-{
-	return pg_cmp_s32(*(const int32 *) a, *(const int32 *) b);
-}
-
-int
-compDESC(const void *a, const void *b)
-{
-	return pg_cmp_s32(*(const int32 *) b, *(const int32 *) a);
-}
diff --git a/src/include/port.h b/src/include/port.h
index ba9ab0d34f..b82f969fc5 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -443,6 +443,8 @@ extern char *strsep(char **stringp, const char *delim);
 extern void pg_qsort(void *base, size_t nel, size_t elsize,
 					 int (*cmp) (const void *, const void *));
 extern int	pg_qsort_strcmp(const void *a, const void *b);
+extern void sort_int32_asc(int32 *base, size_t nel);
+extern void sort_int32_desc(int32 *base, size_t nel);
 
 #define qsort(a,b,c,d) pg_qsort(a,b,c,d)
 
diff --git a/src/port/qsort.c b/src/port/qsort.c
index 7879e6cd56..5175c8a6dd 100644
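
(The body of the qsort.c hunk is truncated above. As a reading aid, here is a
hedged sketch of what the two helpers could look like if built on the existing
src/include/lib/sort_template.h machinery; the ST_* macros and pg_cmp_s32()
are real PostgreSQL APIs, but whether the patch actually defines the functions
this way is an assumption.)

/* Sketch only: a plausible src/port/qsort.c addition, not the patch's
 * actual contents.  sort_template.h generates a full qsort for the given
 * element type and comparator, then #undefs its parameter macros, so it
 * can be included twice with different settings. */
#include "c.h"
#include "common/int.h"			/* pg_cmp_s32() */

#define ST_SORT sort_int32_asc
#define ST_ELEMENT_TYPE int32
#define ST_COMPARE(a, b) pg_cmp_s32(*(a), *(b))
#define ST_SCOPE
#define ST_DEFINE
#include "lib/sort_template.h"

#define ST_SORT sort_int32_desc
#define ST_ELEMENT_TYPE int32
#define ST_COMPARE(a, b) pg_cmp_s32(*(b), *(a))
#define ST_SCOPE
#define ST_DEFINE
#include "lib/sort_template.h"

Call sites then read sort_int32_asc(ARRPTR(a), ARRNELEMS(a)), as in the
hunks above.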
Deadlock due to locking order violation while inserting into a leaf relation
Basically, when we are inserting into a leaf relation (or a lower-level
relation of a partitioned relation), we acquire the lock on the leaf
relation at parsing time, whereas the parent's lock is acquired during
generate_partition_qual(). Now, if we concurrently try to drop the
partition root, the drop will acquire the locks in the reverse order,
i.e. parent first and then child, so this can create a deadlock. The
example below reproduces this case.

Setup:
CREATE TABLE test(a int, b int) partition by range(a);
CREATE TABLE test1 partition of test for values from (1) to (10);

Test:
--Session1:
INSERT INTO test1 VALUES (1, 4);
-- let session 1 lock the relation test1 and make it wait before it
-- locks test (put a breakpoint in ExecInitModifyTable)

--Session2:
-- try to drop the top table, which will try to take AccessExclusive
-- lock on all partitions
DROP TABLE test;

--Session3:
-- look at pg_locks: we can see that session2 has locked the root table
-- test(16384) and is waiting on test1(16387), as session1 is holding
-- that lock

 locktype | database | relation |  pid  |        mode         | granted
----------+----------+----------+-------+---------------------+---------
 relation |        5 |    16387 | 30368 | RowExclusiveLock    | t
 relation |        5 |    16387 | 30410 | AccessExclusiveLock | f
 relation |        5 |    16384 | 30410 | AccessExclusiveLock | t
(11 rows)

--Session1: now, as soon as you continue in gdb in session 1, it will
hit the deadlock

ERROR:  40P01: deadlock detected
DETAIL:  Process 30368 waits for AccessShareLock on relation 16384 of
database 5; blocked by process 30410.
Process 30410 waits for AccessExclusiveLock on relation 16387 of
database 5; blocked by process 30368.
HINT:  See server log for query details.
LOCATION:  DeadLockReport, deadlock.c:1135

--
Regards,
Dilip Kumar
EnterpriseDB: http://www.enterprisedb.com
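
(Not part of the report above, just to make the ordering rule concrete: a
hedged sketch of the usual cure for this class of deadlock, taking locks
root-first along the partition tree. get_partition_ancestors(),
LockRelationOid(), and the list API are real PostgreSQL functions; the
wrapper itself and where it would be called from are invented for
illustration, not a proposed patch.)

#include "postgres.h"
#include "catalog/partition.h"		/* get_partition_ancestors() */
#include "nodes/pg_list.h"
#include "storage/lmgr.h"			/* LockRelationOid() */

/*
 * Illustrative only: give every code path the same lock order, root
 * first, leaf last.  get_partition_ancestors() returns the nearest
 * ancestor first, so walk the list backwards to start at the root.
 */
static void
lock_partition_tree_top_down(Oid leafrelid, LOCKMODE lockmode)
{
	List	   *ancestors = get_partition_ancestors(leafrelid);

	for (int i = list_length(ancestors) - 1; i >= 0; i--)
		LockRelationOid(list_nth_oid(ancestors, i), lockmode);

	LockRelationOid(leafrelid, lockmode);	/* finally, the leaf itself */
	list_free(ancestors);
}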
Re: Sort functions with specialized comparators
On Sun, 8 Sept 2024 at 20:51, Stepan Neretin wrote:
> Hi! I rebase, clean and some refactor my patches.

I'm unsure what exactly is going on with this thread. It started with
Andrey proposing a patch to speed up intarray sorting and now it's
turned into you proposing 10 patches which implement a series of sort
specialisation functions without any justification as to why the change
is useful.

If you want to have a performance patch accepted, then you'll need to
show your test case and the performance results before and after. What
this patch series looks like to me is that you've just searched the
code base for qsort and just implemented a specialised qsort version
without any regard as to whether the change is useful or not.

For example, looking at v2-0006, you've added a specialisation to sort
the columns which are specified in the CREATE STATISTICS command. This
seems highly unlikely to be useful. The number of elements in this
array is limited by STATS_MAX_DIMENSIONS, which is 8. Are you claiming
the sort specialisation you've added makes a meaningful performance
improvement to sorting an 8 element array?

It looks to me like you've just derailed Andrey's proposal. I suggest
you validate which ones of these patches you can demonstrate produce a
meaningful performance improvement, ditch the remainder, and then start
your own thread showing your test case and results.

David
Re: [PoC] Add CANONICAL option to xmlserialize
The following review has been posted through the commitfest application:
make installcheck-world:  tested, failed
Implements feature:       tested, failed
Spec compliant:           tested, failed
Documentation:            tested, failed

LGTM

The new status of this patch is: Ready for Committer
Re: Introduce XID age and inactive timeout based replication slot invalidation
Hi,

Thanks for reviewing.

On Mon, Sep 2, 2024 at 1:37 PM Peter Smith wrote:
>
> Commit message.
>
> 1.
> Because such synced slots are typically considered not
> active (for them to be later considered as inactive) as they don't
> perform logical decoding to produce the changes.
>
> This sentence is bad grammar. The docs have the same wording, so
> please see my doc review comment #4 suggestion below.

+1

> 2.
> +
> +        Invalidates replication slots that are inactive for longer than
> +        specified amount of time. If this value is specified without units,
> +        it is taken as seconds. A value of zero (which is default) disables
> +        the timeout mechanism. This parameter can only be set in
> +        the postgresql.conf file or on the server
> +        command line.
> +
> +
>
> nit - This is OK as-is, but OTOH why not make the wording consistent
> with the previous GUC description? (e.g. see my v43 [1] #2 review
> comment)

+1.

> 3.
> +
> +        This invalidation check happens either when the slot is acquired
> +        for use or during checkpoint. The time since the slot has become
> +        inactive is known from its
> +        inactive_since value using which the
> +        timeout is measured.
> +
> +
>
> I felt this is slightly misleading because slot acquiring has nothing
> to do with setting the slot invalidation anymore. Furthermore, the 2nd
> sentence is bad grammar.
>
> nit - IMO something simple like the following rewording can address
> both of those points:
>
> Slot invalidation due to inactivity timeout occurs during checkpoint.
> The duration of slot inactivity is calculated using the slot's
> inactive_since field value.

+1.

> 4.
> +        Because such synced slots are typically considered not active
> +        (for them to be later considered as inactive) as they don't perform
> +        logical decoding to produce the changes.
>
> That sentence has bad grammar.
>
> nit - suggest a much simpler replacement:
> Synced slots are always considered to be inactive because they don't
> perform logical decoding to produce changes.

+1.

> 5.
> +#define IsInactiveTimeoutSlotInvalidationApplicable(s) \
>
> 5a.
> I felt this would be better implemented as an inline function. Then it
> can be commented on properly to explain the parts of the condition.
> e.g. the large comment currently in InvalidatePossiblyObsoleteSlot()
> would be more appropriate in this function.

+1.

> 5b.
> The name is very long. Can't it be something shorter/simpler like:
> 'IsSlotATimeoutCandidate()'
>
> ~~~

Missing "inactive" in the above suggested name. Used
SlotInactiveTimeoutCheckAllowed, similar to XLogInsertAllowed.

> 6. ReplicationSlotAcquire
>
> -ReplicationSlotAcquire(const char *name, bool nowait)
> +ReplicationSlotAcquire(const char *name, bool nowait,
> +                       bool check_for_invalidation)
>
> nit - Previously this new parameter really did mean to "check" for
> [and set the slot] invalidation. But now I suggest renaming it to
> 'error_if_invalid' to properly reflect the new usage. And also in the
> slot.h.

+1.

> 7.
> + /*
> +  * Error out if the slot has been invalidated previously. Because there's
> +  * no use in acquiring the invalidated slot.
> +  */
>
> nit - The comment is contrary to the code. If there was no reason to
> skip this error, then you would not have the new parameter allowing
> you to skip this error. I suggest just repeating the same comment as
> in the function header.

+1.

> 8. ReportSlotInvalidation
>
> nit - Added some blank lines for consistency.

+1.

> 9. InvalidatePossiblyObsoleteSlot
>
> 9a.
> Consistency is good (commit message, docs and code comments for this),
> but the added sentence has bad grammar. Please see the docs review
> comment #4 above for some alternate phrasing.

+1.

> 9b.
> Now that this logic is moved into a macro (I suggested it should be an
> inline function) IMO this comment does not belong here anymore because
> it is commenting code that you cannot see. Instead, this comment (or
> something like it) should be as comments within the new function.
>
> ==
> src/include/replication/slot.h

+1.

> 10.
> +extern void ReplicationSlotAcquire(const char *name, bool nowait,
> +                                   bool check_for_invalidation);
>
> Change the new param name as described in the earlier review comment.

+1.

> Please refer to the attached file which implements some of the nits
> mentioned above.

Merged the diff into v45. Thanks.

On Tue, Sep 3, 2024 at 12:26 PM Peter Smith wrote:
>
> TEST CASE #1
>
> 1.
> +# Wait for the inactive replication slot to be invalidated.
>
> Is that comment correct? IIUC the synced slot should *already* be
> invalidated from the primary, so here we are not really "waiting" for
> it to be invalidated; Instead, we are just "confirming" that the
> synchronized slot is already invalidated with the correct reason as
> expected.

Modified the comment.

> 2.
> +# Synced slot mustn't get invalidated on the stan
Re: Introduce XID age and inactive timeout based replication slot invalidation
Hi,

Thanks for reviewing.

On Tue, Sep 3, 2024 at 3:01 PM shveta malik wrote:
>
> 1)
> I see that ReplicationSlotAlter() will error out if the slot is
> invalidated due to timeout. I have not tested it myself, but do you
> know if slot-alter errors out for other invalidation causes as well?
> Just wanted to confirm that the behaviour is consistent for all
> invalidation causes.

Will respond to Amit's comment soon.

> 2)
> When a slot is invalidated, and we try to use that slot, it gives this msg:
>
> ERROR: can no longer get changes from replication slot "mysubnew1_2"
> DETAIL: The slot became invalid because it was inactive since
> 2024-09-03 14:23:34.094067+05:30, which is more than 600 seconds ago.
> HINT: You might need to increase "replication_slot_inactive_timeout.".
>
> Isn't HINT misleading? Even if we increase it now, the slot can not be
> reused again.
>
> Below is one side effect if inactive_since keeps on changing:
>
> postgres=# SELECT * FROM pg_replication_slot_advance('mysubnew1_1',
> pg_current_wal_lsn());
> ERROR: can no longer get changes from replication slot "mysubnew1_1"
> DETAIL: The slot became invalid because it was inactive since
> 2024-09-04 10:03:56.68053+05:30, which is more than 10 seconds ago.
> HINT: You might need to increase "replication_slot_inactive_timeout.".
>
> postgres=# select now();
>               now
> ---------------------------------
>  2024-09-04 10:04:00.26564+05:30
>
> 'DETAIL' gives wrong information, we are not past 10-seconds. This is
> because inactive_since got updated even in the ERROR scenario.
>
> ERROR: can no longer get changes from replication slot "mysubnew1_1"
> DETAIL: The slot became invalid because it was inactive since
> 2024-09-04 10:06:38.980939+05:30, which is more than 129600 seconds
> ago.
> postgres=# select now();
>               now
> ----------------------------------
>  2024-09-04 10:07:35.201894+05:30
>
> I feel we should change this message itself.

Removed the hint and corrected the detail message as follows:

errmsg("can no longer get changes from replication slot \"%s\"",
       NameStr(s->data.name)),
errdetail("This slot has been invalidated because it was inactive for
longer than the amount of time specified by \"%s\".",
          "replication_slot_inactive_timeout.")));

> 3)
> When the slot is invalidated, the 'inactive_since' still keeps on
> changing when there is a subscriber trying to start replication
> continuously. I think ReplicationSlotAcquire() keeps on failing and
> thus Release keeps on setting it again and again. Shouldn't we stop
> setting/changing 'inactive_since' once the slot is invalidated
> already, otherwise it will be misleading.
>
> postgres=# select failover,synced,inactive_since,invalidation_reason
> from pg_replication_slots;
>
> failover | synced | inactive_since      | invalidation_reason
> ---------+--------+---------------------+---------------------
> t        | f      | 2024-09-03 14:23:.. | inactive_timeout
>
> after sometime:
> failover | synced | inactive_since      | invalidation_reason
> ---------+--------+---------------------+---------------------
> t        | f      | 2024-09-03 14:26:.. | inactive_timeout

Changed it to not update inactive_since for slots invalidated due to
inactive timeout.

> 4)
> src/sgml/config.sgml:
>
> 4a)
> + A value of zero (which is default) disables the timeout mechanism.
>
> Better will be:
> A value of zero (which is default) disables the inactive timeout
> invalidation mechanism.

Changed.

> 4b)
> 'synced' and inactive_since should point to pg_replication_slots:
>
> example:
> linkend="view-pg-replication-slots">pg_replication_slots.synced

Modified.

> 5)
> src/sgml/system-views.sgml:
> + ..the slot has been inactive for longer than the duration specified
> by replication_slot_inactive_timeout parameter.
>
> Better to have:
> ..the slot has been inactive for a time longer than the duration
> specified by the replication_slot_inactive_timeout parameter.

Changed it to the following, to be consistent with config.sgml:
inactive_timeout means that the slot has been inactive for longer than
the amount of time specified by the parameter.

Please find the v45 patch posted upthread at
https://www.postgresql.org/message-id/CALj2ACWXQT3_HY40ceqKf1DadjLQP6b1r%3D0sZRh-xhAOd-b0pA%40mail.gmail.com
for the changes.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Re: Sort functions with specialized comparators
Hi, why do you think that I rejected Andrey's offer? I included his
patch first in my own. Yes, patch 2-0006 is the only patch to which I
have not attached any statistics and it looks really dubious. But the
rest seem useful. Above, I attached a speed graph for one of the
patches and tps (pgbench).

What do you think is the format in which to make benchmarks for my
patches?

Best regards, Stepan Neretin.
Re: Add RESPECT/IGNORE NULLS and FROM FIRST/LAST options
On 9/7/24 22:25, Oliver Ford wrote:
> On Sat, May 6, 2023 at 9:41 AM Oliver Ford wrote:
>> On Sat, 6 May 2023, 04:57 Tatsuo Ishii, wrote:
>>> Attached is the patch to implement this (on top of your patch).
>>>
>>> test=# SELECT row_number() RESPECT NULLS OVER () FROM (SELECT 1) AS s;
>>> ERROR: window function row_number cannot have RESPECT NULLS or IGNORE NULLS
>>
>> The last time this was discussed
>> (https://www.postgresql.org/message-id/1037735.1610402426%40sss.pgh.pa.us)
>> it was suggested to make the feature generalizable, beyond what the
>> standard says it should be limited to. With it generalizable, there
>> would need to be extra checks for custom functions, such as if they
>> allow multiple column arguments (which I'll add in v2 of the patch if
>> the design's accepted). So I think we need a consensus on whether to
>> stick to limiting it to several specific functions, or making it
>> generalized yet agreeing the rules to limit it (such as no agg
>> functions, and no functions with multiple column arguments).
>
> Reviving this thread, I've attached a rebased patch with code, docs,
> and tests and added it to the November commitfest.

Excellent! One of these days we'll get this in. :-)

I have a problem with this test, though:

SELECT sum(orbit) RESPECT NULLS OVER () FROM planets; -- succeeds

Why should that succeed? Especially since aggregates such as SUM() will
ignore nulls! The error message on its partner seems to confirm this:

SELECT sum(orbit) IGNORE NULLS OVER () FROM planets; -- fails
ERROR: aggregate functions do not accept RESPECT/IGNORE NULLS

I believe they should both fail.
--
Vik Fearing
Re: [PoC] Add CANONICAL option to xmlserialize
On Sun, 2024-09-08 at 11:43 +, Oliver Ford wrote:
> The following review has been posted through the commitfest application:
> make installcheck-world:  tested, failed
> Implements feature:       tested, failed
> Spec compliant:           tested, failed
> Documentation:            tested, failed
>
> LGTM
>
> The new status of this patch is: Ready for Committer

Huh?  Do you mean "tested, passes"?

Yours,
Laurenz Albe
Re: [PoC] Add CANONICAL option to xmlserialize
On Sun, Sep 8, 2024 at 2:44 PM Laurenz Albe wrote:
>
> On Sun, 2024-09-08 at 11:43 +, Oliver Ford wrote:
> > The following review has been posted through the commitfest application:
> > make installcheck-world:  tested, failed
> > Implements feature:       tested, failed
> > Spec compliant:           tested, failed
> > Documentation:            tested, failed
> >
> > LGTM
> >
> > The new status of this patch is: Ready for Committer
>
> Huh?  Do you mean "tested, passes"?
>
> Yours,
> Laurenz Albe

Whoops, yes all tests and docs pass!
Re: Yet another way for pg_ctl stop to fail on Windows
07.09.2024 21:11, Noah Misch wrote:
>>> Noah, what do you think of handling this error in line with handling of
>>> ERROR_BROKEN_PIPE and ERROR_BAD_PIPE (which was done in 0ea1f2a3a)?
>>>
>>> I tried the following change:
>>>     switch (GetLastError())
>>>     {
>>>         case ERROR_BROKEN_PIPE:
>>>         case ERROR_BAD_PIPE:
>>> +       case ERROR_PIPE_BUSY:
>>> and saw no issues.
>>
>> That would be a strict improvement over returning EINVAL like we do
>> today. We do use PIPE_UNLIMITED_INSTANCES, so I expect the causes of
>> ERROR_PIPE_BUSY are process exit and ENOMEM-like situations. While that
>> change is the best thing if the process is exiting, it could silently
>> drop the signal in ENOMEM-like situations. Consider the following
>> alternative. If sig==0, just return 0 like you propose, because the
>> process isn't completely gone. Otherwise, sleep and retry the signal,
>> like pgwin32_open_handle() retries after certain errors. What do you
>> think of that?

Thank you for your attention to the issue!

I agree with your approach. It looks like Microsoft recommends to loop
on ERROR_PIPE_BUSY: [1] (they say "Calling CallNamedPipe is equivalent
to calling the CreateFile ..." at [2]). So if we aim to not only fix
"pg_ctl stop", but to make pgkill() robust, it's the way to go, IMHO.
I'm not sure about an infinite loop they show, I'd vote for a loop
with the same characteristics as in pgwin32_open_handle().

I've managed to trigger ERROR_PIPE_BUSY with "pg_ctl reload", when
running 20 TAP tests (see attached) in parallel (with 20 threads
executing $node->reload() in each), and with the kill() call inside
do_reload() repeated x100 (see the modification patch attached too):
...
# Running: pg_ctl -D .../099_test_pgkill\data/t_099_test_pgkill_node_data/pgdata reload
### Reloading node "node"
# Running: pg_ctl -D .../099_test_pgkill\data/t_099_test_pgkill_node_data/pgdata reload
[13:41:46.850](2.400s) # 18
server signaled
server signaled
server signaled
server signaled
server signaled
server signaled
server signaled
server signaled
!!!pgkill| GetLastError(): 231
pg_ctl: could not send reload signal (PID: 3912), iteration: 81, res: -1, errno: 22: Invalid argument
server signaled
[13:41:52.594](5.744s) # 19
...

[1] https://learn.microsoft.com/en-us/windows/win32/ipc/named-pipe-client
[2] https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-callnamedpipea

Best regards,
Alexander

diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index e7e878c22f..cab6eb9472 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1167,12 +1174,17 @@ do_reload(void)
 		exit(1);
 	}
 
-	if (kill(pid, sig) != 0)
+	for (int i = 1; i <= 100; i++)
+	{
+		int		res, en;
+
+	if ((res = kill(pid, sig)) != 0)
 	{
-		write_stderr(_("%s: could not send reload signal (PID: %d): %m\n"),
-					 progname, (int) pid);
+		en = errno;
+		write_stderr(_("%s: could not send reload signal (PID: %d), iteration: %d, res: %d, errno: %d: %m\n"),
+					 progname, (int) pid, i, res, en);
 		exit(1);
 	}
+	}
 
 	print_msg(_("server signaled\n"));
 }
diff --git a/src/port/kill.c b/src/port/kill.c
index 412c2f19c1..b3d2c00796 100644
--- a/src/port/kill.c
+++ b/src/port/kill.c
@@ -70,6 +70,7 @@ pgkill(int pid, int sig)
 		return 0;
 	}
 
+	int			le = GetLastError();
 	switch (GetLastError())
 	{
 		case ERROR_BROKEN_PIPE:
@@ -89,6 +90,7 @@ pgkill(int pid, int sig)
 			errno = EPERM;
 			return -1;
 		default:
+			fprintf(stderr, "!!!pgkill| GetLastError(): %d\n", le);
 			errno = EINVAL;		/* unexpected */
 			return -1;
 	}

099_test_pgkill.pl
Description: Perl program
Re: [PoC] Add CANONICAL option to xmlserialize
Hi Oliver

On 08.09.24 15:56, Oliver Ford wrote:
> Whoops, yes all tests and docs pass!

Thanks for the review!

Best, Jim
Re: Yet another way for pg_ctl stop to fail on Windows
On Sun, Sep 08, 2024 at 06:00:00PM +0300, Alexander Lakhin wrote:
> 07.09.2024 21:11, Noah Misch wrote:
>>> Noah, what do you think of handling this error in line with handling of
>>> ERROR_BROKEN_PIPE and ERROR_BAD_PIPE (which was done in 0ea1f2a3a)?
>>>
>>> I tried the following change:
>>>     switch (GetLastError())
>>>     {
>>>         case ERROR_BROKEN_PIPE:
>>>         case ERROR_BAD_PIPE:
>>> +       case ERROR_PIPE_BUSY:
>>> and saw no issues.
>>
>> That would be a strict improvement over returning EINVAL like we do
>> today. We do use PIPE_UNLIMITED_INSTANCES, so I expect the causes of
>> ERROR_PIPE_BUSY are process exit and ENOMEM-like situations. While that
>> change is the best thing if the process is exiting, it could silently
>> drop the signal in ENOMEM-like situations. Consider the following
>> alternative. If sig==0, just return 0 like you propose, because the
>> process isn't completely gone. Otherwise, sleep and retry the signal,
>> like pgwin32_open_handle() retries after certain errors. What do you
>> think of that?
>
> I agree with your approach. It looks like Microsoft recommends to loop
> on ERROR_PIPE_BUSY: [1] (they say "Calling CallNamedPipe is equivalent
> to calling the CreateFile ..." at [2]).

I see Microsoft suggests WaitNamedPipeA() as opposed to just polling.
WaitNamedPipeA() should be more responsive. Given how rare this has
been, it likely doesn't matter whether we use WaitNamedPipeA() or
polling. I'd lean toward whichever makes the code simpler, probably
polling.

> So if we aim to not only fix "pg_ctl stop", but to make pgkill() robust,
> it's the way to go, IMHO. I'm not sure about an infinite loop they show,
> I'd vote for a loop with the same characteristics as in
> pgwin32_open_handle().

I agree with bounding the total time of each kill(), like
pgwin32_open_handle() does for open().

> [1] https://learn.microsoft.com/en-us/windows/win32/ipc/named-pipe-client
> [2] https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-callnamedpipea
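
(To make the agreed direction concrete, a hedged sketch of such a bounded
retry. The helper pgkill_send_signal() is an invented stand-in for
pgkill()'s existing CallNamedPipe()-based send, and the 30 x 100 ms budget
merely imitates pgwin32_open_handle(); none of this is a proposed patch.)

#include <windows.h>
#include <errno.h>

/* hypothetical: wraps the existing CallNamedPipe() signal send */
extern bool pgkill_send_signal(int pid, int sig);

static int
pgkill_with_retry(int pid, int sig)
{
	for (int retries = 0;; retries++)
	{
		if (pgkill_send_signal(pid, sig))
			return 0;

		if (GetLastError() != ERROR_PIPE_BUSY)
			break;				/* fall through to ordinary error mapping */

		if (sig == 0)
			return 0;			/* pipe exists, so the process does too */

		if (retries >= 30)
		{
			errno = EINVAL;
			return -1;			/* give up after roughly 3 seconds */
		}
		Sleep(100);				/* then poll again */
	}

	errno = EINVAL;				/* placeholder for the existing mapping */
	return -1;
}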
libpq: Process buffered SSL read bytes to support records >8kB on async API
Dear hackers,

I'm the maintainer of ruby-pg, the Ruby interface to the PostgreSQL
database. This binding uses the asynchronous API of libpq by default to
facilitate the Ruby IO wait and scheduling mechanisms. This works well
with the vanilla PostgreSQL server, but it leads to starvation with
other types of servers using the PostgreSQL wire protocol 3. This is
because the current functioning of the libpq async interface depends on
a maximum size of SSL records of 8kB.

The following servers were reported to starve with ruby-pg:

* AWS RDS Aurora Serverless [1]
* YugabyteDB [2]
* CockroachDB [3]

They block infinitely on certain message sizes sent from the backend to
the libpq frontend. It is best described in [4]. A repro docker
composition is provided by YugabyteDB at [2].

To fix this issue the attached patch calls pqReadData() repeatedly in
PQconsumeInput() until there is no buffered SSL data left to be read.
Another solution could be to process buffered SSL read bytes in
PQisBusy() instead of PQconsumeInput().

The synchronous libpq API isn't affected, since it supports arbitrary
SSL record sizes already. That's why I think that the asynchronous API
should also support bigger SSL record sizes.

Regards, Lars

[1] https://github.com/ged/ruby-pg/issues/325
[2] https://github.com/ged/ruby-pg/issues/588
[3] https://github.com/ged/ruby-pg/issues/583
[4] https://github.com/ged/ruby-pg/issues/325#issuecomment-737561270

From ab793829a4ce473f1cc2bbc0e2a6f6753553255d Mon Sep 17 00:00:00 2001
From: Lars Kanis
Date: Sun, 8 Sep 2024 13:59:05 +0200
Subject: [PATCH] libpq: Process buffered SSL read bytes to support records
 >8kB on async API

The async API of libpq doesn't support SSL record sizes >8kB so far.
This size isn't exceeded by vanilla PostgreSQL, but by other products
using the postgres wire protocol 3. PQconsumeInput() reads all data
readable from the socket, so that the read condition is cleared. But it
doesn't process all the data that is pending on the SSL layer. Also a
subsequent call to PQisBusy() doesn't process it, so that the client is
triggered to wait for more readable data on the socket. But this never
arrives, so that the connection blocks infinitely.

To fix this issue call pqReadData() repeatedly until there is no
buffered SSL data left to be read.

The synchronous libpq API isn't affected, since it supports arbitrary
SSL record sizes already.
---
 src/interfaces/libpq/fe-exec.c | 13
 1 file changed, 13 insertions(+)

diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 0d224a852..637894ee1 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -2006,6 +2006,19 @@ PQconsumeInput(PGconn *conn)
 	if (pqReadData(conn) < 0)
 		return 0;
 
+#ifdef USE_SSL
+	/*
+	 * Ensure all buffered read bytes in the SSL library are processed,
+	 * which might be not the case, if the SSL record size exceeds 8k.
+	 * Otherwise parseInput can't process the data.
+	 */
+	while (conn->ssl_in_use && pgtls_read_pending(conn))
+	{
+		if (pqReadData(conn) < 0)
+			return 0;
+	}
+#endif
+
 	/* Parsing of the data waits till later. */
 	return 1;
 }
-- 
2.43.0
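
(For context, a minimal sketch of the standard nonblocking read loop an
async libpq client runs; all libpq calls below are real API, while the
harness around them is invented for illustration. If PQconsumeInput()
leaves decrypted-but-unparsed bytes inside the SSL library, PQisBusy()
stays true while select() never fires again, which is the starvation
described above.)

#include <stdio.h>
#include <sys/select.h>
#include <libpq-fe.h>

static void
drain_results(PGconn *conn)
{
	while (PQisBusy(conn))
	{
		int			sock = PQsocket(conn);
		fd_set		input_mask;

		FD_ZERO(&input_mask);
		FD_SET(sock, &input_mask);

		/* wait for the socket to become readable */
		if (select(sock + 1, &input_mask, NULL, NULL, NULL) < 0)
			return;

		if (!PQconsumeInput(conn))	/* read what the kernel has buffered */
			return;
	}

	for (PGresult *res = PQgetResult(conn); res != NULL;
		 res = PQgetResult(conn))
	{
		/* process each result */
		printf("status: %s\n", PQresStatus(PQresultStatus(res)));
		PQclear(res);
	}
}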
Re: CI, macports, darwin version problems
On Sat, Aug 3, 2024 at 12:07 AM Joe Conway wrote:
> I tried making this run like a service using launchctl, but that was
> giving the permissions errors. I finally gave up trying to figure it
> out and just accepted that I need to manually start the script
> whenever I reboot the mac.

It seems to be unhappy recently:

Persistent worker failed to start the task: tart isolation failed:
failed to create VM cloned from "ghcr.io/cirruslabs/macos-runner:sonoma":
tart command returned non-zero exit code: "tart/VMStorageOCI.swift:5:
Fatal error: 'try!' expression unexpectedly raised an error: Error
Domain=NSCocoaErrorDomain Code=512 \"The file “ci-run” couldn’t be
saved in the folder “Users”.\" UserInfo={NSFilePath=/Users/ci-run,
NSUnderlyingError=0x619f0720 {Error Domain=NSPOSIXErrorDomain Code=20
\"Not a directory\"}}"
Re: Sort functions with specialized comparators
On Mon, 9 Sept 2024 at 01:00, Stepan Neretin wrote:
> Hi, why do you think that I rejected Andrey's offer? I included his
> patch first in my own. Yes, patch 2-0006 is the only patch to which I
> have not attached any statistics and it looks really dubious. But the
> rest seem useful. Above, I attached a speed graph for one of the
> patches and tps (pgbench)

The difference with your patches and Andrey's patch is that he included
a benchmark which is targeted to the code he changed and his results
show a speed-up. What it appears that you've done is made an assortment
of changes and picked the least effort thing that tests performance of
something. You by chance saw a performance increase so assumed it was
due to your changes.

> What do you think is the format in which to make benchmarks for my patches?

You'll need a benchmark that exercises the code you've changed to some
degree where it has a positive impact on performance. As far as I can
see, you've not done that yet.

Just to give you the benefit of the doubt, I applied all 10 v2 patches
and adjusted the call sites to add a NOTICE to include the size of the
array being sorted. Here is the result of running your benchmark:

$ pgbench -t1000 -d postgres
pgbench (18devel)
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 3
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 2
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 2
NOTICE:  RelationGetStatExtList 0
starting vacuum...NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetIndexList 0
end.
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
NOTICE:  RelationGetIndexList 1
NOTICE:  RelationGetStatExtList 0
transaction type:
scaling factor: 1
query mode: simple
number of clients: 1
number of threads: 1
maximum number of tries: 1
number of transactions per client: 1000
number of transactions actually processed: 1000/1000
number of failed transactions: 0 (0.000%)
latency average = 0.915 ms
initial connection time = 23.443 ms
tps = 1092.326732 (without initial connection time)

Note that -t1000 shows the same number of notices as -t1. So, it seems
everything you've changed that runs in your benchmark is
RelationGetIndexList() and RelationGetStatExtList(). In one of the
calls to RelationGetIndexList() we're sorting up to a maximum of 3
elements.

Just to be clear, I'm not stating that I think all of your changes are
useless. If you want these patches accepted, then you're going to need
to prove they're useful and you've not done that. Also, unless Andrey
is happy for you to tag onto the work he's doing, I'd suggest another
thread for that work. I don't think there's any good reason for that
work to delay Andrey's work.

David
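
(The instrumentation described above is presumably a one-line elog at each
adjusted call site, something like the following guess; the exact placement
isn't shown in the thread. list_length(), elog(), list_sort(), and
list_oid_cmp are real PostgreSQL APIs.)

/* Hypothetical debug aid, not part of any posted patch: report how many
 * OIDs RelationGetIndexList() is about to sort. */
elog(NOTICE, "RelationGetIndexList %d", list_length(result));
list_sort(result, list_oid_cmp);	/* existing sort of the index OID list */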
Test improvements and minor code fixes for formatting.c.
Over in the thread about teaching to_number() to handle Roman numerals
[1], it was noted that we currently have precisely zero test coverage
for to_char()'s existing Roman-numeral code, except in the timestamp
code path, which shares nothing with the numeric code path.

In looking at this, I found that there's also no test coverage for the
EEEE, V, or PL format codes. Also, the possibility of overflow while
converting an input value to int in order to pass it to int_to_roman
was ignored. Attached is a patch that adds more test coverage and
cleans up the Roman-numeral code a little bit.

BTW, I also discovered that there is a little bit of support for a "B"
format code: we can parse it, but then we ignore it. And it's not
documented. Oracle defines this code as a flag that:

	Returns blanks for the integer part of a fixed-point number when the
	integer part is zero (regardless of zeros in the format model).

It doesn't seem super hard to implement that, but given the complete
lack of complaints about it being missing, maybe we should just rip out
the incomplete support instead?

			regards, tom lane

[1] https://www.postgresql.org/message-id/flat/CAMWA6ybh4M1VQqpmnu2tfSwO%2B3gAPeA8YKnMHVADeB%3DXDEvT_A%40mail.gmail.com

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 461fc3f437..e2d45989d8 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8739,6 +8739,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         n is the number of digits following
         V. V with
         to_number divides in a similar manner.
+        The V can be thought of as marking the position
+        of an implicit decimal point in the input or output string.
         to_char and to_number
         do not support the use of V combined with a
         decimal point
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68fa89418f..116d79ae11 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -5189,6 +5189,11 @@ NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree)
 }
 
 
+/*
+ * Convert integer to Roman numerals
+ * Result is upper-case and not blank-padded (NUM_processor converts as needed)
+ * If input is out-of-range, produce '###'
+ */
 static char *
 int_to_roman(int number)
 {
@@ -5201,32 +5206,42 @@ int_to_roman(int number)
 	result = (char *) palloc(16);
 	*result = '\0';
 
+	/*
+	 * This range limit is the same as in Oracle(TM). The difficulty with
+	 * handling 4000 or more is that we'd need to use more than 3 "M"'s, and
+	 * more than 3 of the same digit isn't considered a valid Roman string.
+	 */
 	if (number > 3999 || number < 1)
 	{
 		fill_str(result, '#', 15);
 		return result;
 	}
+
+	/* Convert to decimal, then examine each digit */
 	len = snprintf(numstr, sizeof(numstr), "%d", number);
+	Assert(len > 0 && len <= 4);
 
 	for (p = numstr; *p != '\0'; p++, --len)
 	{
 		num = *p - ('0' + 1);
 		if (num < 0)
-			continue;
-
-		if (len > 3)
+			continue;			/* ignore zeroes */
+		/* switch on current column position */
+		switch (len)
 		{
-			while (num-- != -1)
-				strcat(result, "M");
-		}
-		else
-		{
-			if (len == 3)
+			case 4:
+				while (num-- >= 0)
+					strcat(result, "M");
+				break;
+			case 3:
 				strcat(result, rm100[num]);
-			else if (len == 2)
+				break;
+			case 2:
 				strcat(result, rm10[num]);
-			else if (len == 1)
+				break;
+			case 1:
 				strcat(result, rm1[num]);
+				break;
 		}
 	}
 	return result;
@@ -6367,7 +6382,6 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	char	   *numstr,
 			   *orgnum,
 			   *p;
-	Numeric		x;
 
 	NUM_TOCHAR_prepare;
 
@@ -6376,12 +6390,15 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		x = DatumGetNumeric(DirectFunctionCall2(numeric_round,
-												NumericGetDatum(value),
-												Int32GetDatum(0)));
-		numstr =
-			int_to_roman(DatumGetInt32(DirectFunctionCall1(numeric_int4,
-														   NumericGetDatum(x))));
+		int32		intvalue;
+		bool		err;
+
+		/* Round and convert to int */
+		intvalue = numeric_int4_opt_error(value, &err);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (err)
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
 	}
 	else if (IS_EEEE(&Num))
 	{
@@ -6421,6 +6438,7 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	{
 		int			numstr_pre_len;
 		Numeric		val = value;
+		Numeric		x;
 
 		if (IS_MULTI(&Num))
 		{
@@ -6589,12 +6607,18 @@ int8_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	/*
-	 * On DateType depend part (int32)
+	 * On DateType depend part (int64)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		/* Currently don't support int8 conversion to roman... */
-		numstr = int_to_roman(DatumGetInt32(DirectFunctionCall1(int84, Int64GetDatum(value))));
+		int32		intvalue;
+
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (value <= PG_INT32_MAX && value >= PG_INT32_MIN)
Re: [PATCH] Add roman support for to_number function
I wrote:
> * Further to Aleksander's point about lack of test coverage for
> the to_char direction, I see from
> https://coverage.postgresql.org/src/backend/utils/adt/formatting.c.gcov.html
> that almost none of the existing roman-number code paths are covered
> today. While it's not strictly within the charter of this patch
> to improve that, maybe we should try to do so --- at the very
> least it'd raise formatting.c's coverage score a few notches.

For the archives' sake: I created a patch and a separate discussion
thread [1] for that.

			regards, tom lane

[1] https://www.postgresql.org/message-id/flat/2956175.1725831...@sss.pgh.pa.us
Re: Test improvements and minor code fixes for formatting.c.
I wrote:
> [ v1-formatting-test-improvements.patch ]

Meh ... cfbot points out I did the float-to-int conversions wrong.
v2 attached.

			regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 461fc3f437..e2d45989d8 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8739,6 +8739,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         n is the number of digits following
         V. V with
         to_number divides in a similar manner.
+        The V can be thought of as marking the position
+        of an implicit decimal point in the input or output string.
         to_char and to_number
         do not support the use of V combined with a
         decimal point
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68fa89418f..85a7dd4561 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -5189,6 +5189,11 @@ NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree)
 }
 
 
+/*
+ * Convert integer to Roman numerals
+ * Result is upper-case and not blank-padded (NUM_processor converts as needed)
+ * If input is out-of-range, produce '###'
+ */
 static char *
 int_to_roman(int number)
 {
@@ -5201,32 +5206,42 @@ int_to_roman(int number)
 	result = (char *) palloc(16);
 	*result = '\0';
 
+	/*
+	 * This range limit is the same as in Oracle(TM). The difficulty with
+	 * handling 4000 or more is that we'd need to use more than 3 "M"'s, and
+	 * more than 3 of the same digit isn't considered a valid Roman string.
+	 */
 	if (number > 3999 || number < 1)
 	{
 		fill_str(result, '#', 15);
 		return result;
 	}
+
+	/* Convert to decimal, then examine each digit */
 	len = snprintf(numstr, sizeof(numstr), "%d", number);
+	Assert(len > 0 && len <= 4);
 
 	for (p = numstr; *p != '\0'; p++, --len)
 	{
 		num = *p - ('0' + 1);
 		if (num < 0)
-			continue;
-
-		if (len > 3)
+			continue;			/* ignore zeroes */
+		/* switch on current column position */
+		switch (len)
 		{
-			while (num-- != -1)
-				strcat(result, "M");
-		}
-		else
-		{
-			if (len == 3)
+			case 4:
+				while (num-- >= 0)
+					strcat(result, "M");
+				break;
+			case 3:
 				strcat(result, rm100[num]);
-			else if (len == 2)
+				break;
+			case 2:
 				strcat(result, rm10[num]);
-			else if (len == 1)
+				break;
+			case 1:
 				strcat(result, rm1[num]);
+				break;
 		}
 	}
 	return result;
@@ -6367,7 +6382,6 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	char	   *numstr,
 			   *orgnum,
 			   *p;
-	Numeric		x;
 
 	NUM_TOCHAR_prepare;
 
@@ -6376,12 +6390,15 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		x = DatumGetNumeric(DirectFunctionCall2(numeric_round,
-												NumericGetDatum(value),
-												Int32GetDatum(0)));
-		numstr =
-			int_to_roman(DatumGetInt32(DirectFunctionCall1(numeric_int4,
-														   NumericGetDatum(x))));
+		int32		intvalue;
+		bool		err;
+
+		/* Round and convert to int */
+		intvalue = numeric_int4_opt_error(value, &err);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (err)
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
 	}
 	else if (IS_EEEE(&Num))
 	{
@@ -6421,6 +6438,7 @@ numeric_to_char(PG_FUNCTION_ARGS)
 	{
 		int			numstr_pre_len;
 		Numeric		val = value;
+		Numeric		x;
 
 		if (IS_MULTI(&Num))
 		{
@@ -6589,12 +6607,18 @@ int8_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	/*
-	 * On DateType depend part (int32)
+	 * On DateType depend part (int64)
 	 */
 	if (IS_ROMAN(&Num))
 	{
-		/* Currently don't support int8 conversion to roman... */
-		numstr = int_to_roman(DatumGetInt32(DirectFunctionCall1(int84, Int64GetDatum(value))));
+		int32		intvalue;
+
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (value <= PG_INT32_MAX && value >= PG_INT32_MIN)
+			intvalue = (int32) value;
+		else
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
 	}
 	else if (IS_EEEE(&Num))
 	{
@@ -6695,7 +6719,18 @@ float4_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	if (IS_ROMAN(&Num))
-		numstr = int_to_roman((int) rint(value));
+	{
+		int32		intvalue;
+
+		/* See notes in ftoi4() */
+		value = rint(value);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (!isnan(value) && FLOAT4_FITS_IN_INT32(value))
+			intvalue = (int32) value;
+		else
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalue);
+	}
 	else if (IS_EEEE(&Num))
 	{
 		if (isnan(value) || isinf(value))
@@ -6797,7 +6832,18 @@ float8_to_char(PG_FUNCTION_ARGS)
 	NUM_TOCHAR_prepare;
 
 	if (IS_ROMAN(&Num))
-		numstr = int_to_roman((int) rint(value));
+	{
+		int32		intvalue;
+
+		/* See notes in dtoi4() */
+		value = rint(value);
+		/* On overflow, just use PG_INT32_MAX; int_to_roman will cope */
+		if (!isnan(value) && FLOAT8_FITS_IN_INT32(value))
+			intvalue = (int32) value;
+		else
+			intvalue = PG_INT32_MAX;
+		numstr = int_to_roman(intvalu
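
(As a sanity check on the rewritten digit-column logic, here is a hedged
standalone re-creation of it; the digit tables and test harness are written
from scratch for illustration, and this is not the formatting.c code, which
also handles palloc and NUM_processor concerns.)

#include <assert.h>
#include <stdio.h>
#include <string.h>

static const char *const rm1[]   = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"};
static const char *const rm10[]  = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC"};
static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM"};

/* Standalone analogue of the patched int_to_roman(): walk the decimal
 * digits left to right and emit one Roman "column" per digit. */
static void
int_to_roman(int number, char *result)
{
	char		numstr[12];
	int			len;

	result[0] = '\0';
	if (number > 3999 || number < 1)
	{
		strcpy(result, "###############");	/* out of range, as in the patch */
		return;
	}

	len = snprintf(numstr, sizeof(numstr), "%d", number);
	assert(len > 0 && len <= 4);

	for (char *p = numstr; *p != '\0'; p++, --len)
	{
		int			num = *p - ('0' + 1);

		if (num < 0)
			continue;			/* ignore zeroes */
		switch (len)			/* current column position */
		{
			case 4:
				while (num-- >= 0)
					strcat(result, "M");
				break;
			case 3:
				strcat(result, rm100[num]);
				break;
			case 2:
				strcat(result, rm10[num]);
				break;
			case 1:
				strcat(result, rm1[num]);
				break;
		}
	}
}

int
main(void)
{
	char		buf[32];

	int_to_roman(1994, buf);	/* expect MCMXCIV */
	printf("1994 -> %s\n", buf);
	int_to_roman(3999, buf);	/* expect MMMCMXCIX, the upper bound */
	printf("3999 -> %s\n", buf);
	int_to_roman(4000, buf);	/* out of range: expect all '#' */
	printf("4000 -> %s\n", buf);
	return 0;
}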
Re: PATCH: Issue with set_indexsafe_procflags in ReindexRelationConcurrently
On Fri, Sep 06, 2024 at 01:27:12PM +0200, Michail Nikolaev wrote:
> While working on [1], I have found a small issue with correctness of
> set_indexsafe_procflags usage in ReindexRelationConcurrently introduced
> in [2].
>
> > idx->safe = (indexRel->rd_indexprs == NIL && indexRel->rd_indpred == NIL);
>
> It is always true because there are no RelationGetIndexExpressions
> and RelationGetIndexPredicate before that check.
>
> Two patches with reproducer + fix are attached.
>
> The issue is simple, but I'll register this in commitfest just in case.

Ugh. It means that we've always considered as "safe" concurrent
rebuilds of indexes that have expressions or predicates, but they're
not safe at all. Other concurrent jobs should wait for them. Adding
these two calls as you are suggesting is probably a good idea anyway to
force a correct setup of the flag. Will see about that.

I don't understand why an isolation test is required here if we were to
add a validity test, because you can cause a failure in the REINDEX
with a set of SQLs in a single session. I'm OK to add a test, perhaps
with a NOTICE set when the safe flag is true. Anyway, what you have is
more than enough to prove your point. Thanks for that.
--
Michael
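
(For readers not opening the attachments, the suggested fix is presumably
of this shape; a sketch under that assumption, not the actual attached
patch. rd_indexprs and rd_indpred are lazily-built relcache caches, so
they must be forced to populate before being tested.)

/* Sketch of the fix inside ReindexRelationConcurrently(); surrounding
 * code abbreviated.  RelationGetIndexExpressions() and
 * RelationGetIndexPredicate() fill the rd_indexprs/rd_indpred caches as
 * a side effect, making the NIL tests below meaningful. */
(void) RelationGetIndexExpressions(indexRel);
(void) RelationGetIndexPredicate(indexRel);
idx->safe = (indexRel->rd_indexprs == NIL &&
			 indexRel->rd_indpred == NIL);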
Re: Jargon and acronyms on this mailing list
On Sat, 31 Aug 2024 at 04:02, Greg Sabino Mullane wrote:
> I normally wouldn't mention my blog entries here, but this one was
> about the hackers mailing list, so wanted to let people know about it
> in case you don't follow Planet Postgres. I scanned the last year's
> worth of posts and gathered the most used acronyms and jargon. The
> most commonly used acronym was IMO (in my opinion), followed by FWIW
> (for what it's worth), and IIUC (if I understand correctly). The
> complete list can be found in the post below, I'll refrain from
> copying everything here.
>
> https://www.crunchydata.com/blog/understanding-the-postgres-hackers-mailing-list

I think this is useful. Thanks for posting.

I think HEAD is commonly misused to mean master instead of the latest
commit of the current branch. I see the buildfarm even does that.
Thanks for getting that right in your blog post.

David
Re: Jargon and acronyms on this mailing list
> I normally wouldn't mention my blog entries here, but this one was
> about the hackers mailing list, so wanted to let people know about it
> in case you don't follow Planet Postgres. I scanned the last year's
> worth of posts and gathered the most used acronyms and jargon. The
> most commonly used acronym was IMO (in my opinion), followed by FWIW
> (for what it's worth), and IIUC (if I understand correctly). The
> complete list can be found in the post below, I'll refrain from
> copying everything here.
>
> https://www.crunchydata.com/blog/understanding-the-postgres-hackers-mailing-list

Thank you for the excellent article. I think it is very useful for
non-native English speakers like me.

Best regards,
--
Tatsuo Ishii
SRA OSS K.K.
English: http://www.sraoss.co.jp/index_en/
Japanese: http://www.sraoss.co.jp
Re: ANALYZE ONLY
On Sun, 1 Sept 2024 at 13:32, Michael Harris wrote:
> v3 of the patch is attached, and I will submit it to the commitfest.

I think this patch is pretty good. I only have a few things I'd point out:

1. The change to ddl.sgml: technically you're removing the caveat, but
I think the choice you've made to mention the updated behaviour is
likely a good idea still. So I agree with this change, but just wanted
to mention it as someone else might think otherwise.

2. I think there's some mixing of tense in the changes to analyze.sgml:

"If ONLY was not specified, it will also recurse into each partition
and update its statistics."

You've written "was" (past tense), but then the existing text uses
"will" (future tense). I guess if the point in time is after parse and
before work has been done, then that's correct, but I think using "is"
instead of "was" is better.

3. In vacuum.sgml:

"If ONLY is not specified, the table and all its descendant tables or
partitions (if any) are vacuumed"

Maybe "are also vacuumed" instead of "are vacuumed" is more clear? It's
certainly true for inheritance parents, but I guess you could argue
that's wrong for partitioned tables.

4. A very minor detail, but I think in vacuum.c the WARNING you've
added should use RelationGetRelationName(). We seem to be very
inconsistent with using that macro and I see it's not used just above
for the lock warning, which I imagine you copied.

Aside from those, that just leaves me with the behavioural change. I
noted Tom was ok with the change in behaviour for ANALYZE (mentioned in
[1]). Tom, wondering if you feel the same for VACUUM too? If we're
doing this, I think we'd need to be quite clear about it in the release
notes.

David

[1] https://postgr.es/m/1919201.1724289...@sss.pgh.pa.us
Re: not null constraints, again
On Sat, Aug 31, 2024 at 11:59 AM Alvaro Herrera wrote:
>
> Hello
>
> Here I present another attempt at making not-null constraints be
> catalogued. This is largely based on the code reverted at 9ce04b50e120,
> except that we now have a not-null constraint automatically created for
> every column of a primary key, and such constraint cannot be removed
> while the PK exists. Thanks to this, a lot of rather ugly code is gone,
> both in pg_dump and in backend -- in particular the handling of NO
> INHERIT, which was needed for pg_dump.
>
> Noteworthy psql difference: because there are now even more not-null
> constraints than before, the \d+ display would be far too noisy if we
> just let it grow. So here I've made it omit any constraints that
> underlie the primary key. This should be OK since you can't do much
> with those constraints while the PK is still there. If you drop the PK,
> the next \d+ will show those constraints.

hi. my brief review.

create table t1(a int, b int, c int not null, primary key(a, b));
\d+ t1
ERROR:  operator is not unique: smallint[] <@ smallint[]
LINE 8: coalesce(NOT ARRAY[at.attnum] <@ (SELECT conkey FROM pg_cata...
                                      ^
HINT:  Could not choose a best candidate operator. You might need to
add explicit type casts.

the regression test still passed, i have no idea why. anyway, the
following changes make the above ERROR disappear. also seems more lean.

printfPQExpBuffer(&buf,
/* FIXME the coalesce trick looks silly. What's a better way? */
"SELECT co.conname, at.attname, co.connoinherit, co.conislocal,\n"
"co.coninhcount <> 0\n"
"FROM pg_catalog.pg_constraint co JOIN\n"
"pg_catalog.pg_attribute at ON\n"
"(at.attrelid = co.conrelid AND at.attnum = co.conkey[1])\n"
"WHERE co.contype = 'n' AND\n"
"co.conrelid = '%s'::pg_catalog.regclass AND\n"
"coalesce(NOT ARRAY[at.attnum] <@ (SELECT conkey FROM pg_catalog.pg_constraint\n"
" WHERE contype = 'p' AND conrelid = '%s'::regclass), true)\n"
"ORDER BY at.attnum",
oid, oid);

change to

printfPQExpBuffer(&buf,
"SELECT co.conname, at.attname, co.connoinherit, co.conislocal,\n"
"co.coninhcount <> 0\n"
"FROM pg_catalog.pg_constraint co JOIN\n"
"pg_catalog.pg_attribute at ON\n"
"(at.attrelid = co.conrelid AND at.attnum = co.conkey[1])\n"
"WHERE co.contype = 'n' AND\n"
"co.conrelid = '%s'::pg_catalog.regclass AND\n"
"NOT EXISTS (SELECT 1 FROM pg_catalog.pg_constraint co1 where co1.contype = 'p'\n"
"AND at.attnum = any(co1.conkey) AND co1.conrelid = '%s'::pg_catalog.regclass)\n"
"ORDER BY at.attnum",
oid, oid);

steal idea from https://stackoverflow.com/a/75614278/15603477

create type comptype as (r float8, i float8);
create domain dcomptype1 as comptype not null no inherit;
with cte as (
    SELECT oid, conrelid::regclass, conname
    FROM pg_catalog.pg_constraint
    where contypid in ('dcomptype1'::regtype))
select pg_get_constraintdef(oid) from cte;

current output is NOT NULL, but it's not the same as:

CREATE TABLE ATACC1 (a int, not null a no inherit);
with cte as (
    SELECT oid, conrelid::regclass, conname
    FROM pg_catalog.pg_constraint
    where conrelid in ('ATACC1'::regclass))
select pg_get_constraintdef(oid) from cte;

NOT NULL a NO INHERIT

i'm not really sure of the meaning of "no inherit" in "create domain
dcomptype1 as comptype not null no inherit;"

bold idea: print out the constraint name: violates not-null constraint
\"%s\" for the following code:

ereport(ERROR,
        (errcode(ERRCODE_NOT_NULL_VIOLATION),
         errmsg("null value in column \"%s\" of relation \"%s\" violates not-null constraint",
                NameStr(att->attname),
                RelationGetRelationName(orig_rel)),
         val_desc ? errdetail("Failing row contains %s.", val_desc) : 0,
         errtablecol(orig_rel, attrChk)));

in extractNotNullColumn we can Assert(colnum > 0);

create table t3(a int, b int, c int, not null a, not null c, not null b, not null tableoid);
this should not be allowed?

foreach(lc, RelationGetNotNullConstraints(RelationGetRelid(relation), false))
{
    AlterTableCmd *atsubcmd;

    atsubcmd = makeNode(AlterTableCmd);
    atsubcmd->subtype = AT_AddConstraint;
    atsubcmd->def = (Node *) lfirst_node(Constraint, lc);
    atsubcmds = lappend(atsubcmds, atsubcmd);
}

forgive me for being hypocritical. I
Re: CI, macports, darwin version problems
On 9/8/24 16:55, Thomas Munro wrote:
> On Sat, Aug 3, 2024 at 12:07 AM Joe Conway wrote:
>> I tried making this run like a service using launchctl, but that was
>> giving the permissions errors. I finally gave up trying to figure it
>> out and just accepted that I need to manually start the script
>> whenever I reboot the mac.
>
> It seems to be unhappy recently:
>
> Persistent worker failed to start the task: tart isolation failed:
> failed to create VM cloned from "ghcr.io/cirruslabs/macos-runner:sonoma":
> tart command returned non-zero exit code: "tart/VMStorageOCI.swift:5:
> Fatal error: 'try!' expression unexpectedly raised an error: Error
> Domain=NSCocoaErrorDomain Code=512 \"The file “ci-run” couldn’t be
> saved in the folder “Users”.\" UserInfo={NSFilePath=/Users/ci-run,
> NSUnderlyingError=0x619f0720 {Error Domain=NSPOSIXErrorDomain Code=20
> \"Not a directory\"}}"

Seems the mounted drive got unmounted somehow ¯\_(ツ)_/¯

Please check it out and let me know if it is working properly now.

--
Joe Conway
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
Fix possible memory leak in rescanLatestTimeLine()
In rescanLatestTimeLine(), if a new target timeline is found,
expectedTLEs is replaced with newExpectedTLEs, which is newly allocated
by readTimeLineHistory(), and the old expectedTLEs is released using
list_free_deep(). However, if the current timeline is not part of the
history of the new timeline, the function returns without using
newExpectedTLEs, nor releasing it. I wonder if this is a memory leak,
and think it is better to release it, although the effect may not be
much. I've attached the patch.

Regards,
Yugo Nagata

--
Yugo Nagata

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 178491f6f5..ec518d35e8 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -4157,6 +4157,7 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
 				(errmsg("new timeline %u is not a child of database system timeline %u",
 						newtarget, replayTLI)));
+		list_free_deep(newExpectedTLEs);
 		return false;
 	}
 
@@ -4172,6 +4173,7 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
 						newtarget, replayTLI,
 						LSN_FORMAT_ARGS(replayLSN))));
+		list_free_deep(newExpectedTLEs);
 		return false;
 	}
Re: DOCS - pg_replication_slot . Fix the 'inactive_since' description
Saying "The time..." is fine, but the suggestions given seem backwards to me: - The time this slot was inactivated - The time when the slot became inactive. - The time when the slot was deactivated. e.g. It is not like light switch. So, there is no moment when the active slot "became inactive" or "was deactivated". Rather, the 'inactive_since' timestamp field is simply: - The time the slot was last active. - The last time the slot was active. == Kind Regards, Peter Smith. Fujitsu Australia
Re: Add callbacks for fixed-numbered stats flush in pgstats
On Wed, Sep 04, 2024 at 06:37:01PM +0900, Kyotaro Horiguchi wrote:
> I doubt that overprotection is always better, but in this case, it's
> not overprotection at all. The flush callbacks are called
> unconditionally once we decide to flush anything. Sorry for the noise.

Yes, it's intended to be a fastpath, with checks happening if we have
no data pending for any variable-numbered stats.
--
Michael
Re: DOCS - pg_replication_slot . Fix the 'inactive_since' description
On Sun, Sep 8, 2024, 18:55 Peter Smith wrote:
> Saying "The time..." is fine, but the suggestions given seem backwards
> to me:
> - The time this slot was inactivated
> - The time when the slot became inactive.
> - The time when the slot was deactivated.
>
> e.g. It is not like light switch. So, there is no moment when the
> active slot "became inactive" or "was deactivated".

While this is plausible, the existing wording and the name of the field
definitely fail to convey this.

> Rather, the 'inactive_since' timestamp field is simply:
> - The time the slot was last active.
> - The last time the slot was active.

I see your point, but that wording is also quite confusing when an
active slot returns null for this field.

At this point I'm confused enough to need whatever wording is taken to
be supported by someone explaining the code that interacts with this
field. I suppose I'm expecting something like: The time the last
activity finished, or null if an activity is in-progress.

David J.
Re: Add callbacks for fixed-numbered stats flush in pgstats
On Wed, Sep 04, 2024 at 06:27:43AM +, Bertrand Drouvot wrote:
> Agree. The idea was to add an additional parameter (say "check_only") to the
> flush_fixed_cb. If this parameter is set to true then the flush_fixed_cb would
> do nothing (no flush at all) but return a boolean that would indicate if there
> are pending stats. In case it returns false, then we could avoid the clock
> check.

Hmm. It's the same thing in terms of logic, but I am not really convinced that it is a good idea to mix a code path for a sanity check with a code path dealing with the action, particularly considering that there is the nowait option to consider in the flush callback, which would require one to reason about four possible states rather than two. That's more error-prone for extension developers, IMO. In the end, I have used my original suggestions for the callbacks. If there are voices in favor of something different, we still have a good chunk of the development and beta cycle for that.
--
Michael

signature.asc Description: PGP signature
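For illustration, here is a hypothetical sketch of the two callback shapes being weighed; the typedef names are invented for this example and are not the actual pgstats API:

/*
 * Option discussed above: one callback doing double duty via a
 * check_only flag.  Combined with nowait, callback authors must reason
 * about four (check_only x nowait) states.
 */
typedef bool (*flush_fixed_cb_combined) (bool check_only, bool nowait);

/*
 * Alternative kept in the end, per the message above: two
 * single-purpose callbacks.  The pending check stays a cheap,
 * side-effect-free fastpath; the flush callback only flushes.
 */
typedef bool (*have_fixed_pending_cb) (void);	/* any stats to flush? */
typedef bool (*flush_fixed_cb) (bool nowait);	/* perform the flush */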
Re: PostgreSQL 17 release announcement draft
On 9/7/24 6:58 PM, David Rowley wrote: On Sun, 8 Sept 2024 at 06:44, Jonathan S. Katz wrote: I've attached the latest copy. Is "This release expands on functionality both for managing data in partitions" still relevant given the MERGE/SPLIT PARTITION was reverted [1]? AFAICT yes, as identity columns and exclusion constraints can now be used. It looks like there are a few items with the optimizer (which looking at the release notes, you'd be aware of :) but unsure if those should be added. Timing wise, we're rapidly approaching the Sep 9 12:00 UTC cutoff; if there's no additional feedback, once I wake up I'll do one more pass over the text and then freeze it. Thanks, Jonathan OpenPGP_signature.asc Description: OpenPGP digital signature
Re: DOCS - pg_replication_slot . Fix the 'inactive_since' description
On Mon, Sep 9, 2024 at 12:20 PM David G. Johnston wrote: > > > > On Sun, Sep 8, 2024, 18:55 Peter Smith wrote: >> >> Saying "The time..." is fine, but the suggestions given seem backwards to me: >> - The time this slot was inactivated >> - The time when the slot became inactive. >> - The time when the slot was deactivated. >> >> e.g. It is not like light switch. So, there is no moment when the >> active slot "became inactive" or "was deactivated". > > > While this is plausible the existing wording and the name of the field > definitely fail to convey this. > >> >> Rather, the 'inactive_since' timestamp field is simply: >> - The time the slot was last active. >> - The last time the slot was active. > > > I see your point but that wording is also quite confusing when an active slot > returns null for this field. > > At this point I'm confused enough to need whatever wording is taken to be > supported by someone explaining the code that interacts with this field. > Me too. I created this thread primarily to get the description changed to clarify this field represents a moment in time, rather than a duration. So I will be happy with any wording that addresses that. > I suppose I'm expecting something like: The time the last activity finished, > or null if an activity is in-progress. == Kind Regards, Peter Smith. Fujitsu Australia
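For anyone following along, the field in question can be inspected directly; a quick illustrative query (the view and columns are in core, the interpretation is what this thread is debating):

SELECT slot_name, active, inactive_since
FROM pg_replication_slots;

-- inactive_since is NULL while the slot is active, which is why wordings
-- like "the last time the slot was active" read oddly for active slots.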
Re: Remove emode argument from XLogFileRead/XLogFileReadAnyTLI
On Fri, Sep 06, 2024 at 08:10:43PM +0900, Yugo Nagata wrote:
> Since 1bb2558046c, XLogFileRead() doesn't use the emode argument.
> Also, since abf5c5c9a4f, XLogFileReadAnyTLI() is called just once
> and emode is always DEBUG2. So, I think we can remove the emode
> argument from these functions. I've attached the patch.

It's true that the last relevant caller of XLogFileReadAnyTLI() that required an emode is abf5c5c9a4f1; as you say, that's also what I am tracking down. Any objections to that?
--
Michael

signature.asc Description: PGP signature
Re: Jargon and acronyms on this mailing list
David Rowley writes: > I think HEAD is commonly misused to mean master instead of the latest > commit of the current branch. I see the buildfarm even does that. > Thanks for getting that right in your blog post. IIRC, HEAD *was* the technically correct term back when we were using CVS. Old habits die hard. regards, tom lane
Re: Jargon and acronyms on this mailing list
On Sun, Sep 08, 2024 at 11:35:36PM -0400, Tom Lane wrote:
> David Rowley writes:
>> I think HEAD is commonly misused to mean master instead of the latest
>> commit of the current branch. I see the buildfarm even does that.
>> Thanks for getting that right in your blog post.
>
> IIRC, HEAD *was* the technically correct term back when we were
> using CVS. Old habits die hard.

Even if it's a new habit from a new technology. I began getting involved with upstream the year the move from CVS to git happened, and just inherited this habit from everybody else ;)
--
Michael

signature.asc Description: PGP signature
Re: Introduce XID age and inactive timeout based replication slot invalidation
On Thu, Sep 5, 2024 at 9:30 AM Amit Kapila wrote:
>
> On Wed, Sep 4, 2024 at 9:17 AM shveta malik wrote:
> >
> > On Tue, Sep 3, 2024 at 3:01 PM shveta malik wrote:
> > >
> > > 1)
> > > I see that ReplicationSlotAlter() will error out if the slot is
> > > invalidated due to timeout. I have not tested it myself, but do you
> > > know if slot-alter errors out for other invalidation causes as well?
> > > Just wanted to confirm that the behaviour is consistent for all
> > > invalidation causes.
> >
> > I was able to test this and, as anticipated, the behaviour is different.
> > When a slot is invalidated due to, say, 'wal_removed', I am still able
> > to do 'alter' of that slot. Please see:
> >
> > Pub:
> >   slot_name  | failover | synced |          inactive_since          | invalidation_reason
> > -------------+----------+--------+----------------------------------+---------------------
> >  mysubnew1_1 | t        | f      | 2024-09-04 08:58:12.802278+05:30 | wal_removed
> >
> > Sub:
> > newdb1=# alter subscription mysubnew1_1 disable;
> > ALTER SUBSCRIPTION
> >
> > newdb1=# alter subscription mysubnew1_1 set (failover=false);
> > ALTER SUBSCRIPTION
> >
> > Pub: (failover altered)
> >   slot_name  | failover | synced |          inactive_since          | invalidation_reason
> > -------------+----------+--------+----------------------------------+---------------------
> >  mysubnew1_1 | f        | f      | 2024-09-04 08:58:47.824471+05:30 | wal_removed
> >
> > while when the invalidation_reason is 'inactive_timeout', it fails:
> >
> > Pub:
> >   slot_name  | failover | synced |          inactive_since          | invalidation_reason
> > -------------+----------+--------+----------------------------------+---------------------
> >  mysubnew1_1 | t        | f      | 2024-09-03 14:30:57.532206+05:30 | inactive_timeout
> >
> > Sub:
> > newdb1=# alter subscription mysubnew1_1 disable;
> > ALTER SUBSCRIPTION
> >
> > newdb1=# alter subscription mysubnew1_1 set (failover=false);
> > ERROR: could not alter replication slot "mysubnew1_1": ERROR: can no
> > longer get changes from replication slot "mysubnew1_1"
> > DETAIL: The slot became invalid because it was inactive since
> > 2024-09-04 08:54:20.308996+05:30, which is more than 0 seconds ago.
> > HINT: You might need to increase "replication_slot_inactive_timeout.".
> >
> > I think the behaviour should be the same.
>
> We should not allow the invalid replication slot to be altered
> irrespective of the reason unless there is any benefit.

Okay, then I think we need to change the existing behaviour of the other invalidation causes which still allow alter-slot.

thanks
Shveta
Re: On disable_cost
On Fri, Sep 6, 2024 at 5:27 PM Richard Guo wrote:
> On Fri, Sep 6, 2024 at 5:00 PM Alexander Lakhin wrote:
> > static void
> > label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
> > {
> > ...
> > cost_sort(&sort_path, root, NIL,
> >           lefttree->total_cost,
> >           plan->plan.disabled_nodes,
> >           lefttree->plan_rows,
> >           lefttree->plan_width,
> >           0.0,
> >           work_mem,
> >           limit_tuples);
> >
> > Given the cost_sort() declaration:
> > void
> > cost_sort(Path *path, PlannerInfo *root,
> >           List *pathkeys, int input_disabled_nodes,
> >           Cost input_cost, double tuples, int width,
> >           Cost comparison_cost, int sort_mem,
> >           double limit_tuples)
> >
> > Aren't the input_disabled_nodes and input_cost arguments swapped in the
> > above call?

Nice catch! I checked the other callers of cost_sort, and they are all good. Fixed.

Thanks
Richard
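A minimal, self-contained sketch of why this class of mistake slips past the compiler: when adjacent parameters have arithmetic types (here int and Cost, i.e. double), swapped arguments are silently converted rather than rejected. The names below are illustrative, not PostgreSQL's:

#include <stdio.h>

typedef double Cost;

/* Mirrors the shape of cost_sort(): an int parameter followed by a Cost. */
static void
cost_sketch(int input_disabled_nodes, Cost input_cost)
{
	printf("disabled_nodes=%d input_cost=%g\n",
		   input_disabled_nodes, input_cost);
}

int
main(void)
{
	Cost	total_cost = 1234.56;	/* e.g. lefttree->total_cost */
	int		disabled_nodes = 1;		/* e.g. plan->plan.disabled_nodes */

	/*
	 * Swapped call: compiles cleanly under default warning options (only
	 * -Wconversion would flag it), but truncates 1234.56 to 1234 and
	 * promotes 1 to 1.0.
	 */
	cost_sketch(total_cost, disabled_nodes);

	/* Correct argument order. */
	cost_sketch(disabled_nodes, total_cost);
	return 0;
}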
Re: Sort functions with specialized comparators
> On 9 Sep 2024, at 02:31, David Rowley wrote:
>
> Also, unless Andrey is happy for you to tag onto the work he's doing,
> I'd suggest another thread for that work. I don't think there's any
> good reason for that work to delay Andrey's work.

Stepan asked for a mentoring project, so I handed him this patch set. We are working together, but the main goal is integrating Stepan into the dev process. Well, the summer was really hot and we somehow were not advancing the project… So your thread bump is very timely! Many thanks for your input about benchmarks! We will focus on measuring the impact of the changes. I totally share your concerns about optimization of sorts that are not used frequently.

Best regards, Andrey Borodin.
Re: Introduce XID age and inactive timeout based replication slot invalidation
Hi, On Mon, Sep 9, 2024 at 9:17 AM shveta malik wrote: > > > We should not allow the invalid replication slot to be altered > > irrespective of the reason unless there is any benefit. > > Okay, then I think we need to change the existing behaviour of the > other invalidation causes which still allow alter-slot. +1. Perhaps, track it in a separate thread? -- Bharath Rupireddy PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
Re: Introduce XID age and inactive timeout based replication slot invalidation
On Mon, Sep 9, 2024 at 10:26 AM Bharath Rupireddy wrote: > > Hi, > > On Mon, Sep 9, 2024 at 9:17 AM shveta malik wrote: > > > > > We should not allow the invalid replication slot to be altered > > > irrespective of the reason unless there is any benefit. > > > > Okay, then I think we need to change the existing behaviour of the > > other invalidation causes which still allow alter-slot. > > +1. Perhaps, track it in a separate thread? I think so. It does not come under the scope of this thread. thanks Shveta
Re: long-standing data loss bug in initial sync of logical replication
On Mon, 2 Sept 2024 at 10:12, Amit Kapila wrote:
>
> On Fri, Aug 30, 2024 at 3:06 PM Shlok Kyal wrote:
> >
> > Next I am planning to test solely on the logical decoding side and
> > will share the results.
>
> Thanks, the next set of proposed tests makes sense to me. It will also
> be useful to generate some worst-case scenarios where the number of
> invalidations is more to see the distribution cost in such cases. For
> example, Truncate/Drop a table with 100 or 1000 partitions.
>
> --
> With Regards,
> Amit Kapila.

Hi,

I did some performance testing solely on the logical decoding side and found some performance degradation, for the following test case:
1. Created a publisher on a single table, say 'tab_conc1'.
2. Created a second publisher on a single table, say 'tp'.
3. Two sessions are running in parallel, say S1 and S2.
4. Begin a transaction in S1.
5. Now in a loop (this loop runs 'count' times):
   S1: Insert a row in table 'tab_conc1'
   S2: BEGIN; ALTER PUBLICATION DROP/ADD tp; COMMIT
6. COMMIT the transaction in S1.
7. Run 'pg_logical_slot_get_binary_changes' to get the decoding changes.

Observation:
With the fix, a new entry is added in decoding. During debugging I found that this entry only appears when we do an INSERT in session S1 after an ALTER PUBLICATION in the other session in parallel (i.e., due to the invalidation). I also observed that this new entry is related to sending the replica identity, attributes, etc., as the function 'logicalrep_write_rel' is called.

Performance:
We see a performance degradation because we are sending new entries during logical decoding. Results are an average of 5 runs.

count | Head (sec) | Fix (sec) | Degradation (%)
------+------------+-----------+----------------
1     | 1.298      | 1.574     | 21.26348228
5     | 22.892     | 24.997    | 9.195352088
10    | 88.602     | 93.759    | 5.820410374

I have also attached the test script here; a rough SQL sketch of the loop follows below.

Thanks and Regards,
Shlok Kyal

test2.pl Description: Binary data
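A rough, illustrative SQL transcript of the two-session loop above (publication, table, and slot names are assumptions; the real driver is the attached test2.pl):

-- One-time setup:
CREATE PUBLICATION pub1 FOR TABLE tab_conc1;
CREATE PUBLICATION pub2 FOR TABLE tp;
SELECT pg_create_logical_replication_slot('slot1', 'pgoutput');

-- Session S1:
BEGIN;
INSERT INTO tab_conc1 VALUES (1);

-- Session S2, one loop iteration (repeated 'count' times, interleaved
-- with S1's inserts):
BEGIN;
ALTER PUBLICATION pub2 DROP TABLE tp;
COMMIT;
BEGIN;
ALTER PUBLICATION pub2 ADD TABLE tp;
COMMIT;

-- Session S1, after the loop:
COMMIT;

-- Measure the decoding cost:
SELECT count(*)
FROM pg_logical_slot_get_binary_changes('slot1', NULL, NULL,
       'proto_version', '4', 'publication_names', 'pub1');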
Re: long-standing data loss bug in initial sync of logical replication
On Mon, 2 Sept 2024 at 10:12, Amit Kapila wrote:
>
> On Fri, Aug 30, 2024 at 3:06 PM Shlok Kyal wrote:
> >
> > Next I am planning to test solely on the logical decoding side and
> > will share the results.
>
> Thanks, the next set of proposed tests makes sense to me. It will also
> be useful to generate some worst-case scenarios where the number of
> invalidations is more to see the distribution cost in such cases. For
> example, Truncate/Drop a table with 100 or 1000 partitions.
>
> --
> With Regards,
> Amit Kapila.

I also did testing with a partitioned table, to cover the worst-case scenario where the number of invalidations distributed is large. The test case:
1. Created a publisher on a single table, say 'tconc_1'.
2. Created a second publisher on a partitioned table, say 'tp'.
3. Created 'tcount' partitions for the table 'tp'.
4. Two sessions are running in parallel, say S1 and S2.
5. Begin a transaction in S1.
6. S1: Insert a row in table 'tconc_1'
   S2: BEGIN; TRUNCATE TABLE tp; COMMIT;
   (With the patch, this adds 'tcount * 3' invalidation messages to the transaction in session S1.)
   S1: Insert a row in table 'tconc_1'
7. COMMIT the transaction in S1.
8. Run 'pg_logical_slot_get_binary_changes' to get the decoding changes.

Performance:
We see a degradation in performance. Results are an average of 5 runs.

count of partitions | Head (sec) | Fix (sec) | Degradation (%)
--------------------+------------+-----------+----------------
1000                | 0.114      | 0.118     | 3.50877193
5000                | 0.502      | 0.522     | 3.984063745
10000               | 1.012      | 1.024     | 1.185770751

I have also attached the test script here, and will do further testing.

Thanks and Regards,
Shlok Kyal

test3.pl Description: Binary data
Re: Introduce XID age and inactive timeout based replication slot invalidation
On Sun, Sep 8, 2024 at 5:25 PM Bharath Rupireddy wrote:
>
> Please find the v45 patch. Addressed above and Shveta's review comments [1].

Thanks for the patch. Please find my comments:

1) src/sgml/config.sgml:
+ Synced slots are always considered to be inactive because they don't perform logical decoding to produce changes.

It is better to avoid such a statement, as internally we use logical decoding to advance the restart LSN; see 'LogicalSlotAdvanceAndCheckSnapState' called from slotsync.c.

2) src/sgml/config.sgml:
+ disables the inactive timeout invalidation mechanism
+ Slot invalidation due to inactivity timeout occurs during checkpoint.

Use either 'inactive' in both places or 'inactivity'.

3) slot.c:
+static bool InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
+                                           ReplicationSlot *s,
+                                           XLogRecPtr oldestLSN,
+                                           Oid dboid,
+                                           TransactionId snapshotConflictHorizon,
+                                           bool *invalidated);
+static inline bool SlotInactiveTimeoutCheckAllowed(ReplicationSlot *s);

I think we do not need the above two declarations. The code compiles fine without them, as each use comes after its definition.

4)
+ /*
+  * An error is raised if error_if_invalid is true and the slot has been
+  * invalidated previously.
+  */
+ if (error_if_invalid && s->data.invalidated == RS_INVAL_INACTIVE_TIMEOUT)

The comment is generic while the 'if' condition is specific to one invalidation cause. Even though I feel it could be made a generic test for all invalidation causes, that is not in the scope of this thread and needs more testing/analysis. For the time being, we can make the comment specific to the concerned invalidation cause. The header of the function will also need the same change.

5) SlotInactiveTimeoutCheckAllowed():
+ * Check if inactive timeout invalidation mechanism is disabled or slot is
+ * currently being used or server is in recovery mode or slot on standby is
+ * currently being synced from the primary.

These comments say the exact opposite of what we are checking in the code. Since the function name has 'Allowed' in it, we should write comments that say what allows the check instead of what disallows it.

6)
+ * Synced slots are always considered to be inactive because they don't
+ * perform logical decoding to produce changes.
+ */
+static inline bool
+SlotInactiveTimeoutCheckAllowed(ReplicationSlot *s)

Perhaps we should avoid mentioning logical decoding here. When slots are synced, they are performing decoding and their inactive_since is changing continuously. A better way to make this statement would be: We want to ensure that the slots being synchronized are not invalidated, as they need to be preserved for future use when the standby server is promoted to the primary. This is necessary for resuming logical replication from the new primary server.

7) InvalidatePossiblyObsoleteSlot():
We are calling SlotInactiveTimeoutCheckAllowed() twice in this function; we should optimize. At the first call site, we could simply fetch the timestamp when the cause is RS_INVAL_INACTIVE_TIMEOUT, without checking SlotInactiveTimeoutCheckAllowed(), as IMO this is not a performance-critical section. Or, if we retain the check at the first site, then at the second site we can avoid calling it again based on whether 'now' is set or not (see the sketch below).

thanks
Shveta
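To make suggestion 7 concrete, a hedged sketch follows; the identifiers are those quoted from the patch above, while the surrounding control flow is assumed rather than taken from the actual v45 code:

/* First site: fetching the timestamp unconditionally for this cause is
 * cheap enough here; skip the SlotInactiveTimeoutCheckAllowed() call. */
if (cause == RS_INVAL_INACTIVE_TIMEOUT)
	now = GetCurrentTimestamp();

/* ... */

/* Second site: test 'now' instead of calling
 * SlotInactiveTimeoutCheckAllowed() a second time. */
if (now > 0 &&
	TimestampDifferenceExceeds(s->inactive_since, now,
							   replication_slot_inactive_timeout * 1000))
	invalidation_cause = RS_INVAL_INACTIVE_TIMEOUT;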
Re: race condition in pg_class
On Sat, Sep 7, 2024 at 12:25 AM Noah Misch wrote: > > On Fri, Sep 06, 2024 at 03:22:48PM +0530, Nitin Motiani wrote: > > Thanks. I have no other comments. > > https://commitfest.postgresql.org/49/5090/ remains in status="Needs review". > When someone moves it to status="Ready for Committer", I will commit > inplace090, inplace110, and inplace120 patches. If one of you is comfortable > with that, please modify the status. Done.
Re: Virtual generated columns
On 04.09.24 12:33, Dean Rasheed wrote:
>> I left the 0001 patch alone for now and put the new rewriting
>> implementation into 0002. (Unfortunately, the diff is kind of useless
>> for visual inspection.) Let me know if this matches what you had in
>> mind, please. Also, is this the right place in fireRIRrules()?
>
> Yes, that's what I had in mind except that it has to be called from the
> second loop in fireRIRrules(), after any RLS policies have been added,
> because it's possible for a RLS policy expression to refer to virtual
> generated columns. It's OK to do it in the same loop that expands RLS
> policies, because such policies can only refer to columns of the same
> relation, so once the RLS policies have been expanded for a given
> relation, nothing else should get added to the query that can refer to
> columns of that relation, at that query level, so at that point it
> should be safe to expand virtual generated columns.

If I move the code like that, then the postgres_fdw test fails. So there is some additional interaction there that I need to study.
Re: Virtual generated columns
On 05.09.24 10:27, jian he wrote:
> On Wed, Sep 4, 2024 at 4:40 PM Peter Eisentraut wrote:
>> Here is an implementation of this. It's much nicer! It also appears to
>> fix all the additional test cases that have been presented. (I haven't
>> integrated them into the patch set yet.) I left the 0001 patch alone
>> for now and put the new rewriting implementation into 0002.
>> (Unfortunately, the diff is kind of useless for visual inspection.)
>> Let me know if this matches what you had in mind, please. Also, is this
>> the right place in fireRIRrules()?
>
> hi. some minor issues.
>
> in get_dependent_generated_columns we can change
>
> /* skip if not generated column */
> if (!TupleDescAttr(tupdesc, defval->adnum - 1)->attgenerated)
>     continue;
>
> to
>
> /* skip if not generated stored column */
> if (!(TupleDescAttr(tupdesc, defval->adnum - 1)->attgenerated == ATTRIBUTE_GENERATED_STORED))
>     continue;

I need to study more what to do with this function. I'm not completely sure whether this should apply only to stored generated columns.

> in ExecInitStoredGenerated, when "if ((tupdesc->constr &&
> tupdesc->constr->has_generated_stored))" is true, then after we finish
> the "for (int i = 0; i < natts; i++)" loop we can
> "Assert(ri_NumGeneratedNeeded > 0)", so we can ensure that once the
> has_generated_stored flag is true, we have at least one stored
> generated attribute.

This is technically correct, but this code isn't touched by this patch, so I don't think it belongs here.

> similarly, in expand_generated_columns_internal we can also add
> "Assert(list_length(tlist) > 0);" above
>
> node = ReplaceVarsFromTargetList(node, rt_index, 0, rte, tlist,
>                                  REPLACEVARS_CHANGE_VARNO, rt_index, NULL);

Ok, I'll add that.

> @@ -2290,7 +2291,9 @@ ExecBuildSlotValueDescription(Oid reloid,
>    if (table_perm || column_perm)
>    {
> -     if (slot->tts_isnull[i])
> +     if (att->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
> +         val = "virtual";
> +     else if (slot->tts_isnull[i])
>           val = "null";
>       else
>       {
>           Oid foutoid;
>           bool typisvarlena;
>           getTypeOutputInfo(att->atttypid, &foutoid, &typisvarlena);
>           val = OidOutputFunctionCall(foutoid, slot->tts_values[i]);
>       }
>
> we can add an Assert here, if I understand it correctly, like
>
> if (att->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL)
> {
>     Assert(slot->tts_isnull[i]);
>     val = "virtual";
> }

Also technically correct, but I don't see what benefit this would bring. The code guarded by that assert would not make use of the thing being asserted.
Re: Commit Timestamp and LSN Inversion issue
On Wed, Sep 4, 2024 at 12:23 PM shveta malik wrote:
>
> Hello hackers,
> (Cc people involved in the earlier discussion)
>
> I would like to discuss the $Subject.
>
> While discussing Logical Replication's Conflict Detection and
> Resolution (CDR) design in [1], it came to our notice that the
> commit LSN and timestamp may not correlate perfectly, i.e., commits may
> happen with LSN1 < LSN2 but with Ts1 > Ts2. This issue may arise
> because, during the commit process, the timestamp (xactStopTimestamp)
> is captured slightly earlier than when space is reserved in the WAL.
>
> ~~
>
> Reproducibility of the conflict-resolution problem due to the timestamp inversion
>
> It was suggested that timestamp inversion *may* impact the time-based
> resolutions such as last_update_wins (targeted to be implemented in
> [1]), as we may end up making wrong decisions if timestamps and LSNs
> are not correctly ordered. And thus we tried some tests but failed to
> find any practical scenario where it could be a problem.
>
> Basically, the proposed conflict resolution is a row-level resolution,
> and to cause the row value to be inconsistent, we need to modify the
> same row in concurrent transactions and commit the changes
> concurrently. But this doesn't seem possible because concurrent
> updates on the same row are disallowed (e.g., the later update will be
> blocked due to the row lock). See [2] for the details.
>
> We gave some thought to multi-table cases as well, e.g., update table
> A with a foreign key and update the table B that table A refers to. But
> the update on table A will block the update on table B as well, so we
> could not reproduce data divergence due to the LSN/timestamp mismatch
> issue there.
>
> ~~
>
> Idea proposed to fix the timestamp inversion issue
>
> There was a suggestion in [3] to acquire the timestamp while reserving
> the space (because that happens in LSN order). The clock would need to
> be monotonic (easy enough with CLOCK_MONOTONIC), but also cheap. The
> main reason why it is done outside the critical section is that
> gettimeofday() may be quite expensive. There's a concept of a hybrid
> clock, combining "time" and a logical counter, which might be useful
> independently of CDR.
>
> On further analysis of this idea, we found that CLOCK_MONOTONIC can be
> passed only to clock_gettime(), which has more precision than
> gettimeofday() and thus is theoretically equally or more expensive (we
> plan to test it and post the results). It does not look like a good
> idea to call either of these while holding a spinlock to reserve the
> WAL position. As for the suggested "hybrid clock" solution, it might
> not help here because the logical counter is only used to order the
> transactions with the same timestamp. The problem here is how to get
> the timestamp along with the WAL position reservation
> (ReserveXLogInsertLocation).

Here are the tests done to compare clock_gettime() and gettimeofday() performance.

Machine details: Intel(R) Xeon(R) CPU E7-4890 v2 @ 2.80GHz; CPU(s): 120; 800GB RAM

Three functions were tested across three different call volumes (1 million, 100 million, and 1 billion):
1) clock_gettime() with CLOCK_REALTIME
2) clock_gettime() with CLOCK_MONOTONIC
3) gettimeofday()

--> clock_gettime() with CLOCK_MONOTONIC sometimes shows slightly better performance, but not consistently. The difference in time taken by all three functions is minimal, with averages varying by no more than ~2.5%.
Overall, the performance between CLOCK_MONOTONIC and gettimeofday() is essentially the same. Below are the test results (each test was run twice for consistency):

1) For 1 million calls:
 1a) clock_gettime() with CLOCK_REALTIME:  Run 1: 0.01770 seconds, Run 2: 0.01772 seconds, Average: 0.01771 seconds.
 1b) clock_gettime() with CLOCK_MONOTONIC: Run 1: 0.01753 seconds, Run 2: 0.01748 seconds, Average: 0.01750 seconds.
 1c) gettimeofday():                       Run 1: 0.01742 seconds, Run 2: 0.01777 seconds, Average: 0.01760 seconds.

2) For 100 million calls:
 2a) clock_gettime() with CLOCK_REALTIME:  Run 1: 1.76649 seconds, Run 2: 1.76602 seconds, Average: 1.76625 seconds.
 2b) clock_gettime() with CLOCK_MONOTONIC: Run 1: 1.72768 seconds, Run 2: 1.72988 seconds, Average: 1.72878 seconds.
 2c) gettimeofday():                       Run 1: 1.72436 seconds, Run 2: 1.72174 seconds, Average: 1.72305 seconds.

3) For 1 billion calls:
 3a) clock_gettime() with CLOCK_REALTIME:  Run 1: 17.63859 seconds, Run 2: 17.65529 seconds, Average: 17.64694 seconds.
 3b) clock_gettime() with CLOCK_MONOTONIC: Run 1: 17.15109 seconds, Run 2: 17.27406 seconds, Average: 17.21257 seconds.
 3c) gettimeofday():                       Run 1: 17.21368 seconds, Run 2: 17.22983 seconds, Average: 17.22175 seconds.

Attached are the scripts used for the tests.

--
Thanks,
Nisha
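The scripts themselves are attachments and not reproduced here, so the following is a minimal standalone sketch, under assumptions (single-threaded, one tight loop per clock source, loop overhead not subtracted), of the kind of micro-benchmark described above:

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

#define NCALLS 100000000UL		/* 100 million, as in test 2 above */

static double
elapsed(const struct timespec *a, const struct timespec *b)
{
	return (b->tv_sec - a->tv_sec) + (b->tv_nsec - a->tv_nsec) / 1e9;
}

int
main(void)
{
	struct timespec start, end, ts;
	struct timeval tv;
	unsigned long i;

	/* 1) clock_gettime() with CLOCK_REALTIME */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < NCALLS; i++)
		clock_gettime(CLOCK_REALTIME, &ts);
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("clock_gettime(CLOCK_REALTIME):  %.5f s\n", elapsed(&start, &end));

	/* 2) clock_gettime() with CLOCK_MONOTONIC */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < NCALLS; i++)
		clock_gettime(CLOCK_MONOTONIC, &ts);
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("clock_gettime(CLOCK_MONOTONIC): %.5f s\n", elapsed(&start, &end));

	/* 3) gettimeofday() */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < NCALLS; i++)
		gettimeofday(&tv, NULL);
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("gettimeofday():                 %.5f s\n", elapsed(&start, &end));

	return 0;
}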
Re: Invalid Assert while validating REPLICA IDENTITY?
On Fri, Sep 6, 2024 at 4:48 PM vignesh C wrote:
>
> On Mon, 2 Sept 2024 at 18:22, Dilip Kumar wrote:
> >
> > On Mon, Sep 2, 2024 at 3:32 PM Amit Kapila wrote:
> > >
> > > On Mon, Sep 2, 2024 at 11:21 AM Dilip Kumar wrote:
> > > >
> > > > While working on some other code I noticed that in
> > > > FindReplTupleInLocalRel() there is an assert [1] that seems to be
> > > > passing IndexRelation to GetRelationIdentityOrPK() whereas it should
> > > > be passing the normal relation.
> > >
> > > Agreed. But this should lead to an assertion failure. Did you try testing it?
> >
> > No, I did not test this particular case. It affected me through some
> > other code I was adding, where I got an Index Relation as input to the
> > RelationGetIndexList() function, and my local changes were impacted by
> > that. I will write a test for this stand-alone case so that it hits
> > the assert. Thanks for looking into this.
>
> The FindReplTupleInLocalRel function can be triggered by both update
> and delete operations, but this only occurs if the relation has been
> marked as updatable by the logicalrep_rel_mark_updatable function. If
> the relation is marked as non-updatable, an error will be thrown by
> check_relation_updatable. Given this, if a relation is updatable, the
> IsIndexUsableForReplicaIdentityFull condition might always evaluate to
> true due to the previous checks in logicalrep_rel_mark_updatable.
> Therefore, it's possible that we might not encounter the Assert
> statement, as IsIndexUsableForReplicaIdentityFull may consistently be
> true.
> Thoughts?

With that, it seems that the first Assert condition is useless, isn't it?

--
Regards, Dilip Kumar EnterpriseDB: http://www.enterprisedb.com
Re: Allow logical failover slots to wait on synchronous replication
On Fri, Aug 30, 2024 at 12:56 AM John H wrote:
>
> Hi Shveta,
>
> Thanks for reviewing it so quickly.
>
> On Thu, Aug 29, 2024 at 2:35 AM shveta malik wrote:
> >
> > Thanks for the patch. Few comments and queries:
> >
> > 1)
> > + static XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE];
> >
> > We shall name it as 'lsns' as there are multiple.
>
> This follows the same naming convention in walsender_private.h
>
> > 2)
> >
> > + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
> > + {
> > + lsn[i] = InvalidXLogRecPtr;
> > + }
> >
> > Can we do it like below, similar to what you have done at another place:
> > memset(lsn, InvalidXLogRecPtr, sizeof(lsn));
>
> I don't think memset works in this case? Well, I think it *technically*
> works, but I'm not sure that's something worth optimizing.
> If I understand correctly, memset takes in a char for the value and
> not an XLogRecPtr (uint64).
>
> So something like: memset(lsn, 0, sizeof(lsn))
>
> InvalidXLogRecPtr is defined as 0, so I think it works, but there's an
> implicit dependency here for correctness.
>
> > 3)
> > + if (!initialized)
> > + {
> > + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
> > + {
> > + lsn[i] = InvalidXLogRecPtr;
> > + }
> > + }
> >
> > I do not see 'initialized' set to TRUE anywhere. Can you please
> > elaborate the intent here?
>
> You're right, I thought I fixed this. Will update.
>
> > 4)
> > + int mode = SyncRepWaitMode;
> > + int i;
> > +
> > + if (!initialized)
> > + {
> > + for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
> > + {
> > + lsn[i] = InvalidXLogRecPtr;
> > + }
> > + }
> > + if (mode == SYNC_REP_NO_WAIT)
> > + return true;
> >
> > I do not understand this code well. As Amit also pointed out, 'mode'
> > may change. When we initialize 'mode', let's say SyncRepWaitMode is
> > SYNC_REP_NO_WAIT, but by the time we check 'if (mode ==
> > SYNC_REP_NO_WAIT)', SyncRepWaitMode has changed to, say,
> > SYNC_REP_WAIT_FLUSH; if so, then we will wrongly return true from
> > here. Is that a possibility? ProcessConfigFile() is in the caller, and
> > thus we may end up using the wrong mode.
>
> Yes, it's possible for mode to change. In my comment to Amit in the other
> thread, I think we have to store mode, base our execution of this logic
> on it, and ignore SyncRepWaitMode changing due to ProcessConfigFile/SIGHUP
> for one iteration.
>
> We can store the value of mode later, so something like:
>
> > if (SyncRepWaitMode == SYNC_REP_NO_WAIT)
> > return true;
> > mode = SyncRepWaitMode
> > if (lsn[mode] >= wait_for_lsn)
> > return true;
>
> But it's the same issue, which is that when you check lsn[mode],
> SyncRepWaitMode could have changed to something else, so you always have
> to initialize the value and will always have this discrepancy.
>
> I'm skeptical end users are changing SyncRepWaitMode in their database
> clusters, as it has implications for their durability, and I would assume
> they run with the same durability guarantees.

I was trying to have a look at the patch again; it doesn't apply on HEAD and needs a rebase. Regarding 'mode = SyncRepWaitMode': FWIW, SyncRepWaitForLSN() also does it in a similar way. It gets the mode in a local variable initially and uses it later. See [1]. So isn't there a chance there too that 'SyncRepWaitMode' can change in between?

[1]:
mode = SyncRepWaitMode;
...
if (!WalSndCtl->sync_standbys_defined || lsn <= WalSndCtl->lsn[mode])
{
	LWLockRelease(SyncRepLock);
	return;
}

thanks
Shveta
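Since the thread keeps circling around it, here is a tiny self-contained sketch of the memset point; NUM_SYNC_REP_WAIT_MODE is hard-coded only to keep the example standalone, and the equivalence to the loop holds only because InvalidXLogRecPtr is defined as 0:

#include <string.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;
#define InvalidXLogRecPtr	((XLogRecPtr) 0)
#define NUM_SYNC_REP_WAIT_MODE	3	/* assumed here for illustration */

static XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE];

/* Explicit loop: correct no matter how the sentinel is defined. */
static void
init_with_loop(void)
{
	int			i;

	for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
		lsn[i] = InvalidXLogRecPtr;
}

/*
 * memset() fills bytes, not XLogRecPtr values, so passing
 * InvalidXLogRecPtr as the fill byte would be wrong in general.
 * Zero-filling happens to produce the right result only because
 * InvalidXLogRecPtr is 0 -- the implicit dependency noted above.
 */
static void
init_with_memset(void)
{
	memset(lsn, 0, sizeof(lsn));
}

int
main(void)
{
	init_with_loop();
	init_with_memset();
	return 0;
}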
Re: per backend I/O statistics
Hi,

On Fri, Sep 06, 2024 at 03:03:17PM +, Bertrand Drouvot wrote:
> - If we agree on the current proposal then I'll do some refactoring in
>   pg_stat_get_backend_io(), pg_stat_get_my_io() and pg_stat_get_io() to avoid
>   duplicated code (it's not done yet to ease the review).

Please find attached v3, a mandatory rebase due to fc415edf8c.

Regards,

-- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com

From 335949f1c408a99aed74c0231d5e436b5f319746 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot
Date: Thu, 22 Aug 2024 15:16:50 +
Subject: [PATCH v3 1/2] per backend I/O statistics

While pg_stat_io provides cluster-wide I/O statistics, this commit adds a new pg_my_stat_io view to display "my" backend I/O statistics. The KIND_IO stats are still "fixed amount" ones, as the maximum number of backends is fixed.

The statistics snapshot is made for the global stats (the aggregated ones) and for my backend stats. The snapshot is not built in each backend to copy all the other backends' stats, as 1/ there is no use case (there is no need to get other backends' I/O statistics while taking care of the stats_fetch_consistency) and 2/ that could be memory expensive depending on the number of max connections.

A subsequent commit will add a new pg_stat_get_backend_io() function to be able to retrieve the I/O statistics for a given backend pid (each execution re-fetches counters from shared memory because, as stated above, there are no snapshots being created in each backend to copy the other backends' stats).

Bump catalog version needs to be done.
---
 doc/src/sgml/config.sgml                  |   4 +-
 doc/src/sgml/monitoring.sgml              |  28 ++
 src/backend/catalog/system_views.sql      |  22 +
 src/backend/utils/activity/pgstat.c       |  11 ++-
 src/backend/utils/activity/pgstat_io.c    |  91 ++---
 src/backend/utils/activity/pgstat_shmem.c |   4 +-
 src/backend/utils/adt/pgstatfuncs.c       | 114 +-
 src/include/catalog/pg_proc.dat           |   9 ++
 src/include/pgstat.h                      |  14 ++-
 src/include/utils/pgstat_internal.h       |  14 ++-
 src/test/regress/expected/rules.out       |  19
 src/test/regress/expected/stats.out       |  44 -
 src/test/regress/sql/stats.sql            |  25 -
 13 files changed, 366 insertions(+), 33 deletions(-)
 8.5% doc/src/sgml/ 29.4% src/backend/utils/activity/ 23.9% src/backend/utils/adt/ 4.4% src/include/catalog/ 3.1% src/include/utils/ 3.1% src/include/ 14.4% src/test/regress/expected/ 10.0% src/test/regress/sql/

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 0aec11f443..2a59b97093 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8331,7 +8331,9 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
 displayed in pg_stat_database, -pg_stat_io, in the output of +pg_stat_io, + +pg_my_stat_io, in the output of when the BUFFERS option is used, in the output of when the VERBOSE option is used, by autovacuum

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 933de6fe07..b28ca4e0ca 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -488,6 +488,16 @@ postgres 27093 0.0 0.0 30096 2752 ?Ss 11:34 0:00 postgres: ser
 + + pg_my_stat_iopg_my_stat_io + + One row for each combination of context and target object containing + my backend I/O statistics. + See + pg_my_stat_io for details.
+ + + pg_stat_replication_slotspg_stat_replication_slots One row per replication slot, showing statistics about the @@ -2948,6 +2958,24 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + + pg_my_stat_io + + + pg_my_stat_io + + + + The pg_my_stat_io view will contain one row for each + combination of target I/O object and I/O context, showing + my backend I/O statistics. Combinations which do not make sense are + omitted. The fields are exactly the same as the ones in the pg_stat_io + view. + + diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 7fd5d256a1..b4939b7bc6 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1168,6 +1168,28 @@ SELECT b.stats_reset FROM pg_stat_get_io() b; +CREATE VIEW pg_my_stat_io AS +SELECT + b.backend_type, + b.object, + b.context, + b.reads, + b.read_time, + b.writes, + b.write_time, + b.writebacks, + b.writeback_time, + b.extends, + b.extend_time, + b.op_bytes, + b.hits, + b.evictions, + b.reuses, + b.fsyncs, + b.fsync_time, + b.stats_rese
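For orientation while reviewing, a hypothetical usage example of the view added by this patch (pg_my_stat_io exists only with the v3 patch applied; the column names are taken from its system_views.sql hunk):

-- Show this backend's non-zero I/O counters:
SELECT backend_type, object, context, reads, writes, extends, hits
FROM pg_my_stat_io
WHERE reads <> 0 OR writes <> 0 OR extends <> 0;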
Re: Partitioned tables and [un]loggedness
On Tue, Sep 03, 2024 at 10:33:02AM -0500, Nathan Bossart wrote:
> This works correctly for CREATE TABLE, but ALTER TABLE still succeeds.
> Interestingly, it doesn't seem to actually change relpersistence for
> partitioned tables. I think we might need to adjust
> ATPrepChangePersistence().

Adjusting ATPrepChangePersistence() looks incorrect to me, seeing that we have all the machinery in ATSimplePermissions() at our disposal, and that this routine decides that ATT_TABLE should map to both RELKIND_RELATION and RELKIND_PARTITIONED_TABLE. How about inventing a new ATT_PARTITIONED_TABLE and making a clean split between both relkinds? I'd guess that blocking both SET LOGGED and UNLOGGED for partitioned tables is the best move, even if it is possible to block only one or the other, of course.
--
Michael

signature.asc Description: PGP signature
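To make the idea concrete, here is a small standalone model of the proposed split, assuming the ATT_* bitmask convention used by ATSimplePermissions(); the flag value and the call-site policy are illustrative, not a committed design:

#include <stdio.h>

/* Modeled after tablecmds.c's target-type bitmask; values assumed. */
#define ATT_TABLE             0x0001
#define ATT_PARTITIONED_TABLE 0x0100	/* the proposed new flag */

#define RELKIND_RELATION          'r'
#define RELKIND_PARTITIONED_TABLE 'p'

/* With the split, a partitioned table no longer maps to ATT_TABLE. */
static int
target_for_relkind(char relkind)
{
	switch (relkind)
	{
		case RELKIND_RELATION:
			return ATT_TABLE;
		case RELKIND_PARTITIONED_TABLE:
			return ATT_PARTITIONED_TABLE;
		default:
			return 0;
	}
}

int
main(void)
{
	/* SET [UN]LOGGED would pass only ATT_TABLE... */
	int		set_logged_allowed = ATT_TABLE;
	/* ...while subcommands supporting both relkinds would pass both. */
	int		generic_allowed = ATT_TABLE | ATT_PARTITIONED_TABLE;
	int		target = target_for_relkind(RELKIND_PARTITIONED_TABLE);

	printf("SET LOGGED on partitioned table: %s\n",
		   (target & set_logged_allowed) ? "allowed" : "rejected");
	printf("generic ALTER on partitioned table: %s\n",
		   (target & generic_allowed) ? "allowed" : "rejected");
	return 0;
}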