On Fri, Jul 8, 2016 at 10:24 PM, Amit Kapila <amit.kapil...@gmail.com> wrote:
> On Thu, Jul 7, 2016 at 12:34 PM, Masahiko Sawada <sawada.m...@gmail.com> wrote:
>> Thank you for reviewing!
>>
>> On Thu, Jul 7, 2016 at 7:06 AM, Andres Freund <and...@anarazel.de> wrote:
>>> On 2016-07-05 23:37:59 +0900, Masahiko Sawada wrote:
>>>> diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
>>>> index 57da57a..fd66527 100644
>>>> --- a/src/backend/access/heap/heapam.c
>>>> +++ b/src/backend/access/heap/heapam.c
>>>> @@ -3923,6 +3923,17 @@ l2:
>>>>
>>>>      if (need_toast || newtupsize > pagefree)
>>>>      {
>>>> +        /*
>>>> +         * To prevent data corruption due to updating old tuple by
>>>> +         * other backends after released buffer
>>>
>>> That's not really the reason, is it? The prime problem is crash safety /
>>> replication. The row lock we're faking (by setting xmax to our xid)
>>> prevents concurrent updates until this backend dies.
>>
>> Fixed.
>>
>>>> , we need to emit that
>>>> +         * xmax of old tuple is set and clear visibility map bits if
>>>> +         * needed before releasing buffer. We can reuse xl_heap_lock
>>>> +         * for this purpose. It should be fine even if we crash midway
>>>> +         * from this section and the actual updating one later, since
>>>> +         * the xmax will appear to come from an aborted xid.
>>>> +         */
>>>> +        START_CRIT_SECTION();
>>>> +
>>>>          /* Clear obsolete visibility flags ... */
>>>>          oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
>>>>          oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
>>>> @@ -3936,6 +3947,46 @@ l2:
>>>>          /* temporarily make it look not-updated */
>>>>          oldtup.t_data->t_ctid = oldtup.t_self;
>>>>          already_marked = true;
>>>> +
>>>> +        /* Clear PD_ALL_VISIBLE flags */
>>>> +        if (PageIsAllVisible(BufferGetPage(buffer)))
>>>> +        {
>>>> +            all_visible_cleared = true;
>>>> +            PageClearAllVisible(BufferGetPage(buffer));
>>>> +            visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
>>>> +                                vmbuffer);
>>>> +        }
>>>> +
>>>> +        MarkBufferDirty(buffer);
>>>> +
>>>> +        if (RelationNeedsWAL(relation))
>>>> +        {
>>>> +            xl_heap_lock xlrec;
>>>> +            XLogRecPtr recptr;
>>>> +
>>>> +            /*
>>>> +             * For logical decoding we need combocids to properly decode the
>>>> +             * catalog.
>>>> +             */
>>>> +            if (RelationIsAccessibleInLogicalDecoding(relation))
>>>> +                log_heap_new_cid(relation, &oldtup);
>>>
>>> Hm, I don't see that being necessary here. Row locks aren't logically
>>> decoded, so there's no need to emit this here.
>>
>> Fixed.
>>
>>>
>>>> +        /* Clear PD_ALL_VISIBLE flags */
>>>> +        if (PageIsAllVisible(page))
>>>> +        {
>>>> +            Buffer      vmbuffer = InvalidBuffer;
>>>> +            BlockNumber block = BufferGetBlockNumber(*buffer);
>>>> +
>>>> +            all_visible_cleared = true;
>>>> +            PageClearAllVisible(page);
>>>> +            visibilitymap_pin(relation, block, &vmbuffer);
>>>> +            visibilitymap_clear(relation, block, vmbuffer);
>>>> +        }
>>>> +
>>>
>>> That clears all-visible unnecessarily; we only need to clear all-frozen.
>>>
>>
>> Fixed.
>>
>>>
>>>> @@ -8694,6 +8761,23 @@ heap_xlog_lock(XLogReaderState *record)
>>>>      }
>>>>      HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
>>>>      HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
>>>> +
>>>> +    /* The visibility map needs to be cleared */
>>>> +    if ((xlrec->infobits_set & XLHL_ALL_VISIBLE_CLEARED) != 0)
>>>> +    {
>>>> +        RelFileNode rnode;
>>>> +        Buffer      vmbuffer = InvalidBuffer;
>>>> +        BlockNumber blkno;
>>>> +        Relation    reln;
>>>> +
>>>> +        XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
>>>> +        reln = CreateFakeRelcacheEntry(rnode);
>>>> +
>>>> +        visibilitymap_pin(reln, blkno, &vmbuffer);
>>>> +        visibilitymap_clear(reln, blkno, vmbuffer);
>>>> +        PageClearAllVisible(page);
>>>> +    }
>>>> +
>>>
>>>
>>>>      PageSetLSN(page, lsn);
>>>>      MarkBufferDirty(buffer);
>>>> }
>>>> diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
>>>> index a822d0b..41b3c54 100644
>>>> --- a/src/include/access/heapam_xlog.h
>>>> +++ b/src/include/access/heapam_xlog.h
>>>> @@ -242,6 +242,7 @@ typedef struct xl_heap_cleanup_info
>>>>  #define XLHL_XMAX_EXCL_LOCK         0x04
>>>>  #define XLHL_XMAX_KEYSHR_LOCK       0x08
>>>>  #define XLHL_KEYS_UPDATED           0x10
>>>> +#define XLHL_ALL_VISIBLE_CLEARED    0x20
>>>
>>> Hm. We can't easily do that in the back-patched version, because a
>>> standby won't know to check for the flag. That's kinda OK, since we
>>> don't yet need to clear all-visible at that point of heap_update. But
>>> then we'd better not do so on the master either.
>>>
>>
>> Attached latest version patch.
>
> + /* Clear only the all-frozen bit on visibility map if needed */
> + if (PageIsAllVisible(BufferGetPage(buffer)) &&
> +     VM_ALL_FROZEN(relation, block, &vmbuffer))
> + {
> +     visibilitymap_clear_extended(relation, block, vmbuffer,
> +                                  VISIBILITYMAP_ALL_FROZEN);
> + }
> +
> + if (RelationNeedsWAL(relation))
> + {
> ..
> + XLogBeginInsert();
> + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
> +
> + xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
> + xlrec.locking_xid = xmax_old_tuple;
> + xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
> +                                       oldtup.t_data->t_infomask2);
> + XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
> + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
> ..
>
> One thing that looks awkward in this code is that it doesn't record
> whether the frozen bit was actually cleared during the original
> operation, and yet during replay it always clears the frozen bit
> irrespective of whether the original operation cleared it or not.
>
I changed it so that we look the all-frozen bit up first, and then clear
it if needed.

> + /* Clear only the all-frozen bit on visibility map if needed */
> + if (PageIsAllVisible(page) &&
> +     VM_ALL_FROZEN(relation, BufferGetBlockNumber(*buffer), &vmbuffer))
> + {
> +     BlockNumber block = BufferGetBlockNumber(*buffer);
> +
> +     visibilitymap_pin(relation, block, &vmbuffer);
>
> I think it is not right to call visibilitymap_pin after taking a
> buffer lock (visibilitymap_pin can perform I/O). Refer to heap_update
> for how to pin the visibility map.
>

Thank you for your advice! Fixed.

I've attached the two separate patches; please give me feedback.

Regards,

--
Masahiko Sawada
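For readers following along, the pin-before-lock pattern in heap_update
that Amit refers to looks roughly like this (a simplified sketch, assuming
vmbuffer starts out as InvalidBuffer; the real code retries in a loop):

    buffer = ReadBuffer(relation, block);

    /* Pin the map page before locking; visibilitymap_pin() can do I/O. */
    if (PageIsAllVisible(BufferGetPage(buffer)))
        visibilitymap_pin(relation, block, &vmbuffer);

    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

    /*
     * The all-visible bit could have been set between the check and
     * taking the lock, so recheck; if it was, drop the content lock,
     * pin the map page, and relock.
     */
    if (vmbuffer == InvalidBuffer &&
        PageIsAllVisible(BufferGetPage(buffer)))
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        visibilitymap_pin(relation, block, &vmbuffer);
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    }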
From 2020960bc7688afa8b0526c857a0e0af8b20d370 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.m...@gmail.com>
Date: Mon, 11 Jul 2016 12:39:32 -0700
Subject: [PATCH 1/2] Fix heap_update setting xmax without WAL logging in the
 already_marked = true case.

---
 src/backend/access/heap/heapam.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 57da57a..e7cb8ca 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3923,6 +3923,16 @@ l2:
 
 	if (need_toast || newtupsize > pagefree)
 	{
+		/*
+		 * For crash safety, we need to emit that xmax of the old tuple is
+		 * set, and clear only the all-frozen bit on the visibility map if
+		 * needed, before releasing the buffer. We can reuse xl_heap_lock
+		 * for this purpose. It should be fine even if we crash midway
+		 * between this section and the actual update later, since the
+		 * xmax will appear to come from an aborted xid.
+		 */
+		START_CRIT_SECTION();
+
 		/* Clear obsolete visibility flags ... */
 		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
 		oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
@@ -3936,6 +3946,28 @@ l2:
 		/* temporarily make it look not-updated */
 		oldtup.t_data->t_ctid = oldtup.t_self;
 		already_marked = true;
+
+		MarkBufferDirty(buffer);
+
+		if (RelationNeedsWAL(relation))
+		{
+			xl_heap_lock xlrec;
+			XLogRecPtr	recptr;
+
+			XLogBeginInsert();
+			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+
+			xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
+			xlrec.locking_xid = xmax_old_tuple;
+			xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
+												  oldtup.t_data->t_infomask2);
+			XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
+			recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
+			PageSetLSN(page, recptr);
+		}
+
+		END_CRIT_SECTION();
+
 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
 		/*
-- 
2.8.1
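(For readers less familiar with the conventions this hunk follows: the
sequence above matches the standard WAL-logging protocol described in
src/backend/access/transam/README. As a rough sketch:

    START_CRIT_SECTION();
    /* ... modify the page in the shared buffer ... */
    MarkBufferDirty(buffer);
    if (RelationNeedsWAL(relation))
    {
        XLogBeginInsert();
        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
        XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
        recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
        PageSetLSN(page, recptr);
    }
    END_CRIT_SECTION();

The critical section ensures that a failure between modifying the page
and writing the WAL record turns into a PANIC rather than leaving an
unlogged change on disk.)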
From 37e8901df9fe6aa6125059257dc6b8d659bb2929 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.m...@gmail.com>
Date: Mon, 11 Jul 2016 12:44:57 -0700
Subject: [PATCH 2/2] Clear all-frozen bit on visibility map when xmax is set.

---
 src/backend/access/heap/heapam.c        | 49 +++++++++++++++++++++++++++++++
 src/backend/access/heap/visibilitymap.c | 19 ++++++++++---
 src/include/access/visibilitymap.h      |  2 ++
 3 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e7cb8ca..23e9b75 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3947,6 +3947,14 @@ l2:
 		oldtup.t_data->t_ctid = oldtup.t_self;
 		already_marked = true;
 
+		/* Clear only the all-frozen bit on the visibility map if needed */
+		if (PageIsAllVisible(BufferGetPage(buffer)) &&
+			VM_ALL_FROZEN(relation, block, &vmbuffer))
+		{
+			visibilitymap_clear_extended(relation, block, vmbuffer,
+										 VISIBILITYMAP_ALL_FROZEN);
+		}
+
 		MarkBufferDirty(buffer);
 
 		if (RelationNeedsWAL(relation))
@@ -4538,6 +4546,8 @@ heap_lock_tuple(Relation relation, HeapTuple tuple,
 	ItemPointer tid = &(tuple->t_self);
 	ItemId		lp;
 	Page		page;
+	Buffer		vmbuffer = InvalidBuffer;
+	BlockNumber block;
 	TransactionId xid,
 				xmax;
 	uint16		old_infomask,
@@ -4547,6 +4557,15 @@ heap_lock_tuple(Relation relation, HeapTuple tuple,
 	bool		have_tuple_lock = false;
 
 	*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+	block = ItemPointerGetBlockNumber(tid);
+
+	/*
+	 * Before locking the buffer, pin the visibility map page if it appears
+	 * to be necessary.
+	 */
+	if (PageIsAllVisible(BufferGetPage(*buffer)))
+		visibilitymap_pin(relation, block, &vmbuffer);
+
 	LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
 
 	page = BufferGetPage(*buffer);
@@ -5066,6 +5085,15 @@ failed:
 	if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
 		tuple->t_data->t_ctid = *tid;
 
+	/* Clear only the all-frozen bit on the visibility map if needed */
+	if (PageIsAllVisible(page) &&
+		VM_ALL_FROZEN(relation, block, &vmbuffer))
+	{
+		visibilitymap_clear_extended(relation, block, vmbuffer,
+									 VISIBILITYMAP_ALL_FROZEN);
+	}
+
 	MarkBufferDirty(*buffer);
 
 	/*
@@ -5104,6 +5132,8 @@ failed:
 	END_CRIT_SECTION();
 
 	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+	if (BufferIsValid(vmbuffer))
+		ReleaseBuffer(vmbuffer);
 
 	/*
 	 * Don't update the visibility map here. Locking a tuple doesn't change
@@ -8726,6 +8756,25 @@ heap_xlog_lock(XLogReaderState *record)
 	}
 	HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
 	HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
+
+	/* Clear the all-frozen bit on the visibility map if needed */
+	if (PageIsAllVisible(BufferGetPage(buffer)))
+	{
+		RelFileNode rnode;
+		Buffer		vmbuffer = InvalidBuffer;
+		BlockNumber block;
+		Relation	reln;
+
+		XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
+		reln = CreateFakeRelcacheEntry(rnode);
+		visibilitymap_pin(reln, block, &vmbuffer);
+
+		if (VM_ALL_FROZEN(reln, block, &vmbuffer))
+			visibilitymap_clear_extended(reln, block, vmbuffer,
+										 VISIBILITYMAP_ALL_FROZEN);
+		ReleaseBuffer(vmbuffer);
+	}
+
 	PageSetLSN(page, lsn);
 	MarkBufferDirty(buffer);
 }
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index b472d31..c52c0b0 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -11,7 +11,7 @@
  *	  src/backend/access/heap/visibilitymap.c
  *
  * INTERFACE ROUTINES
- *		visibilitymap_clear  - clear a bit in visibility map
+ *		visibilitymap_clear  - clear all bits for one page in the visibility map
  *		visibilitymap_pin	 - pin a map page for setting a bit
  *		visibilitymap_pin_ok - check whether correct map page is already pinned
  *		visibilitymap_set	 - set a bit in a previously pinned page
@@ -157,23 +157,34 @@ static const uint8 number_of_ones_for_frozen[256] = {
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
 static void vm_extend(Relation rel, BlockNumber nvmblocks);
 
+/*
+ * A shorthand for visibilitymap_clear_extended, for clearing all bits for
+ * one page in the visibility map with VISIBILITYMAP_VALID_BITS.
+ */
+void
+visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
+{
+	visibilitymap_clear_extended(rel, heapBlk, buf, VISIBILITYMAP_VALID_BITS);
+}
 
 /*
- *	visibilitymap_clear - clear all bits for one page in visibility map
+ *	visibilitymap_clear_extended - clear bit(s) for one page in visibility map
  *
  * You must pass a buffer containing the correct map page to this function.
  * Call visibilitymap_pin first to pin the right one. This function doesn't do
  * any I/O.
  */
 void
-visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
+visibilitymap_clear_extended(Relation rel, BlockNumber heapBlk, Buffer buf, uint8 flags)
 {
 	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 	int			mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
-	uint8		mask = VISIBILITYMAP_VALID_BITS << mapOffset;
+	uint8		mask = flags << mapOffset;
 	char	   *map;
 
+	Assert(flags & VISIBILITYMAP_VALID_BITS);
+
 #ifdef TRACE_VISIBILITYMAP
 	elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
 #endif
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index fca99ca..f305b03 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -36,6 +36,8 @@
 extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 					Buffer vmbuf);
+extern void visibilitymap_clear_extended(Relation rel, BlockNumber heapBlk,
+					Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
 				  Buffer *vmbuf);
 extern bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf);
-- 
2.8.1
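As a usage note on the new interface: the two calls below are alternatives,
not a sequence, and the caller is hypothetical, purely for illustration.
In both cases the map page must already be pinned via visibilitymap_pin().

    /* Clear only the all-frozen bit, leaving all-visible set: */
    if (PageIsAllVisible(page) &&
        VM_ALL_FROZEN(relation, block, &vmbuffer))
        visibilitymap_clear_extended(relation, block, vmbuffer,
                                     VISIBILITYMAP_ALL_FROZEN);

    /* Existing callers are unchanged; this still clears both bits: */
    visibilitymap_clear(relation, block, vmbuffer);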
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers