Hi, I'm trying to see if it makes sense to do the double-buffering of page writes before going further ahead with CRC checking. I came up with the attached patch; it does the double-buffering inconditionally, because as it was said, it allows releasing the io_in_progress lock (and resetting BM_IO_IN_PROGRESS) early.
So far I have not managed to convince me that this is a correct change to make; the io_in_progress bits are pretty convoluted -- for example, I wonder how does "releasing" the buffer early (before actually sending the write to the kernel and marking it not dirty) interact with checkpoint and a possible full-page image. Basically the change is to take the unsetting of BM_DIRTY out of TerminateBufferIO and into its own routine; and in FlushBuffer, release io_in_progress just after copying the buffer contents elsewhere, and mark the buffer not dirty after actually doing the write. Thoughts? -- Alvaro Herrera http://www.CommandPrompt.com/ PostgreSQL Replication, Consulting, Custom Development, 24x7 support
Index: src/backend/storage/buffer/bufmgr.c =================================================================== RCS file: /home/alvherre/Code/cvs/pgsql/src/backend/storage/buffer/bufmgr.c,v retrieving revision 1.239 diff -c -p -r1.239 bufmgr.c *** src/backend/storage/buffer/bufmgr.c 20 Oct 2008 21:11:15 -0000 1.239 --- src/backend/storage/buffer/bufmgr.c 21 Oct 2008 15:30:38 -0000 *************** static void BufferSync(int flags); *** 84,91 **** static int SyncOneBuffer(int buf_id, bool skip_recently_used); static void WaitIO(volatile BufferDesc *buf); static bool StartBufferIO(volatile BufferDesc *buf, bool forInput); ! static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty, ! int set_flag_bits); static void buffer_write_error_callback(void *arg); static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, --- 84,91 ---- static int SyncOneBuffer(int buf_id, bool skip_recently_used); static void WaitIO(volatile BufferDesc *buf); static bool StartBufferIO(volatile BufferDesc *buf, bool forInput); ! static void TerminateBufferIO(volatile BufferDesc *buf, int set_flag_bits); ! static void MarkBufferNotDirty(volatile BufferDesc *buf); static void buffer_write_error_callback(void *arg); static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, *************** ReadBuffer_common(SMgrRelation smgr, boo *** 395,401 **** else { /* Set BM_VALID, terminate IO, and wake up any waiters */ ! TerminateBufferIO(bufHdr, false, BM_VALID); } if (VacuumCostActive) --- 395,401 ---- else { /* Set BM_VALID, terminate IO, and wake up any waiters */ ! TerminateBufferIO(bufHdr, BM_VALID); } if (VacuumCostActive) *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1792,1797 **** --- 1792,1798 ---- { XLogRecPtr recptr; ErrorContextCallback errcontext; + char dblbuf[BLCKSZ]; /* * Acquire the buffer's io_in_progress lock. If StartBufferIO returns *************** FlushBuffer(volatile BufferDesc *buf, SM *** 1834,1856 **** buf->flags &= ~BM_JUST_DIRTIED; UnlockBufHdr(buf); smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, ! (char *) BufHdrGetBlock(buf), false); BufferFlushCount++; TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode); - /* - * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and - * end the io_in_progress state. - */ - TerminateBufferIO(buf, true, 0); - /* Pop the error context stack */ error_context_stack = errcontext.previous; } --- 1835,1863 ---- buf->flags &= ~BM_JUST_DIRTIED; UnlockBufHdr(buf); + /* + * We make a copy of the buffer to write. This allows us to release the + * io_in_progress lock early, before actually doing the write. + */ + memcpy(dblbuf, BufHdrGetBlock(buf), BLCKSZ); + + /* End the io_in_progress state. */ + TerminateBufferIO(buf, 0); + smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, ! (char *) dblbuf, false); + /* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) */ + MarkBufferNotDirty(buf); + BufferFlushCount++; TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode); /* Pop the error context stack */ error_context_stack = errcontext.previous; } *************** StartBufferIO(volatile BufferDesc *buf, *** 2578,2595 **** * We hold the buffer's io_in_progress lock * The buffer is Pinned * - * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the - * buffer's BM_DIRTY flag. This is appropriate when terminating a - * successful write. The check on BM_JUST_DIRTIED is necessary to avoid - * marking the buffer clean if it was re-dirtied while we were writing. - * * set_flag_bits gets ORed into the buffer's flags. It must include * BM_IO_ERROR in a failure case. For successful completion it could * be 0, or BM_VALID if we just finished reading in the page. */ static void ! TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty, ! int set_flag_bits) { Assert(buf == InProgressBuf); --- 2585,2596 ---- * We hold the buffer's io_in_progress lock * The buffer is Pinned * * set_flag_bits gets ORed into the buffer's flags. It must include * BM_IO_ERROR in a failure case. For successful completion it could * be 0, or BM_VALID if we just finished reading in the page. */ static void ! TerminateBufferIO(volatile BufferDesc *buf, int set_flag_bits) { Assert(buf == InProgressBuf); *************** TerminateBufferIO(volatile BufferDesc *b *** 2597,2604 **** Assert(buf->flags & BM_IO_IN_PROGRESS); buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR); - if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED)) - buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); buf->flags |= set_flag_bits; UnlockBufHdr(buf); --- 2598,2603 ---- *************** TerminateBufferIO(volatile BufferDesc *b *** 2609,2614 **** --- 2608,2631 ---- } /* + * Clear the buffer's BM_DIRTY flag, unless BM_JUST_DIRTIED is set. + * + * This is appropriate when terminating a successful write. The check on + * BM_JUST_DIRTIED is necessary to avoid marking the buffer clean if it was + * re-dirtied while we were writing. + */ + static void + MarkBufferNotDirty(volatile BufferDesc *buf) + { + LockBufHdr(buf); + + if (!(buf->flags & BM_JUST_DIRTIED)) + buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); + + UnlockBufHdr(buf); + } + + /* * AbortBufferIO: Clean up any active buffer I/O after an error. * * All LWLocks we might have held have been released, *************** AbortBufferIO(void) *** 2662,2668 **** errdetail("Multiple failures --- write error might be permanent."))); } } ! TerminateBufferIO(buf, false, BM_IO_ERROR); } } --- 2679,2685 ---- errdetail("Multiple failures --- write error might be permanent."))); } } ! TerminateBufferIO(buf, BM_IO_ERROR); } }
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers