COPY IN loops in heap_multi_insert(), extending the table until it fills the disk, when trying to insert a wide row into a table with a low fill-factor. Internally fill-factor is implemented by reserving some space on a page. For large enough rows and small enough fill-factor heap_multi_insert() can't fit the row even on a new empty page, so it keeps allocating new pages but is never able to place the row. It should always put at least one row on an empty page.
In the excerpt below saveFreeSpace is the reserved space for the fill-factor. while (ndone < ntuples) { ... /* * Find buffer where at least the next tuple will fit. If the page is * all-visible, this will also pin the requisite visibility map page. */ buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len, ... /* Put as many tuples as fit on this page */ for (nthispage = 0; ndone + nthispage < ntuples; nthispage++) { HeapTuple heaptup = heaptuples[ndone + nthispage]; if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) break; RelationPutHeapTuple(relation, buffer, heaptup); } ...Do a bunch of dirtying and logging etc ... } This was introduced in 9.2 as part of the bulk insert speedup. One more point: in the case where we don't insert any rows, we still do all the dirtying and logging work even though we did not modify the page. I have tried to skip all this if no rows are added (nthispage == 0), but my access method foo is sadly out of date, so someone should take a skeptical look at that. A test case and patch against 9.2.2 are attached. The patch fixes the problem and passes make check. Most of the diff is just indentation changes. Whoever tries this will want to test it on a small partition by itself. -dg -- David Gould 510 282 0869 da...@sonic.net If simplicity worked, the world would be overrun with insects.
\set ECHO all \timing on drop table if exists big_sparse; CREATE TABLE big_sparse ( u_id integer NOT NULL, c_id integer DEFAULT 0 NOT NULL, a_id integer DEFAULT 0 NOT NULL, cr_id integer DEFAULT 0 NOT NULL, b integer, c_t_u text, l_date abstime DEFAULT now(), c_b double precision DEFAULT 0.0, d_b double precision, c_s text, c_s_u abstime, c_e_u abstime, co_b_r_ts abstime, e_a_s integer, c_c_i_l text, c_c_e_l text, ag_c_i_l text, ag_c_e_l text, c_c_t_l text, c_r_t_l text, c_m_t_l text, c_ci_t_l text, ag_c_t_l text, ag_r_t_l text, ag_m_t_l text, ag_ci_t_l text, c_l_t_l text, ag_l_t_l text, r_s_id smallint DEFAULT 1 NOT NULL, r_s text NOT NULL, i_u_l text, e_u_l text, f_l text, f smallint, e_s_l_e text, r_h_f_n text, cr_h smallint, cr_w smallint, n_b integer, cr_a_l text, e_id bigint NOT NULL, a_ex text, ca_b_a real[], su smallint DEFAULT 0 NOT NULL, h_b_f real[], p_id integer, ca_b_aj real[], ca_b_aj_i integer[] ) WITH (fillfactor=10); COPY big_sparse FROM stdin; 1688 700032834 700580073 704810483 25 foobar.grotxxxxxbarb/xxxxxdexyzzyen.xyzzyv=xxxxxxxxxxixxxxx&src=201002RAA9 2011-11-12 05:00:01+00 0 500 i 2010-10-31 04:00:00+00 \N 2011-11-12 05:00:01+00 113 \N \N \N \N US \N \N \N \N \N \N \N en \N 5 Rxyzzyn \N foobar./|foobar.xyzzyatxyzzy.xyzzyk/|foobar.grotxyzzyroupbarb/|grotxyzzxyzzybarb|foobar.grotxyxyzzybarb/|foobar.xyzzyxyzzyily.xyzzy/|grotxyzzyancebarb|grotxyzzydog|foobar.grotxyzzyxyzzysbarbdog/|foobar.grotxyzzyaobarb/|grotxyzzxyzzywsbarb|foobar.xyzzyun.xyzzy/|foobar.grotogejbarb/|foobar.grotxyzzxyzzybarb/|foobar.grotogcdbarbdog/|grotxyzzyetbarb|foobar.grotxyzzyipobarb/|foobar.grotxyzzyaobarb/|grotxyzzyxyzzyiabarb|grotxyzzybarbdog xyzzyign 2 1:1291|0|0 ABC DEF 36|0|0 OR 1290|0|0 ABC DEF 36|0|0 OR 13|0|0 ABC DEF 36|0|0 OR 84|2592000|0 ABC DEF 36|0|0 OR 83|2592000|0 ABC DEF 36|0|0 OR 82|2592000|0 ABC DEF 36|0|0 OR 12|0|0 ABC DEF 36|0|0 \N 0 0 25 \N 0 \N \N 0 {1,1,1,1,1,0.200000003,0.400000006,0.600000024,0.800000012,1,1,1,1,1,1,1! ,1,1,1,1,1,1,1,1} \N \N \N \.
*** postgresql-9.2.2/src/backend/access/heap/heapam.c 2012-12-03 12:16:10.000000000 -0800 --- postgresql-9.2.2dg/src/backend/access/heap/heapam.c 2012-12-12 01:55:58.174653706 -0800 *************** *** 2158,2163 **** --- 2158,2164 ---- Buffer buffer; Buffer vmbuffer = InvalidBuffer; bool all_visible_cleared = false; + bool page_is_empty; int nthispage; /* *************** *** 2173,2299 **** START_CRIT_SECTION(); /* Put as many tuples as fit on this page */ for (nthispage = 0; ndone + nthispage < ntuples; nthispage++) { HeapTuple heaptup = heaptuples[ndone + nthispage]; ! if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) break; ! RelationPutHeapTuple(relation, buffer, heaptup); } - if (PageIsAllVisible(page)) - { - all_visible_cleared = true; - PageClearAllVisible(page); - visibilitymap_clear(relation, - BufferGetBlockNumber(buffer), - vmbuffer); - } - /* ! * XXX Should we set PageSetPrunable on this page ? See heap_insert() */ ! ! MarkBufferDirty(buffer); ! ! /* XLOG stuff */ ! if (needwal) { ! XLogRecPtr recptr; ! xl_heap_multi_insert *xlrec; ! XLogRecData rdata[2]; ! uint8 info = XLOG_HEAP2_MULTI_INSERT; ! char *tupledata; ! int totaldatalen; ! char *scratchptr = scratch; ! bool init; /* ! * If the page was previously empty, we can reinit the page ! * instead of restoring the whole thing. */ - init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber && - PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1); ! /* allocate xl_heap_multi_insert struct from the scratch area */ ! xlrec = (xl_heap_multi_insert *) scratchptr; ! scratchptr += SizeOfHeapMultiInsert; ! /* ! * Allocate offsets array. Unless we're reinitializing the page, ! * in that case the tuples are stored in order starting at ! * FirstOffsetNumber and we don't need to store the offsets ! * explicitly. ! */ ! if (!init) ! scratchptr += nthispage * sizeof(OffsetNumber); ! /* the rest of the scratch space is used for tuple data */ ! 
tupledata = scratchptr; ! xlrec->all_visible_cleared = all_visible_cleared; ! xlrec->node = relation->rd_node; ! xlrec->blkno = BufferGetBlockNumber(buffer); ! xlrec->ntuples = nthispage; ! ! /* ! * Write out an xl_multi_insert_tuple and the tuple data itself ! * for each tuple. ! */ ! for (i = 0; i < nthispage; i++) ! { ! HeapTuple heaptup = heaptuples[ndone + i]; ! xl_multi_insert_tuple *tuphdr; ! int datalen; if (!init) ! xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); ! /* xl_multi_insert_tuple needs two-byte alignment. */ ! tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); ! scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; ! ! tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; ! tuphdr->t_infomask = heaptup->t_data->t_infomask; ! tuphdr->t_hoff = heaptup->t_data->t_hoff; ! ! /* write bitmap [+ padding] [+ oid] + data */ ! datalen = heaptup->t_len - offsetof(HeapTupleHeaderData, t_bits); ! memcpy(scratchptr, ! (char *) heaptup->t_data + offsetof(HeapTupleHeaderData, t_bits), ! datalen); ! tuphdr->datalen = datalen; ! scratchptr += datalen; ! } ! totaldatalen = scratchptr - tupledata; ! Assert((scratchptr - scratch) < BLCKSZ); ! rdata[0].data = (char *) xlrec; ! rdata[0].len = tupledata - scratch; ! rdata[0].buffer = InvalidBuffer; ! rdata[0].next = &rdata[1]; ! ! rdata[1].data = tupledata; ! rdata[1].len = totaldatalen; ! rdata[1].buffer = buffer; ! rdata[1].buffer_std = true; ! rdata[1].next = NULL; ! /* ! * If we're going to reinitialize the whole page using the WAL ! * record, hide buffer reference from XLogInsert. ! */ ! if (init) ! { ! rdata[1].buffer = InvalidBuffer; ! info |= XLOG_HEAP_INIT_PAGE; ! } ! recptr = XLogInsert(RM_HEAP2_ID, info, rdata); ! PageSetLSN(page, recptr); ! 
PageSetTLI(page, ThisTimeLineID); } END_CRIT_SECTION(); --- 2174,2309 ---- START_CRIT_SECTION(); /* Put as many tuples as fit on this page */ + page_is_empty = PageGetMaxOffsetNumber(page) == 0; for (nthispage = 0; ndone + nthispage < ntuples; nthispage++) { HeapTuple heaptup = heaptuples[ndone + nthispage]; ! if (PageGetHeapFreeSpace(page) < ! MAXALIGN(heaptup->t_len) + (page_is_empty ? 0 : saveFreeSpace)) break; ! page_is_empty = false; RelationPutHeapTuple(relation, buffer, heaptup); } /* ! * If nthispage > 0 then we modified the (possibly new) page, ! * otherwise there was not enough space for even one new tuple. */ ! if (nthispage > 0) { ! if (PageIsAllVisible(page)) ! { ! all_visible_cleared = true; ! PageClearAllVisible(page); ! visibilitymap_clear(relation, ! BufferGetBlockNumber(buffer), ! vmbuffer); ! } /* ! * XXX Should we set PageSetPrunable on this page ? See heap_insert() */ ! MarkBufferDirty(buffer); ! /* XLOG stuff */ ! if (needwal) ! { ! XLogRecPtr recptr; ! xl_heap_multi_insert *xlrec; ! XLogRecData rdata[2]; ! uint8 info = XLOG_HEAP2_MULTI_INSERT; ! char *tupledata; ! int totaldatalen; ! char *scratchptr = scratch; ! bool init; ! /* ! * If the page was previously empty, we can reinit the page ! * instead of restoring the whole thing. ! */ ! init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber && ! PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1); ! /* allocate xl_heap_multi_insert struct from the scratch area */ ! xlrec = (xl_heap_multi_insert *) scratchptr; ! scratchptr += SizeOfHeapMultiInsert; + /* + * Allocate offsets array. Unless we're reinitializing the page, + * in that case the tuples are stored in order starting at + * FirstOffsetNumber and we don't need to store the offsets + * explicitly. + */ if (!init) ! scratchptr += nthispage * sizeof(OffsetNumber); ! /* the rest of the scratch space is used for tuple data */ ! tupledata = scratchptr; ! 
xlrec->all_visible_cleared = all_visible_cleared; ! xlrec->node = relation->rd_node; ! xlrec->blkno = BufferGetBlockNumber(buffer); ! xlrec->ntuples = nthispage; ! /* ! * Write out an xl_multi_insert_tuple and the tuple data itself ! * for each tuple. ! */ ! for (i = 0; i < nthispage; i++) ! { ! HeapTuple heaptup = heaptuples[ndone + i]; ! xl_multi_insert_tuple *tuphdr; ! int datalen; ! ! if (!init) ! xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); ! /* xl_multi_insert_tuple needs two-byte alignment. */ ! tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); ! scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; ! ! tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; ! tuphdr->t_infomask = heaptup->t_data->t_infomask; ! tuphdr->t_hoff = heaptup->t_data->t_hoff; ! ! /* write bitmap [+ padding] [+ oid] + data */ ! datalen = heaptup->t_len - offsetof(HeapTupleHeaderData, t_bits); ! memcpy(scratchptr, ! (char *) heaptup->t_data + offsetof(HeapTupleHeaderData, t_bits), ! datalen); ! tuphdr->datalen = datalen; ! scratchptr += datalen; ! } ! totaldatalen = scratchptr - tupledata; ! Assert((scratchptr - scratch) < BLCKSZ); ! ! rdata[0].data = (char *) xlrec; ! rdata[0].len = tupledata - scratch; ! rdata[0].buffer = InvalidBuffer; ! rdata[0].next = &rdata[1]; ! ! rdata[1].data = tupledata; ! rdata[1].len = totaldatalen; ! rdata[1].buffer = buffer; ! rdata[1].buffer_std = true; ! rdata[1].next = NULL; ! ! /* ! * If we're going to reinitialize the whole page using the WAL ! * record, hide buffer reference from XLogInsert. ! */ ! if (init) ! { ! rdata[1].buffer = InvalidBuffer; ! info |= XLOG_HEAP_INIT_PAGE; ! } ! ! recptr = XLogInsert(RM_HEAP2_ID, info, rdata); ! PageSetLSN(page, recptr); ! PageSetTLI(page, ThisTimeLineID); ! } } END_CRIT_SECTION(); *************** *** 2304,2310 **** ndone += nthispage; } - /* * If tuples are cachable, mark them for invalidation from the caches in * case we abort. 
Note it is OK to do this after releasing the buffer, --- 2314,2319 ----
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers