On Tue, 2006-03-21 at 17:47 -0500, Tom Lane wrote:

> I'm fairly unconvinced about Simon's underlying premise --- that we
> can't make good use of work_mem in sorting after the run building phase
> --- anyway.  

We can make good use of memory, but there does come a point in final
merging where too much is of no further benefit. That point seems to be
at about 256 blocks per tape; patch enclosed for testing. (256 blocks
per tape roughly doubles performance over 32 blocks at that stage).

That is never the case during run building - more is always better.

> If we cut back our memory usage 
Simon inserts the words: "too far"
> then we'll be forcing a
> significantly more-random access pattern to the temp file(s) during
> merging, because we won't be able to pre-read as much at a time.

Yes, thats right.

If we have 512MB of memory that gives us enough for 2000 tapes, yet the
initial runs might only build a few runs. There's just no way that all
512MB of memory is needed to optimise the performance of reading in a
few tapes at time of final merge.

I'm suggesting we always keep 2MB per active tape, or the full
allocation, whichever is lower. In the above example that could release
over 500MB of memory, which more importantly can be reused by subsequent
sorts if/when they occur.


Enclose two patches:
1. mergebuffers.patch allows measurement of the effects of different
merge buffer sizes, current default=32

2. reassign2.patch which implements the two kinds of resource
deallocation/reassignment proposed.

Best Regards, Simon Riggs

Index: src/backend/utils/sort/tuplesort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v
retrieving revision 1.65
diff -c -r1.65 tuplesort.c
*** src/backend/utils/sort/tuplesort.c	10 Mar 2006 23:19:00 -0000	1.65
--- src/backend/utils/sort/tuplesort.c	21 Mar 2006 19:20:23 -0000
***************
*** 179,186 ****
   */
  #define MINORDER		6		/* minimum merge order */
  #define TAPE_BUFFER_OVERHEAD		(BLCKSZ * 3)
! #define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
! 
  /*
   * Private state of a Tuplesort operation.
   */
--- 179,187 ----
   */
  #define MINORDER		6		/* minimum merge order */
  #define TAPE_BUFFER_OVERHEAD		(BLCKSZ * 3)
! #define OPTIMAL_MERGE_BUFFER_SIZE	(BLCKSZ * 32)
! #define PREFERRED_MERGE_BUFFER_SIZE (BLCKSZ * 256)
! #define REUSE_SPACE_LIMIT           RELSEG_SIZE
  /*
   * Private state of a Tuplesort operation.
   */
***************
*** 255,260 ****
--- 256,270 ----
  	 */
  	int			currentRun;
  
+     /*
+      * These variables are used during final merge to reassign resources
+      * as they become available for each tape
+      */
+     int         lastPrereadTape;    /* last tape preread from */
+     int         numPrereads;        /* num times last tape has been selected */
+     int         reassignableSlots;  /* how many slots can be reassigned */
+     long        reassignableMem;    /* how much memory can be reassigned */
+ 
  	/*
  	 * Unless otherwise noted, all pointer variables below are pointers
  	 * to arrays of length maxTapes, holding per-tape data.
***************
*** 398,408 ****
--- 408,422 ----
  
  static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
  static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
+ static void grow_memtuples(Tuplesortstate *state);
+ static void shrink_memtuples(Tuplesortstate *state);
  static void inittapes(Tuplesortstate *state);
  static void selectnewtape(Tuplesortstate *state);
  static void mergeruns(Tuplesortstate *state);
  static void mergeonerun(Tuplesortstate *state);
  static void beginmerge(Tuplesortstate *state);
+ static void assignResourcesUniformly(Tuplesortstate *state, bool initialAssignment);
+ static void reassignresources(Tuplesortstate *state, int srcTape);
  static void mergepreread(Tuplesortstate *state);
  static void mergeprereadone(Tuplesortstate *state, int srcTape);
  static void dumptuples(Tuplesortstate *state, bool alltuples);
***************
*** 727,733 ****
   * moves around with tuple addition/removal, this might result in thrashing.
   * Small increases in the array size are likely to be pretty inefficient.
   */
! static bool
  grow_memtuples(Tuplesortstate *state)
  {
  	/*
--- 741,747 ----
   * moves around with tuple addition/removal, this might result in thrashing.
   * Small increases in the array size are likely to be pretty inefficient.
   */
! static void
  grow_memtuples(Tuplesortstate *state)
  {
  	/*
***************
*** 740,752 ****
  	 * this assumption should be good.  But let's check it.)
  	 */
  	if (state->availMem <= (long) (state->memtupsize * sizeof(SortTuple)))
! 		return false;
  	/*
  	 * On a 64-bit machine, allowedMem could be high enough to get us into
  	 * trouble with MaxAllocSize, too.
  	 */
  	if ((Size) (state->memtupsize * 2) >= MaxAllocSize / sizeof(SortTuple))
! 		return false;
  
  	FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
  	state->memtupsize *= 2;
--- 754,766 ----
  	 * this assumption should be good.  But let's check it.)
  	 */
  	if (state->availMem <= (long) (state->memtupsize * sizeof(SortTuple)))
! 		return;
  	/*
  	 * On a 64-bit machine, allowedMem could be high enough to get us into
  	 * trouble with MaxAllocSize, too.
  	 */
  	if ((Size) (state->memtupsize * 2) >= MaxAllocSize / sizeof(SortTuple))
! 		return;
  
  	FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
  	state->memtupsize *= 2;
***************
*** 756,762 ****
  	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
  	if (LACKMEM(state))
  		elog(ERROR, "unexpected out-of-memory situation during sort");
! 	return true;
  }
  
  /*
--- 770,848 ----
  	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
  	if (LACKMEM(state))
  		elog(ERROR, "unexpected out-of-memory situation during sort");
! 	return;
! }
! 
! static void
! shrink_memtuples(Tuplesortstate *state)
! {
!     int  oldMemTupSize = state->memtupsize;
!     int  newMemTupSize;
!     long oldMemTuplesAlloc = GetMemoryChunkSpace(state->memtuples);
! 
!     if (state->memtupcount > 0)
!     {
! 		elog(LOG, "unexpected attempt to shrink sort memory");
!         return;
!     }
! 
!     Assert(state->status == TSS_FINALMERGE || state->status == TSS_SORTEDONTAPE);
! 
!     /*
!      * Set new size, based upon earlier memory usage
!      *
!      * We don't care exactly how many tuples are stored, however we already
!      * know that the existing setting filled available memory. The merge
!      * order was set according to OPTIMAL_MERGE_BUFFER_SIZE, on the assumption
!      * that we might have to cope with a merge of maxTapes. However, we
!      * would prefer to merge using PREFERRED_MERGE_BUFFER_SIZE for each active
!      * tape. Any more memory than this is likely to be a waste, so we can 
!      * reduce memory in proporton to the ratio of activeTapes to maxTapes
!      * and thus allow it to be reused by subsequent sorts or hashes within 
!      * the same execution. For large memory settings this will be a major
!      * win, since we may have allocated enough memory for 1000s of tapes,
!      * yet might only need to merge back on a few tapes.
!      */
!    newMemTupSize = (int) ((state->memtupsize * state->activeTapes * 
!                     (PREFERRED_MERGE_BUFFER_SIZE / OPTIMAL_MERGE_BUFFER_SIZE))/  
!                      state->tapeRange);
! 
!     if (newMemTupSize >= state->memtupsize)
!         return;
! 
!     pfree(state->memtuples);
! 	FREEMEM(state, oldMemTuplesAlloc);
! 
!     /*
!      * We don't need a memtuples array at all if we have randomAccess
!      */
!     if (state->status == TSS_SORTEDONTAPE)
!     {
! #ifdef TRACE_SORT
! 	if (trace_sort)
! 		elog(LOG, "releasing sort resources prior to enabling randomAccess: %s",
! 		    pg_rusage_show(&state->ru_start));
! #endif
!         state->memtupsize = 0;
!         return;
!     }
! 
! 	state->memtupsize = newMemTupSize;
! 	state->memtuples = (SortTuple *)
! 		  palloc(state->memtupsize * sizeof(SortTuple));
! 	USEMEM(state, oldMemTuplesAlloc);
! 
! #ifdef TRACE_SORT
! 	if (trace_sort)
! 		elog(LOG, "shrinking resources to %d%% (from %d to %d slots): %s",
!             (100*newMemTupSize) / oldMemTupSize,
!             oldMemTupSize, state->memtupsize,
! 		    pg_rusage_show(&state->ru_start));
! #endif
! 
! 	if (LACKMEM(state))
! 		elog(ERROR, "unexpected out-of-memory situation during sort");
! 	return;
  }
  
  /*
***************
*** 834,840 ****
  			 */
  			if (state->memtupcount >= state->memtupsize - 1)
  			{
! 				(void) grow_memtuples(state);
  				Assert(state->memtupcount < state->memtupsize);
  			}
  			state->memtuples[state->memtupcount++] = *tuple;
--- 920,926 ----
  			 */
  			if (state->memtupcount >= state->memtupsize - 1)
  			{
! 				grow_memtuples(state);
  				Assert(state->memtupcount < state->memtupsize);
  			}
  			state->memtuples[state->memtupcount++] = *tuple;
***************
*** 1115,1120 ****
--- 1201,1208 ----
  				tuplesort_heap_siftup(state, false);
  				if ((tupIndex = state->mergenext[srcTape]) == 0)
  				{
+                     reassignresources(state, srcTape);
+  
  					/*
  					 * out of preloaded data on this tape, try to read more
  					 *
***************
*** 1125,1133 ****
--- 1213,1235 ----
  
  					/*
  					 * if still no data, we've reached end of run on this tape
+                      * so we can permanently reassign the resources used by
+                      * srcTape onto any remaining tapes for remainder of the
+                      * final merge
  					 */
  					if ((tupIndex = state->mergenext[srcTape]) == 0)
+                     {
+                         state->reassignableSlots += state->mergeavailslots[srcTape];
+                         state->reassignableMem += state->mergeavailmem[srcTape];
+                         state->numPrereads = 0;
+ #ifdef TRACE_SORT
+                     	if (trace_sort)
+                     		elog(LOG, "final merge: tape %d exhausted: %s",
+                                 srcTape,
+                     		    pg_rusage_show(&state->ru_start));
+ #endif
  						return true;
+                     }
  				}
  				/* pull next preread tuple from list, insert in heap */
  				newtup = &state->memtuples[tupIndex];
***************
*** 1150,1155 ****
--- 1252,1364 ----
  }
  
  /*
+  * Assign space uniformly across remaining active tapes
+  */
+ static void 
+ assignResourcesUniformly(Tuplesortstate *state, bool initialAssignment)
+ {
+     int slotsPerTape;
+     long spacePerTape;
+     int activeTapes = state->activeTapes;
+     int srcTape;
+ 
+ 	Assert(activeTapes > 0);
+ 
+     if (initialAssignment)
+     {
+     	slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes;
+     	Assert(slotsPerTape > 0);
+     	spacePerTape = state->availMem / activeTapes;
+ 
+     	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
+     	{
+     		if (state->mergeactive[srcTape])
+     		{
+     			state->mergeavailslots[srcTape] = slotsPerTape;
+     			state->mergeavailmem[srcTape] = spacePerTape;
+     		}
+     	}
+     }
+     else
+     {
+         slotsPerTape = state->reassignableSlots / activeTapes;
+         if (slotsPerTape == 0)
+             return;
+         spacePerTape = state->reassignableMem / activeTapes;
+ #ifdef TRACE_SORT
+     	if (trace_sort)
+     		elog(LOG, "reassigning resources; each tape gets: +%d slots, +%ld mem: %s",
+                 slotsPerTape,
+                 spacePerTape,
+     		    pg_rusage_show(&state->ru_start));
+ #endif
+ 
+     	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
+     	{
+     		if (state->mergeactive[srcTape])
+     		{
+     			state->mergeavailslots[srcTape] += slotsPerTape;
+     			state->mergeavailmem[srcTape] += spacePerTape;
+                 if (--activeTapes <= 0)
+                     break;
+     		}
+     	}
+     }
+ 
+     state->reassignableSlots = 0;
+     state->reassignableMem = 0;
+ }
+ 
+ /*
+  * If we have any reassignable resources from earlier tapes that have been 
+  * made empty by previous final run prereads, consider how to reassign them
+  */
+ static void
+ reassignresources(Tuplesortstate *state, int srcTape)
+ {
+     if (state->reassignableSlots <= state->activeTapes)
+         return;
+ 
+     /*
+      * We expect each tape to need to be preread about 2*number of tapes in 
+      * final merge. That gives us time to decide whether we should allocate the
+      * reassignable resources evenly, or whether we should give them all to a 
+      * single tape, which may be appropriate in some cases
+      */
+     if (state->numPrereads == 0 ||
+         state->lastPrereadTape == srcTape || 
+         state->activeTapes == 1)
+     {
+         state->numPrereads++;
+         state->lastPrereadTape = srcTape;
+ 
+         /*
+          * If confirm that the emerging pattern of prereads
+          * is consistently on a single tape, then hand
+          * over all the spare resources to that tape
+          */
+         if (state->numPrereads > 3 || state->activeTapes == 1)
+         {
+ #ifdef TRACE_SORT
+         	if (trace_sort)
+         		elog(LOG, "reassigning memory to tape %d, +%d slots, +%ld mem: %s",
+                     srcTape,
+                     state->reassignableSlots,
+                     state->reassignableMem,
+         		    pg_rusage_show(&state->ru_start));
+ #endif
+             state->mergeavailslots[srcTape] += state->reassignableSlots;
+             state->mergeavailmem[srcTape] += state->reassignableMem;
+             state->reassignableSlots = 0;
+             state->reassignableMem = 0;
+             state->numPrereads = 0;
+         }
+     }
+     else
+             assignResourcesUniformly(state, false);
+ }
+ 
+ /*
   * Fetch the next tuple in either forward or back direction.
   * Returns NULL if no more tuples.	If *should_free is set, the
   * caller must pfree the returned tuple when done with it.
***************
*** 1231,1237 ****
  	 * the MERGE_BUFFER_SIZE workspace.
  	 */
  	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
! 		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
  
  	/* Even in minimum memory, use at least a MINORDER merge */
  	mOrder = Max(mOrder, MINORDER);
--- 1440,1446 ----
  	 * the MERGE_BUFFER_SIZE workspace.
  	 */
  	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
! 		(OPTIMAL_MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
  
  	/* Even in minimum memory, use at least a MINORDER merge */
  	mOrder = Max(mOrder, MINORDER);
***************
*** 1436,1443 ****
  				/* Tell logtape.c we won't be writing anymore */
  				LogicalTapeSetForgetFreeSpace(state->tapeset);
  				/* Initialize for the final merge pass */
- 				beginmerge(state);
  				state->status = TSS_FINALMERGE;
  				return;
  			}
  		}
--- 1645,1652 ----
  				/* Tell logtape.c we won't be writing anymore */
  				LogicalTapeSetForgetFreeSpace(state->tapeset);
  				/* Initialize for the final merge pass */
  				state->status = TSS_FINALMERGE;
+ 				beginmerge(state);
  				return;
  			}
  		}
***************
*** 1506,1511 ****
--- 1715,1721 ----
  	state->result_tape = state->tp_tapenum[state->tapeRange];
  	LogicalTapeFreeze(state->tapeset, state->result_tape);
  	state->status = TSS_SORTEDONTAPE;
+     shrink_memtuples(state);
  }
  
  /*
***************
*** 1523,1528 ****
--- 1733,1739 ----
  	SortTuple  *tup;
  	long		priorAvail,
  				spaceFreed;
+     int         numtapes = state->activeTapes;
  
  	/*
  	 * Start the merge by loading one tuple from each active source tape into
***************
*** 1576,1582 ****
  
  #ifdef TRACE_SORT
  	if (trace_sort)
! 		elog(LOG, "finished %d-way merge step: %s", state->activeTapes,
  			 pg_rusage_show(&state->ru_start));
  #endif
  }
--- 1787,1793 ----
  
  #ifdef TRACE_SORT
  	if (trace_sort)
! 		elog(LOG, "finished %d-way merge step: %s", numtapes,
  			 pg_rusage_show(&state->ru_start));
  #endif
  }
***************
*** 1595,1602 ****
  	int			activeTapes;
  	int			tapenum;
  	int			srcTape;
- 	int			slotsPerTape;
- 	long		spacePerTape;
  
  	/* Heap should be empty here */
  	Assert(state->memtupcount == 0);
--- 1806,1811 ----
***************
*** 1628,1649 ****
  	state->mergefreelist = 0;	/* nothing in the freelist */
  	state->mergefirstfree = activeTapes;  /* 1st slot avail for preread */
  
! 	/*
! 	 * Initialize space allocation to let each active input tape have an equal
! 	 * share of preread space.
! 	 */
! 	Assert(activeTapes > 0);
! 	slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes;
! 	Assert(slotsPerTape > 0);
! 	spacePerTape = state->availMem / activeTapes;
! 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
! 	{
! 		if (state->mergeactive[srcTape])
! 		{
! 			state->mergeavailslots[srcTape] = slotsPerTape;
! 			state->mergeavailmem[srcTape] = spacePerTape;
! 		}
! 	}
  
  	/*
  	 * Preread as many tuples as possible (and at least one) from each active
--- 1837,1845 ----
  	state->mergefreelist = 0;	/* nothing in the freelist */
  	state->mergefirstfree = activeTapes;  /* 1st slot avail for preread */
  
!     shrink_memtuples(state);
! 
!     assignResourcesUniformly(state, true);
  
  	/*
  	 * Preread as many tuples as possible (and at least one) from each active
***************
*** 1730,1736 ****
--- 1927,1935 ----
  		/* read next tuple, if any */
  		if ((tuplen = getlen(state, srcTape, true)) == 0)
  		{
+             /* remove this tape from the active set */
  			state->mergeactive[srcTape] = false;
+             --state->activeTapes;
  			break;
  		}
  		READTUP(state, &stup, srcTape, tuplen);
Index: src/backend/utils/init/globals.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/init/globals.c,v
retrieving revision 1.97
diff -c -r1.97 globals.c
*** src/backend/utils/init/globals.c	5 Mar 2006 15:58:46 -0000	1.97
--- src/backend/utils/init/globals.c	20 Mar 2006 17:54:42 -0000
***************
*** 93,98 ****
--- 93,99 ----
  bool		enableFsync = true;
  bool		allowSystemTableMods = false;
  int			work_mem = 1024;
+ int         merge_buffers = 32;
  int			maintenance_work_mem = 16384;
  
  /* Primary determinants of sizes of shared-memory structures: */
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.314
diff -c -r1.314 guc.c
*** src/backend/utils/misc/guc.c	7 Mar 2006 02:54:23 -0000	1.314
--- src/backend/utils/misc/guc.c	20 Mar 2006 17:54:48 -0000
***************
*** 1175,1180 ****
--- 1175,1191 ----
  	},
  
  	{
+ 		{"merge_buffers", PGC_USERSET, RESOURCES_MEM,
+ 			gettext_noop("Sets the maximum memory to be used for query workspaces."),
+ 			gettext_noop("This much memory may be used by each internal "
+ 						 "sort operation and hash table before switching to "
+ 						 "temporary disk files.")
+ 		},
+ 		&merge_buffers,
+ 		32, 4, 256, NULL, NULL
+ 	},
+ 
+ 	{
  		{"maintenance_work_mem", PGC_USERSET, RESOURCES_MEM,
  			gettext_noop("Sets the maximum memory to be used for maintenance operations."),
  			gettext_noop("This includes operations such as VACUUM and CREATE INDEX.")
Index: src/backend/utils/sort/tuplesort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v
retrieving revision 1.65
diff -c -r1.65 tuplesort.c
*** src/backend/utils/sort/tuplesort.c	10 Mar 2006 23:19:00 -0000	1.65
--- src/backend/utils/sort/tuplesort.c	20 Mar 2006 17:54:50 -0000
***************
*** 1231,1237 ****
  	 * the MERGE_BUFFER_SIZE workspace.
  	 */
  	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
! 		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
  
  	/* Even in minimum memory, use at least a MINORDER merge */
  	mOrder = Max(mOrder, MINORDER);
--- 1231,1237 ----
  	 * the MERGE_BUFFER_SIZE workspace.
  	 */
  	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
! 		((BLCKSZ * merge_buffers) + TAPE_BUFFER_OVERHEAD);
  
  	/* Even in minimum memory, use at least a MINORDER merge */
  	mOrder = Max(mOrder, MINORDER);
Index: src/include/miscadmin.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/miscadmin.h,v
retrieving revision 1.186
diff -c -r1.186 miscadmin.h
*** src/include/miscadmin.h	5 Mar 2006 15:58:53 -0000	1.186
--- src/include/miscadmin.h	20 Mar 2006 17:54:51 -0000
***************
*** 201,206 ****
--- 201,207 ----
  extern bool enableFsync;
  extern bool allowSystemTableMods;
  extern DLLIMPORT int work_mem;
+ extern DLLIMPORT int merge_buffers;
  extern DLLIMPORT int maintenance_work_mem;
  
  extern int	VacuumCostPageHit;
---------------------------(end of broadcast)---------------------------
TIP 3: Have you checked our extensive FAQ?

               http://www.postgresql.org/docs/faq

Reply via email to