On Wed, 2020-03-04 at 11:57 +0800, Adam Lee wrote:
> Master(e537aed61d): 13342.844 ms 13195.982 ms 13271.023 ms
> With my patch(a pointer): 13020.029 ms 13008.158 ms 13063.658 ms
> With your patch(flexible array): 12870.117 ms 12814.725 ms 13119.255
> ms

I tracked the problem down.

When we change from a flexible array to a pointer and a separately-
allocated chunk, then it causes some unrelated code in
LogicalTapeWrite() to be optimized differently in my environment.

Specifically, the memcpy() in LogicalTapeWrite() gets turned into an
inline implementation using the "rep movsq" instruction. For my
particular case, actually calling memcpy (rather than using the inlined
version) is a lot faster.

In my test case, LogicalTapeWrite() was being called with size of 4 or
10, so perhaps those small values are just handled much more
efficiently in the real memcpy.

To get it to use the real memcpy(), I had to '#undef _FORTIFY_SOURCE'
at the top of the file, and pass -fno-builtin-memcpy. Then the
regression went away.

I don't care for the version I posted where it repalloc()s the entire
struct. That seems needlessly odd and could cause confusion; and it
also changes the API so that the caller needs to update its pointers.

I'm inclined to use a version similar to Adam's, where it has a pointer
and a separate palloc() chunk (attached). That causes a regression in
my setup, but it's not a terrible regression, and it's not really the
"fault" of the change. Trying to twist code around to satisfy a
particular compiler/libc seems like a bad idea. It also might depend on
the exact query, and may even be faster for some.

Any objections?

Regards,
        Jeff Davis

diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 4f78b55fbaf..4274afea32e 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -191,8 +191,8 @@ struct LogicalTapeSet
 	Size		freeBlocksLen;	/* current allocated length of freeBlocks[] */
 
 	/* The array of logical tapes. */
-	int			nTapes;			/* # of logical tapes in set */
-	LogicalTape tapes[FLEXIBLE_ARRAY_MEMBER];	/* has nTapes nentries */
+	int				nTapes;	/* # of logical tapes in set */
+	LogicalTape	   *tapes;	/* has nTapes nentries */
 };
 
 static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
@@ -201,6 +201,7 @@ static long ltsGetFreeBlock(LogicalTapeSet *lts);
 static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
 static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
 								 SharedFileSet *fileset);
+static void ltsInitTape(LogicalTape *lt);
 static void ltsInitReadBuffer(LogicalTapeSet *lts, LogicalTape *lt);
 
 
@@ -536,6 +537,30 @@ ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
 	lts->nHoleBlocks = lts->nBlocksAllocated - nphysicalblocks;
 }
 
+/*
+ * Initialize per-tape struct.  Note we allocate the I/O buffer and the first
+ * block for a tape only when it is first actually written to.  This avoids
+ * wasting memory space when tuplesort.c overestimates the number of tapes
+ * needed.
+ */
+static void
+ltsInitTape(LogicalTape *lt)
+{
+	lt->writing           = true;
+	lt->frozen            = false;
+	lt->dirty             = false;
+	lt->firstBlockNumber  = -1L;
+	lt->curBlockNumber    = -1L;
+	lt->nextBlockNumber   = -1L;
+	lt->offsetBlockNumber = 0L;
+	lt->buffer            = NULL;
+	lt->buffer_size       = 0;
+	/* palloc() larger than MaxAllocSize would fail */
+	lt->max_size          = MaxAllocSize;
+	lt->pos               = 0;
+	lt->nbytes            = 0;
+}
+
 /*
  * Lazily allocate and initialize the read buffer. This avoids waste when many
  * tapes are open at once, but not all are active between rewinding and
@@ -579,15 +604,13 @@ LogicalTapeSetCreate(int ntapes, TapeShare *shared, SharedFileSet *fileset,
 					 int worker)
 {
 	LogicalTapeSet *lts;
-	LogicalTape *lt;
 	int			i;
 
 	/*
 	 * Create top-level struct including per-tape LogicalTape structs.
 	 */
 	Assert(ntapes > 0);
-	lts = (LogicalTapeSet *) palloc(offsetof(LogicalTapeSet, tapes) +
-									ntapes * sizeof(LogicalTape));
+	lts = (LogicalTapeSet *) palloc(sizeof(LogicalTapeSet));
 	lts->nBlocksAllocated = 0L;
 	lts->nBlocksWritten = 0L;
 	lts->nHoleBlocks = 0L;
@@ -596,30 +619,10 @@ LogicalTapeSetCreate(int ntapes, TapeShare *shared, SharedFileSet *fileset,
 	lts->freeBlocks = (long *) palloc(lts->freeBlocksLen * sizeof(long));
 	lts->nFreeBlocks = 0;
 	lts->nTapes = ntapes;
+	lts->tapes = (LogicalTape *) palloc(ntapes * sizeof(LogicalTape));
 
-	/*
-	 * Initialize per-tape structs.  Note we allocate the I/O buffer and the
-	 * first block for a tape only when it is first actually written to.  This
-	 * avoids wasting memory space when tuplesort.c overestimates the number
-	 * of tapes needed.
-	 */
 	for (i = 0; i < ntapes; i++)
-	{
-		lt = &lts->tapes[i];
-		lt->writing = true;
-		lt->frozen = false;
-		lt->dirty = false;
-		lt->firstBlockNumber = -1L;
-		lt->curBlockNumber = -1L;
-		lt->nextBlockNumber = -1L;
-		lt->offsetBlockNumber = 0L;
-		lt->buffer = NULL;
-		lt->buffer_size = 0;
-		/* palloc() larger than MaxAllocSize would fail */
-		lt->max_size = MaxAllocSize;
-		lt->pos = 0;
-		lt->nbytes = 0;
-	}
+		ltsInitTape(&lts->tapes[i]);
 
 	/*
 	 * Create temp BufFile storage as required.
@@ -1004,6 +1007,25 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
 	}
 }
 
+/*
+ * Add additional tapes to this tape set. Not intended to be used when any
+ * tapes are frozen.
+ */
+void
+LogicalTapeSetExtend(LogicalTapeSet *lts, int nAdditional)
+{
+	int     i;
+	int		nTapesOrig = lts->nTapes;
+
+	lts->nTapes += nAdditional;
+
+	lts->tapes = (LogicalTape *) repalloc(
+		lts->tapes, lts->nTapes * sizeof(LogicalTape));
+
+	for (i = nTapesOrig; i < lts->nTapes; i++)
+		ltsInitTape(&lts->tapes[i]);
+}
+
 /*
  * Backspace the tape a given number of bytes.  (We also support a more
  * general seek interface, see below.)
diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h
index 695d2c00ee4..39a99174afe 100644
--- a/src/include/utils/logtape.h
+++ b/src/include/utils/logtape.h
@@ -67,6 +67,7 @@ extern void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum,
 extern void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum);
 extern void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum,
 							  TapeShare *share);
+extern void LogicalTapeSetExtend(LogicalTapeSet *lts, int nAdditional);
 extern size_t LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum,
 								   size_t size);
 extern void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,

Reply via email to