On Thu, Jul 16, 2015 at 4:01 PM, Peter Geoghegan <p...@heroku.com> wrote: > Attached patch adds a portable Postgres wrapper on the GCC intrinsic. > It also adds a client within tuplesort.c -- a common function that > seems like a good generic choke point. I can get a speed-up of 6% - 9% > for all of these cases by prefetching a few SortTuples ahead, for the > "tuple proper". (In-memory sorts only.)
I added a silly bug during last minute clean-up. I attach a V2. -- Peter Geoghegan
From 5d3093e9c0d10f4140072da8d18d0dddf6b84b1e Mon Sep 17 00:00:00 2001 From: Peter Geoghegan <peter.geoghega...@gmail.com> Date: Sun, 12 Jul 2015 13:14:01 -0700 Subject: [PATCH] Prefetch from memtuples array in tuplesort Testing shows that prefetching the "tuple proper" of a slightly later SortTuple in the memtuples array during each of many sequential, in-logical-order SortTuple fetches speeds up various sorting intense operations considerably. For example, B-Tree index builds are accelerated as leaf pages are created from the memtuples array. (i.e. The operation following actually "performing" the sort, but before a tuplesort_end() call is made as a B-Tree spool is destroyed.) Similarly, ordered set aggregates (all cases except the datumsort case with a pass-by-value type), and regular heap tuplesorts benefit to about the same degree. The optimization is only used when sorts fit in memory, though. Also, prefetch a few places ahead within the analogous "fetching" point in tuplestore.c. This appears to offer similar benefits in certain cases. For example, queries involving large common table expressions significantly benefit. --- config/c-compiler.m4 | 17 +++++++++++++++++ configure | 30 ++++++++++++++++++++++++++++++ configure.in | 1 + src/backend/utils/sort/tuplesort.c | 18 ++++++++++++++++++ src/backend/utils/sort/tuplestore.c | 11 +++++++++++ src/include/c.h | 19 +++++++++++++++++++ src/include/pg_config.h.in | 3 +++ src/include/pg_config.h.win32 | 3 +++ 8 files changed, 102 insertions(+) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 050bfa5..5776201 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -287,6 +287,23 @@ fi])# PGAC_C_BUILTIN_UNREACHABLE +# PGAC_C_BUILTIN_PREFETCH +# ------------------------- +# Check if the C compiler understands __builtin_prefetch(), +# and define HAVE__BUILTIN_PREFETCH if so. +AC_DEFUN([PGAC_C_BUILTIN_PREFETCH], +[AC_CACHE_CHECK(for __builtin_prefetch, pgac_cv__builtin_prefetch, +[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], +[int i = 0;__builtin_prefetch(&i, 0, 3);])], +[pgac_cv__builtin_prefetch=yes], +[pgac_cv__builtin_prefetch=no])]) +if test x"$pgac_cv__builtin_prefetch" = xyes ; then +AC_DEFINE(HAVE__BUILTIN_PREFETCH, 1, + [Define to 1 if your compiler understands __builtin_prefetch.]) +fi])# PGAC_C_BUILTIN_PREFETCH + + + # PGAC_C_VA_ARGS # -------------- # Check if the C compiler understands C99-style variadic macros, diff --git a/configure b/configure index 77048e8..f14e65d 100755 --- a/configure +++ b/configure @@ -11248,6 +11248,36 @@ if test x"$pgac_cv__builtin_unreachable" = xyes ; then $as_echo "#define HAVE__BUILTIN_UNREACHABLE 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_prefetch" >&5 +$as_echo_n "checking for __builtin_prefetch... " >&6; } +if ${pgac_cv__builtin_prefetch+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +int i = 0;__builtin_prefetch(&i, 0, 3); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv__builtin_prefetch=yes +else + pgac_cv__builtin_prefetch=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_prefetch" >&5 +$as_echo "$pgac_cv__builtin_prefetch" >&6; } +if test x"$pgac_cv__builtin_prefetch" = xyes ; then + +$as_echo "#define HAVE__BUILTIN_PREFETCH 1" >>confdefs.h + +fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __VA_ARGS__" >&5 $as_echo_n "checking for __VA_ARGS__... " >&6; } if ${pgac_cv__va_args+:} false; then : diff --git a/configure.in b/configure.in index 5724a4d..2132d69 100644 --- a/configure.in +++ b/configure.in @@ -1318,6 +1318,7 @@ PGAC_C_TYPES_COMPATIBLE PGAC_C_BUILTIN_BSWAP32 PGAC_C_BUILTIN_CONSTANT_P PGAC_C_BUILTIN_UNREACHABLE +PGAC_C_BUILTIN_PREFETCH PGAC_C_VA_ARGS PGAC_STRUCT_TIMEZONE PGAC_UNION_SEMUN diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 435041a..70b53d1 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -1663,6 +1663,24 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward, if (state->current < state->memtupcount) { *stup = state->memtuples[state->current++]; + + /* + * Perform memory prefetch of "tuple proper" of the + * SortTuple that's three places ahead of current + * (which is returned to caller). Testing shows that + * this significantly boosts the performance for + * TSS_SORTEDINMEM "forward" callers by hiding memory + * latency behind their processing of returned tuples. + * + * Don't do this for pass-by-value datum sorts; even + * though hinting a NULL address does not affect + * correctness, it would have a noticeable overhead + * here. + */ + if (stup->tuple != NULL && + state->current + 2 < state->memtupcount) + pg_rfetch(state->memtuples[state->current + 2].tuple); + return true; } state->eof_reached = true; diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 627b281..49afb24 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -895,6 +895,17 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward, return NULL; if (readptr->current < state->memtupcount) { + /* + * Perform memory prefetch of tuple that's three places + * ahead of current (which is returned to caller). + * Testing shows that this significantly boosts the + * performance for TSS_INMEM "forward" callers by + * hiding memory latency behind their processing of + * returned tuples. + */ + if (readptr->current + 3 < state->memtupcount) + pg_rfetch(state->memtuples[readptr->current + 3]); + /* We have another tuple, so return it */ return state->memtuples[readptr->current++]; } diff --git a/src/include/c.h b/src/include/c.h index 92c5202..cab7faf 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -919,6 +919,25 @@ typedef NameData *Name; #define pg_unreachable() abort() #endif +/* + * Prefetch support -- Support memory prefetching hints on some platforms. + * + * pg_rfetch() is specialized for the case where an array is accessed + * sequentially, and we can prefetch a pointer within the next element (or an + * even later element) in order to hide memory latency. This case involves + * prefetching addresses with low temporal locality. Note that it's rather + * difficult to get any kind of speedup using pg_rfetch(); any use of the + * intrinsic should be carefully tested. Also note that it's okay to pass it + * an invalid or NULL address, although it's best avoided. + * + * Follow the example of GCC's __builtin_prefetch() itself, and still evaluate + * "addr" on platforms without support. + */ +#if defined(HAVE__BUILTIN_PREFETCH) +#define pg_rfetch(addr) __builtin_prefetch((addr), 0, 0) +#else +#define pg_rfetch(addr) ((void) (addr)) +#endif /* * Function inlining support -- Allow modules to define functions that may be diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 5688f75..6aad69b 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -669,6 +669,9 @@ /* Define to 1 if your compiler understands __builtin_constant_p. */ #undef HAVE__BUILTIN_CONSTANT_P +/* Define to 1 if your compiler understands __builtin_prefetch. */ +#undef HAVE__BUILTIN_PREFETCH + /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 22bbb91..3e729c5 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -523,6 +523,9 @@ /* Define to 1 if your compiler understands __builtin_constant_p. */ /* #undef HAVE__BUILTIN_CONSTANT_P */ +/* Define to 1 if your compiler understands __builtin_prefetch. */ +#undef HAVE__BUILTIN_PREFETCH + /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ /* #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P */ -- 1.9.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers