Recent discussions on the threads "Double sorting split patch" and "CUDA sorting" raised the possibility that there could be significant performance optimisation "low-hanging fruit" picked by having the executor treat integers and floats as a special case during sorting, avoiding going to the trouble of calling a comparator using the built-in SQL function machinery, and taking advantage of inlining of the comparator, which has been shown to have a considerable performance advantage (at least compared to a general-purpose C stdlib qsort(), which takes a function pointer as its comparator, much like tuplesort).
I've hacked together a sloppy POC implementation in a hurry (basically, some code is shifted around), which is attached - I felt that it would be useful to determine if the community feels that this is a worthwhile undertaking in advance of a business trip that I'm leaving on tomorrow lasting until Friday, during which I will be mostly unavailable. The patch breaks the Postgres sorting executor node (at least when it quicksorts) for any type other than int4. I apologise for how rough the patch is, but the code itself isn't important right now - the idea is. I anticipate that the value state->datumType or something similar will be set in a near future revision, so that tuplesort_performsort will know which type-specific optimisation it can use for the type, while falling back on the existing generic qsort_arg + qsort_arg_comparator, and sorting won't actually be broken. I've been doing some preliminary testing using the Dell Store 2 sample database. I increase work_mem to '50MB', to ensure that a quicksort will be performed for sorting (otherwise, I'm using the postgresql.conf that initdb gave me). The query is: explain analyze select * from orderlines order by prod_id; Once the cache has been warmed, explain analyze very consistently reports a runtime of 123ms for this query on master/HEAD, which varies +/- 1 ms, with a few outliers of maybe +/- 2ms. However, when I apply this patch, that goes down to 107ms +/- 1ms at -O0. I think that that's a pretty good start. Funnily enough, the difference/advantage vanishes at -O2 (I'm guessing that the higher optimisation level of GCC 4.5 hyper-corrects away the inlining, but I don't have time to check that right now). I imagine the version that I actually submit for patch review will have a macro-based infrastructure for inlining the sorting of various built-in types, initially integers and floats. It will most likely have some other optimisations - I haven't even used a profiler yet.
This performance patch differs from most in that it's difficult in principle to imagine a performance regression occurring. Thoughts? -- Peter Geoghegan http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training and Services
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 3505236..c5ac708 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -1224,6 +1224,285 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple) } } +inline int +compare_int4( Datum first, Datum second) +{ + int32 a_is = DatumGetInt32(first); + int32 b_is = DatumGetInt32(second); + + if (a_is > b_is) + return 1; + else if (a_is == b_is) + return 0; + else + return -1; +} + + +/* + * Inline-able copy of FunctionCall2Coll() to save some cycles in sorting. + */ +static inline Datum +myFunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2) +{ + FunctionCallInfoData fcinfo; + Datum result; + + InitFunctionCallInfoData(fcinfo, flinfo, 2, collation, NULL, NULL); + + fcinfo.arg[0] = arg1; + fcinfo.arg[1] = arg2; + fcinfo.argnull[0] = false; + fcinfo.argnull[1] = false; + + result = FunctionCallInvoke(&fcinfo); + + /* Check for null result, since caller is clearly not expecting one */ + if (fcinfo.isnull) + elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid); + + return result; +} +/* + * Apply a sort function (by now converted to fmgr lookup form) + * and return a 3-way comparison result. This takes care of handling + * reverse-sort and NULLs-ordering properly. We assume that DESC and + * NULLS_FIRST options are encoded in sk_flags the same way btree does it. 
+ */ +static inline int32 +inlineApplySortFunction(FmgrInfo *sortFunction, int sk_flags, Oid collation, + Datum datum1, bool isNull1, + Datum datum2, bool isNull2) +{ + int32 compare; + + if (isNull1) + { + if (isNull2) + compare = 0; /* NULL "=" NULL */ + else if (sk_flags & SK_BT_NULLS_FIRST) + compare = -1; /* NULL "<" NOT_NULL */ + else + compare = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull2) + { + if (sk_flags & SK_BT_NULLS_FIRST) + compare = 1; /* NOT_NULL ">" NULL */ + else + compare = -1; /* NOT_NULL "<" NULL */ + } + else + { + compare = compare_int4(datum1, datum2); + + if (sk_flags & SK_BT_DESC) + compare = -compare; + } + + return compare; +} + + + +inline int +comparetup_heap_int4(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) +{ + ScanKey scanKey = state->scanKeys; + HeapTupleData ltup; + HeapTupleData rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + + /* Allow interrupting long sorts */ + CHECK_FOR_INTERRUPTS(); + /* Compare the leading sort key */ + compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, + scanKey->sk_collation, + a->datum1, a->isnull1, + b->datum1, b->isnull1); + if (compare != 0) + return compare; + + + /* Compare additional sort keys */ + ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); + rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; + rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + tupDesc = state->tupDesc; + scanKey++; + + for (nkey = 1; nkey < state->nKeys; nkey++, scanKey++) + { + AttrNumber attno = scanKey->sk_attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = compare_int4(datum1, datum2); + return compare; + } + + return 0; +} + + +inline static char *med3(char *a, char *b, char *c, + void 
*arg); +inline static void swapfunc(char *, char *, size_t, int); + +/* + * Qsort routine based on J. L. Bentley and M. D. McIlroy, + * "Engineering a sort function", + * Software--Practice and Experience 23 (1993) 1249-1265. + * We have modified their original by adding a check for already-sorted input, + * which seems to be a win per discussions on pgsql-hackers around 2006-03-21. + */ +#define swapcode(TYPE, parmi, parmj, n) \ +do { \ + size_t i = (n) / sizeof (TYPE); \ + TYPE *pi = (TYPE *)(void *)(parmi); \ + TYPE *pj = (TYPE *)(void *)(parmj); \ + do { \ + TYPE t = *pi; \ + *pi++ = *pj; \ + *pj++ = t; \ + } while (--i > 0); \ +} while (0) + +#define SWAPINIT(a, es) swaptype = ((char *)(a) - (char *)0) % sizeof(long) || \ + (es) % sizeof(long) ? 2 : (es) == sizeof(long)? 0 : 1; + +inline static void +swapfunc(char *a, char *b, size_t n, int swaptype) +{ + if (swaptype <= 1) + swapcode(long, a, b, n); + else + swapcode(char, a, b, n); +} + +#define swap(a, b) \ + if (swaptype == 0) { \ + long t = *(long *)(void *)(a); \ + *(long *)(void *)(a) = *(long *)(void *)(b); \ + *(long *)(void *)(b) = t; \ + } else \ + swapfunc(a, b, es, swaptype) + +#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype) + +inline static char * +med3(char *a, char *b, char *c, void *arg) +{ + return comparetup_heap_int4(a, b, arg) < 0 ? + (comparetup_heap_int4(b, c, arg) < 0 ? b : (comparetup_heap_int4(a, c, arg) < 0 ? c : a)) + : (comparetup_heap_int4(b, c, arg) > 0 ? b : (comparetup_heap_int4(a, c, arg) < 0 ? 
a : c)); +} + + + +void +qsort_arg_int4(void *a, size_t n, size_t es, void *arg) +{ + char *pa, + *pb, + *pc, + *pd, + *pl, + *pm, + *pn; + int d, + r, + swaptype, + presorted; + +loop:SWAPINIT(a, es); + if (n < 7) + { + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + for (pl = pm; pl > (char *) a && comparetup_heap_int4(pl - es, pl, arg) > 0; + pl -= es) + swap(pl, pl - es); + return; + } + presorted = 1; + for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) + { + if (comparetup_heap_int4(pm - es, pm, arg) > 0) + { + presorted = 0; + break; + } + } + if (presorted) + return; + pm = (char *) a + (n / 2) * es; + if (n > 7) + { + pl = (char *) a; + pn = (char *) a + (n - 1) * es; + if (n > 40) + { + d = (n / 8) * es; + pl = med3(pl, pl + d, pl + 2 * d, arg); + pm = med3(pm - d, pm, pm + d, arg); + pn = med3(pn - 2 * d, pn - d, pn, arg); + } + pm = med3(pl, pm, pn, arg); + } + swap(a, pm); + pa = pb = (char *) a + es; + pc = pd = (char *) a + (n - 1) * es; + for (;;) + { + while (pb <= pc && (r = comparetup_heap_int4(pb, a, arg)) <= 0) + { + if (r == 0) + { + swap(pa, pb); + pa += es; + } + pb += es; + } + while (pb <= pc && (r = comparetup_heap_int4(pc, a, arg)) >= 0) + { + if (r == 0) + { + swap(pc, pd); + pd -= es; + } + pc -= es; + + } + if (pb > pc) + break; + swap(pb, pc); + pb += es; + pc -= es; + } + pn = (char *) a + n * es; + r = Min(pa - (char *) a, pb - pa); + vecswap(a, pb - r, r); + r = Min(pd - pc, pn - pd - es); + vecswap(pb, pn - r, r); + if ((r = pb - pa) > es) + qsort_arg_int4(a, r / es, es, arg); + if ((r = pd - pc) > es) + { + /* Iterate rather than recurse to save stack space */ + a = pn - r; + n = r / es; + goto loop; + } +} + /* * All tuples have been provided; finish the sort. */ @@ -1247,11 +1526,14 @@ tuplesort_performsort(Tuplesortstate *state) * amount of memory. Just qsort 'em and we're done. 
*/ if (state->memtupcount > 1) - qsort_arg((void *) state->memtuples, + { + /* For this POC patch, pretend we only ever sort int4 */ + qsort_arg_int4((void *) state->memtuples, state->memtupcount, sizeof(SortTuple), - (qsort_arg_comparator) state->comparetup, (void *) state); + + } state->current = 0; state->eof_reached = false; state->markpos_offset = 0; @@ -2628,72 +2910,6 @@ SelectSortFunction(Oid sortOperator, } /* - * Inline-able copy of FunctionCall2Coll() to save some cycles in sorting. - */ -static inline Datum -myFunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2) -{ - FunctionCallInfoData fcinfo; - Datum result; - - InitFunctionCallInfoData(fcinfo, flinfo, 2, collation, NULL, NULL); - - fcinfo.arg[0] = arg1; - fcinfo.arg[1] = arg2; - fcinfo.argnull[0] = false; - fcinfo.argnull[1] = false; - - result = FunctionCallInvoke(&fcinfo); - - /* Check for null result, since caller is clearly not expecting one */ - if (fcinfo.isnull) - elog(ERROR, "function %u returned NULL", fcinfo.flinfo->fn_oid); - - return result; -} - -/* - * Apply a sort function (by now converted to fmgr lookup form) - * and return a 3-way comparison result. This takes care of handling - * reverse-sort and NULLs-ordering properly. We assume that DESC and - * NULLS_FIRST options are encoded in sk_flags the same way btree does it. 
- */ -static inline int32 -inlineApplySortFunction(FmgrInfo *sortFunction, int sk_flags, Oid collation, - Datum datum1, bool isNull1, - Datum datum2, bool isNull2) -{ - int32 compare; - - if (isNull1) - { - if (isNull2) - compare = 0; /* NULL "=" NULL */ - else if (sk_flags & SK_BT_NULLS_FIRST) - compare = -1; /* NULL "<" NOT_NULL */ - else - compare = 1; /* NULL ">" NOT_NULL */ - } - else if (isNull2) - { - if (sk_flags & SK_BT_NULLS_FIRST) - compare = 1; /* NOT_NULL ">" NULL */ - else - compare = -1; /* NOT_NULL "<" NULL */ - } - else - { - compare = DatumGetInt32(myFunctionCall2Coll(sortFunction, collation, - datum1, datum2)); - - if (sk_flags & SK_BT_DESC) - compare = -compare; - } - - return compare; -} - -/* * Non-inline ApplySortFunction() --- this is needed only to conform to * C99's brain-dead notions about how to implement inline functions... */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers