Hello, I'm reviving a thread from 2016, because I wanted this thing again today.
Tom Lane <t...@sss.pgh.pa.us> wrote: > Thomas Munro <thomas.mu...@enterprisedb.com> writes: > > Here's a sketch patch that creates a function array_unique which takes > > the same arguments as qsort or qsort_arg and returns the new length. > > Hmm ... I'd be against using this in backend/regex/, because I still > have hopes of converting that to a standalone library someday (and > in any case it needs to stay compatible with Tcl's copy of the code). > But otherwise this seems like a reasonable proposal. > > As for the function name, maybe "qunique()" to go with "qsort()"? > I'm not thrilled with "array_unique" because that sounds like it > is meant for Postgres' array data types. OK, here it is renamed to qunique() and qunique_arg(). It's a bit odd because it has nothing to do with the quicksort algorithm, but make some sense because it's always used with qsort(). I suppose we could save a few more lines if there were a qsort_unique() function that does both, since the arguments are identical. I also moved it into a new header lib/qunique.h. Any better ideas for where it should live? I removed the hunk under regex. One thing I checked is that on my system it is inlined along with the comparator when that is visible, so no performance should be lost by throwing away the open coded versions. This makes me think that eg oid_cmp() should probably be defined in a header; clearly we're also carrying a few functions that should be consolidated into a new int32_cmp() function, somewhere, too. (It might also be interesting to use the pg_attribute_always_inline trick to instantiate some common qsort() specialisations for a bit of speed-up, but that's another topic.) Adding to CF. -- Thomas Munro https://enterprisedb.com
From b5fd67b9586cfdc3e66ccbdf019ec6d95f6206ec Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.munro@gmail.com> Date: Fri, 30 Aug 2019 13:41:04 +1200 Subject: [PATCH] Consolidate code that makes a sorted array unique. Introduce qunique() and qunique_arg() which can be used after qsort() and qsort_arg() respectively to remove duplicate values. Author: Thomas Munro Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CAEepm%3D2vmFTNpAmwbGGD2WaryM6T3hSDVKQPfUwjdD_5XY6vAA%40mail.gmail.com --- contrib/hstore/hstore_io.c | 5 +++ contrib/intarray/_int_tool.c | 19 +++----- contrib/pg_trgm/trgm_op.c | 25 ++--------- src/backend/access/nbtree/nbtutils.c | 19 ++------ src/backend/executor/nodeTidscan.c | 13 ++---- src/backend/utils/adt/acl.c | 15 ++----- src/backend/utils/adt/tsgistidx.c | 29 ++----------- src/backend/utils/adt/tsquery_op.c | 29 ++----------- src/backend/utils/adt/tsvector.c | 5 ++- src/backend/utils/adt/tsvector_op.c | 59 ++++--------------------- src/backend/utils/cache/syscache.c | 21 +++------ src/include/lib/qunique.h | 65 ++++++++++++++++++++++++++++ 12 files changed, 113 insertions(+), 191 deletions(-) create mode 100644 src/include/lib/qunique.h diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c index 745497c76f..86d7c7c2d8 100644 --- a/contrib/hstore/hstore_io.c +++ b/contrib/hstore/hstore_io.c @@ -323,6 +323,11 @@ hstoreUniquePairs(Pairs *a, int32 l, int32 *buflen) } qsort((void *) a, l, sizeof(Pairs), comparePairs); + + /* + * We can't use qunique here because we have some clean-up code to run on + * removed elements. + */ ptr = a + 1; res = a; while (ptr - a < l) diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c index fde8d15e2c..27d9ce7297 100644 --- a/contrib/intarray/_int_tool.c +++ b/contrib/intarray/_int_tool.c @@ -6,6 +6,7 @@ #include <limits.h> #include "catalog/pg_type.h" +#include "lib/qunique.h" #include "_int.h" @@ -310,23 +311,13 @@ internal_size(int *a, int len) ArrayType * _int_unique(ArrayType *r) { - int *tmp, - *dr, - *data; int num = ARRNELEMS(r); + bool duplicates_found; /* not used */ - if (num < 2) - return r; + num = qunique_arg(ARRPTR(r), num, sizeof(int), isort_cmp, + &duplicates_found); - data = tmp = dr = ARRPTR(r); - while (tmp - data < num) - { - if (*tmp != *dr) - *(++dr) = *tmp++; - else - tmp++; - } - return resize_intArrayType(r, dr + 1 - ARRPTR(r)); + return resize_intArrayType(r, num); } void diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index 0d4614e9c8..42fb625e7f 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -8,6 +8,7 @@ #include "trgm.h" #include "catalog/pg_type.h" +#include "lib/qunique.h" #include "tsearch/ts_locale.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -163,26 +164,6 @@ comp_trgm(const void *a, const void *b) return CMPTRGM(a, b); } -static int -unique_array(trgm *a, int len) -{ - trgm *curend, - *tmp; - - curend = tmp = a; - while (tmp - a < len) - if (CMPTRGM(tmp, curend)) - { - curend++; - CPTRGM(curend, tmp); - tmp++; - } - else - tmp++; - - return curend + 1 - a; -} - /* * Finds first word in string, returns pointer to the word, * endword points to the character after word @@ -395,7 +376,7 @@ generate_trgm(char *str, int slen) if (len > 1) { qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); - len = unique_array(GETARR(trg), len); + len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm); } SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); @@ -943,7 +924,7 @@ generate_wildcard_trgm(const char *str, int slen) if (len > 1) { qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm); - len = unique_array(GETARR(trg), len); + len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm); } SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 4c7b2d0966..662bab97fc 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -21,6 +21,7 @@ #include "access/reloptions.h" #include "access/relscan.h" #include "commands/progress.h" +#include "lib/qunique.h" #include "miscadmin.h" #include "utils/array.h" #include "utils/datum.h" @@ -435,8 +436,6 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, Oid elemtype; RegProcedure cmp_proc; BTSortArrayContext cxt; - int last_non_dup; - int i; if (nelems <= 1) return nelems; /* no work to do */ @@ -475,20 +474,8 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, _bt_compare_array_elements, (void *) &cxt); /* Now scan the sorted elements and remove duplicates */ - last_non_dup = 0; - for (i = 1; i < nelems; i++) - { - int32 compare; - - compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo, - cxt.collation, - elems[last_non_dup], - elems[i])); - if (compare != 0) - elems[++last_non_dup] = elems[i]; - } - - return last_non_dup + 1; + return qunique_arg(elems, nelems, sizeof(Datum), + _bt_compare_array_elements, &cxt); } /* diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c index 8cf22d5bf0..a59480f906 100644 --- a/src/backend/executor/nodeTidscan.c +++ b/src/backend/executor/nodeTidscan.c @@ -27,6 +27,7 @@ #include "catalog/pg_type.h" #include "executor/execdebug.h" #include "executor/nodeTidscan.h" +#include "lib/qunique.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "storage/bufmgr.h" @@ -260,21 +261,13 @@ TidListEval(TidScanState *tidstate) */ if (numTids > 1) { - int lastTid; - int i; - /* CurrentOfExpr could never appear OR'd with something else */ Assert(!tidstate->tss_isCurrentOf); qsort((void *) tidList, numTids, sizeof(ItemPointerData), itemptr_comparator); - lastTid = 0; - for (i = 1; i < numTids; i++) - { - if (!ItemPointerEquals(&tidList[lastTid], &tidList[i])) - tidList[++lastTid] = tidList[i]; - } - numTids = lastTid + 1; + numTids = qunique(tidList, numTids, sizeof(ItemPointerData), + itemptr_comparator); } tidstate->tss_TidList = tidList; diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index d7e6100ccb..79bc750d77 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -28,6 +28,7 @@ #include "commands/tablespace.h" #include "foreign/foreign.h" #include "funcapi.h" +#include "lib/qunique.h" #include "miscadmin.h" #include "utils/acl.h" #include "utils/array.h" @@ -1475,8 +1476,7 @@ aclmembers(const Acl *acl, Oid **roleids) Oid *list; const AclItem *acldat; int i, - j, - k; + j; if (acl == NULL || ACL_NUM(acl) == 0) { @@ -1508,21 +1508,14 @@ aclmembers(const Acl *acl, Oid **roleids) /* Sort the array */ qsort(list, j, sizeof(Oid), oid_cmp); - /* Remove duplicates from the array */ - k = 0; - for (i = 1; i < j; i++) - { - if (list[k] != list[i]) - list[++k] = list[i]; - } - /* * We could repalloc the array down to minimum size, but it's hardly worth * it since it's only transient memory. */ *roleids = list; - return k + 1; + /* Remove duplicates from the array */ + return qunique(list, j, sizeof(Oid), oid_cmp); } diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index 4f256260fd..44defa9de1 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -16,6 +16,7 @@ #include "access/gist.h" #include "access/tuptoaster.h" +#include "lib/qunique.h" #include "port/pg_bitutils.h" #include "tsearch/ts_utils.h" #include "utils/builtins.h" @@ -122,31 +123,6 @@ compareint(const void *va, const void *vb) return (a > b) ? 1 : -1; } -/* - * Removes duplicates from an array of int32. 'l' is - * size of the input array. Returns the new size of the array. - */ -static int -uniqueint(int32 *a, int32 l) -{ - int32 *ptr, - *res; - - if (l <= 1) - return l; - - ptr = res = a; - - qsort((void *) a, l, sizeof(int32), compareint); - - while (ptr - a < l) - if (*ptr != *res) - *(++res) = *ptr++; - else - ptr++; - return res + 1 - a; -} - static void makesign(BITVECP sign, SignTSVector *a) { @@ -193,7 +169,8 @@ gtsvector_compress(PG_FUNCTION_ARGS) ptr++; } - len = uniqueint(GETARR(res), val->size); + qsort(GETARR(res), val->size, sizeof(int), compareint); + len = qunique(GETARR(res), val->size, sizeof(int), compareint); if (len != val->size) { /* diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c index 1f63d9b6a9..e6816940ad 100644 --- a/src/backend/utils/adt/tsquery_op.c +++ b/src/backend/utils/adt/tsquery_op.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "lib/qunique.h" #include "tsearch/ts_utils.h" #include "utils/builtins.h" @@ -302,29 +303,6 @@ cmp_string(const void *a, const void *b) return strcmp(sa, sb); } -static int -remove_duplicates(char **strings, int n) -{ - if (n <= 1) - return n; - else - { - int i; - char *prev = strings[0]; - int new_n = 1; - - for (i = 1; i < n; i++) - { - if (strcmp(strings[i], prev) != 0) - { - strings[new_n++] = strings[i]; - prev = strings[i]; - } - } - return new_n; - } -} - Datum tsq_mcontains(PG_FUNCTION_ARGS) { @@ -342,9 +320,10 @@ tsq_mcontains(PG_FUNCTION_ARGS) /* Sort and remove duplicates from both arrays */ qsort(query_values, query_nvalues, sizeof(char *), cmp_string); - query_nvalues = remove_duplicates(query_values, query_nvalues); + query_nvalues = qunique(query_values, query_nvalues, sizeof(char *), + cmp_string); qsort(ex_values, ex_nvalues, sizeof(char *), cmp_string); - ex_nvalues = remove_duplicates(ex_values, ex_nvalues); + ex_nvalues = qunique(ex_values, ex_nvalues, sizeof(char *), cmp_string); if (ex_nvalues > query_nvalues) result = false; diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index ccfc4147fa..098eaed3e5 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -41,8 +41,9 @@ compareWordEntryPos(const void *a, const void *b) } /* - * Removes duplicate pos entries. If there's two entries with same pos - * but different weight, the higher weight is retained. + * Removes duplicate pos entries. If there's two entries with same pos but + * different weight, the higher weight is retained, so we can't use + * qunique here. * * Returns new length. */ diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 28d042273e..f483c4b228 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -21,6 +21,7 @@ #include "commands/trigger.h" #include "executor/spi.h" #include "funcapi.h" +#include "lib/qunique.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "parser/parse_coerce.h" @@ -475,16 +476,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, */ if (indices_count > 1) { - int kp; - qsort(indices_to_delete, indices_count, sizeof(int), compare_int); - kp = 0; - for (k = 1; k < indices_count; k++) - { - if (indices_to_delete[k] != indices_to_delete[kp]) - indices_to_delete[++kp] = indices_to_delete[k]; - } - indices_count = ++kp; + indices_count = qunique(indices_to_delete, indices_count, sizeof(int), + compare_int); } /* @@ -761,7 +755,6 @@ array_to_tsvector(PG_FUNCTION_ARGS) bool *nulls; int nitems, i, - j, tslen, datalen = 0; char *cur; @@ -781,13 +774,8 @@ array_to_tsvector(PG_FUNCTION_ARGS) if (nitems > 1) { qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes); - j = 0; - for (i = 1; i < nitems; i++) - { - if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0) - dlexemes[++j] = dlexemes[i]; - } - nitems = ++j; + nitems = qunique(dlexemes, nitems, sizeof(Datum), + compare_text_lexemes); } /* Calculate space needed for surviving lexemes. */ @@ -1270,39 +1258,6 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, return result; } -/* - * Removes duplicate pos entries. We can't use uniquePos() from - * tsvector.c because array might be longer than MAXENTRYPOS - * - * Returns new length. - */ -static int -uniqueLongPos(WordEntryPos *pos, int npos) -{ - WordEntryPos *pos_iter, - *result; - - if (npos <= 1) - return npos; - - qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos); - - result = pos; - pos_iter = pos + 1; - while (pos_iter < pos + npos) - { - if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result)) - { - result++; - *result = WEP_GETPOS(*pos_iter); - } - - pos_iter++; - } - - return result + 1 - pos; -} - /* * is there value 'val' in array or not ? */ @@ -1397,7 +1352,9 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) { /* Sort and make unique array of found positions */ data->pos = allpos; - data->npos = uniqueLongPos(allpos, npos); + qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos); + data->npos = qunique(data->pos, npos, sizeof(WordEntryPos), + compareWordEntryPos); data->allocated = true; } } diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 16297a52a1..36aee74ab0 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -74,6 +74,7 @@ #include "catalog/pg_ts_template.h" #include "catalog/pg_type.h" #include "catalog/pg_user_mapping.h" +#include "lib/qunique.h" #include "utils/rel.h" #include "utils/catcache.h" #include "utils/syscache.h" @@ -1010,8 +1011,6 @@ void InitCatalogCache(void) { int cacheId; - int i, - j; StaticAssertStmt(SysCacheSize == (int) lengthof(cacheinfo), "SysCacheSize does not match syscache.c's array"); @@ -1048,21 +1047,15 @@ InitCatalogCache(void) /* Sort and de-dup OID arrays, so we can use binary search. */ pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid), oid_compare); - for (i = 1, j = 0; i < SysCacheRelationOidSize; i++) - { - if (SysCacheRelationOid[i] != SysCacheRelationOid[j]) - SysCacheRelationOid[++j] = SysCacheRelationOid[i]; - } - SysCacheRelationOidSize = j + 1; + SysCacheRelationOidSize = + qunique(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid), + oid_compare); pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize, sizeof(Oid), oid_compare); - for (i = 1, j = 0; i < SysCacheSupportingRelOidSize; i++) - { - if (SysCacheSupportingRelOid[i] != SysCacheSupportingRelOid[j]) - SysCacheSupportingRelOid[++j] = SysCacheSupportingRelOid[i]; - } - SysCacheSupportingRelOidSize = j + 1; + SysCacheSupportingRelOidSize = + qunique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize, + sizeof(Oid), oid_compare); CacheInitialized = true; } diff --git a/src/include/lib/qunique.h b/src/include/lib/qunique.h new file mode 100644 index 0000000000..4d620f82ee --- /dev/null +++ b/src/include/lib/qunique.h @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * qunique.h + * inline array unique functions + * Portions Copyright (c) 2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/lib/qunique.h + *------------------------------------------------------------------------- + */ + +#ifndef QUNIQUE_H +#define QUNIQUE_H + +/* + * Remove duplicates from a pre-sorted array, according to a user-supplied + * comparator. Usually the array should have been sorted with qsort() using + * the same arguments. Return the new size. + */ +static inline size_t +qunique(void *array, size_t elements, size_t width, + int (*compare) (const void *, const void *)) +{ + char *bytes = (char *) array; + size_t i, + j; + + if (elements <= 1) + return elements; + + for (i = 1, j = 0; i < elements; ++i) + { + if (compare(bytes + i * width, bytes + j * width) != 0) + memcpy(bytes + ++j * width, bytes + i * width, width); + } + + return j + 1; +} + +/* + * Like qunique(), but takes a comparator with an extra user data argument + * which is passed through, for compatibility with qsort_arg(). + */ +static inline size_t +qunique_arg(void *array, size_t elements, size_t width, + int (*compare) (const void *, const void *, void *), + void *arg) +{ + char *bytes = (char *) array; + size_t i, + j; + + if (elements <= 1) + return elements; + + for (i = 1, j = 0; i < elements; ++i) + { + if (compare(bytes + i * width, bytes + j * width, arg) != 0) + memcpy(bytes + ++j * width, bytes + i * width, width); + } + + return j + 1; +} + +#endif /* QUNIQUE_H */ -- 2.22.0