Hello,

I'm reviving a thread from 2016, because I wanted this thing again today.

Tom Lane <t...@sss.pgh.pa.us> wrote:
> Thomas Munro <thomas.mu...@enterprisedb.com> writes:
> > Here's a sketch patch that creates a function array_unique which takes
> > the same arguments as qsort or qsort_arg and returns the new length.
>
> Hmm ... I'd be against using this in backend/regex/, because I still
> have hopes of converting that to a standalone library someday (and
> in any case it needs to stay compatible with Tcl's copy of the code).
> But otherwise this seems like a reasonable proposal.
>
> As for the function name, maybe "qunique()" to go with "qsort()"?
> I'm not thrilled with "array_unique" because that sounds like it
> is meant for Postgres' array data types.

OK, here it is renamed to qunique() and qunique_arg().  It's a bit odd
because it has nothing to do with the quicksort algorithm, but make
some sense because it's always used with qsort().  I suppose we could
save a few more lines if there were a qsort_unique() function that
does both, since the arguments are identical.  I also moved it into a
new header lib/qunique.h.  Any better ideas for where it should live?
I removed the hunk under regex.

One thing I checked is that on my system it is inlined along with the
comparator when that is visible, so no performance should be lost by
throwing away the open coded versions.  This makes me think that eg
oid_cmp() should probably be defined in a header; clearly we're also
carrying a few functions that should be consolidated into a new
int32_cmp() function, somewhere, too.  (It might also be interesting
to use the pg_attribute_always_inline trick to instantiate some common
qsort() specialisations for a bit of speed-up, but that's another
topic.)

Adding to CF.

--
Thomas Munro
https://enterprisedb.com
From b5fd67b9586cfdc3e66ccbdf019ec6d95f6206ec Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Aug 2019 13:41:04 +1200
Subject: [PATCH] Consolidate code that makes a sorted array unique.

Introduce qunique() and qunique_arg() which can be used after qsort()
and qsort_arg() respectively to remove duplicate values.

Author: Thomas Munro
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/CAEepm%3D2vmFTNpAmwbGGD2WaryM6T3hSDVKQPfUwjdD_5XY6vAA%40mail.gmail.com
---
 contrib/hstore/hstore_io.c           |  5 +++
 contrib/intarray/_int_tool.c         | 19 +++-----
 contrib/pg_trgm/trgm_op.c            | 25 ++---------
 src/backend/access/nbtree/nbtutils.c | 19 ++------
 src/backend/executor/nodeTidscan.c   | 13 ++----
 src/backend/utils/adt/acl.c          | 15 ++-----
 src/backend/utils/adt/tsgistidx.c    | 29 ++-----------
 src/backend/utils/adt/tsquery_op.c   | 29 ++-----------
 src/backend/utils/adt/tsvector.c     |  5 ++-
 src/backend/utils/adt/tsvector_op.c  | 59 ++++---------------------
 src/backend/utils/cache/syscache.c   | 21 +++------
 src/include/lib/qunique.h            | 65 ++++++++++++++++++++++++++++
 12 files changed, 113 insertions(+), 191 deletions(-)
 create mode 100644 src/include/lib/qunique.h

diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index 745497c76f..86d7c7c2d8 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -323,6 +323,11 @@ hstoreUniquePairs(Pairs *a, int32 l, int32 *buflen)
 	}
 
 	qsort((void *) a, l, sizeof(Pairs), comparePairs);
+
+	/*
+	 * We can't use qunique here because we have some clean-up code to run on
+	 * removed elements.
+	 */
 	ptr = a + 1;
 	res = a;
 	while (ptr - a < l)
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index fde8d15e2c..27d9ce7297 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -6,6 +6,7 @@
 #include <limits.h>
 
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 
 #include "_int.h"
 
@@ -310,23 +311,13 @@ internal_size(int *a, int len)
 ArrayType *
 _int_unique(ArrayType *r)
 {
-	int		   *tmp,
-			   *dr,
-			   *data;
 	int			num = ARRNELEMS(r);
+	bool		duplicates_found;	/* not used */
 
-	if (num < 2)
-		return r;
+	num = qunique_arg(ARRPTR(r), num, sizeof(int), isort_cmp,
+					  &duplicates_found);
 
-	data = tmp = dr = ARRPTR(r);
-	while (tmp - data < num)
-	{
-		if (*tmp != *dr)
-			*(++dr) = *tmp++;
-		else
-			tmp++;
-	}
-	return resize_intArrayType(r, dr + 1 - ARRPTR(r));
+	return resize_intArrayType(r, num);
 }
 
 void
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 0d4614e9c8..42fb625e7f 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -8,6 +8,7 @@
 #include "trgm.h"
 
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 #include "tsearch/ts_locale.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -163,26 +164,6 @@ comp_trgm(const void *a, const void *b)
 	return CMPTRGM(a, b);
 }
 
-static int
-unique_array(trgm *a, int len)
-{
-	trgm	   *curend,
-			   *tmp;
-
-	curend = tmp = a;
-	while (tmp - a < len)
-		if (CMPTRGM(tmp, curend))
-		{
-			curend++;
-			CPTRGM(curend, tmp);
-			tmp++;
-		}
-		else
-			tmp++;
-
-	return curend + 1 - a;
-}
-
 /*
  * Finds first word in string, returns pointer to the word,
  * endword points to the character after word
@@ -395,7 +376,7 @@ generate_trgm(char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
@@ -943,7 +924,7 @@ generate_wildcard_trgm(const char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 4c7b2d0966..662bab97fc 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -21,6 +21,7 @@
 #include "access/reloptions.h"
 #include "access/relscan.h"
 #include "commands/progress.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/array.h"
 #include "utils/datum.h"
@@ -435,8 +436,6 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 	Oid			elemtype;
 	RegProcedure cmp_proc;
 	BTSortArrayContext cxt;
-	int			last_non_dup;
-	int			i;
 
 	if (nelems <= 1)
 		return nelems;			/* no work to do */
@@ -475,20 +474,8 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 			  _bt_compare_array_elements, (void *) &cxt);
 
 	/* Now scan the sorted elements and remove duplicates */
-	last_non_dup = 0;
-	for (i = 1; i < nelems; i++)
-	{
-		int32		compare;
-
-		compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo,
-												  cxt.collation,
-												  elems[last_non_dup],
-												  elems[i]));
-		if (compare != 0)
-			elems[++last_non_dup] = elems[i];
-	}
-
-	return last_non_dup + 1;
+	return qunique_arg(elems, nelems, sizeof(Datum),
+					   _bt_compare_array_elements, &cxt);
 }
 
 /*
diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c
index 8cf22d5bf0..a59480f906 100644
--- a/src/backend/executor/nodeTidscan.c
+++ b/src/backend/executor/nodeTidscan.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_type.h"
 #include "executor/execdebug.h"
 #include "executor/nodeTidscan.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "storage/bufmgr.h"
@@ -260,21 +261,13 @@ TidListEval(TidScanState *tidstate)
 	 */
 	if (numTids > 1)
 	{
-		int			lastTid;
-		int			i;
-
 		/* CurrentOfExpr could never appear OR'd with something else */
 		Assert(!tidstate->tss_isCurrentOf);
 
 		qsort((void *) tidList, numTids, sizeof(ItemPointerData),
 			  itemptr_comparator);
-		lastTid = 0;
-		for (i = 1; i < numTids; i++)
-		{
-			if (!ItemPointerEquals(&tidList[lastTid], &tidList[i]))
-				tidList[++lastTid] = tidList[i];
-		}
-		numTids = lastTid + 1;
+		numTids = qunique(tidList, numTids, sizeof(ItemPointerData),
+						  itemptr_comparator);
 	}
 
 	tidstate->tss_TidList = tidList;
diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c
index d7e6100ccb..79bc750d77 100644
--- a/src/backend/utils/adt/acl.c
+++ b/src/backend/utils/adt/acl.c
@@ -28,6 +28,7 @@
 #include "commands/tablespace.h"
 #include "foreign/foreign.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/acl.h"
 #include "utils/array.h"
@@ -1475,8 +1476,7 @@ aclmembers(const Acl *acl, Oid **roleids)
 	Oid		   *list;
 	const AclItem *acldat;
 	int			i,
-				j,
-				k;
+				j;
 
 	if (acl == NULL || ACL_NUM(acl) == 0)
 	{
@@ -1508,21 +1508,14 @@ aclmembers(const Acl *acl, Oid **roleids)
 	/* Sort the array */
 	qsort(list, j, sizeof(Oid), oid_cmp);
 
-	/* Remove duplicates from the array */
-	k = 0;
-	for (i = 1; i < j; i++)
-	{
-		if (list[k] != list[i])
-			list[++k] = list[i];
-	}
-
 	/*
 	 * We could repalloc the array down to minimum size, but it's hardly worth
 	 * it since it's only transient memory.
 	 */
 	*roleids = list;
 
-	return k + 1;
+	/* Remove duplicates from the array */
+	return qunique(list, j, sizeof(Oid), oid_cmp);
 }
 
 
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 4f256260fd..44defa9de1 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -16,6 +16,7 @@
 
 #include "access/gist.h"
 #include "access/tuptoaster.h"
+#include "lib/qunique.h"
 #include "port/pg_bitutils.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
@@ -122,31 +123,6 @@ compareint(const void *va, const void *vb)
 	return (a > b) ? 1 : -1;
 }
 
-/*
- * Removes duplicates from an array of int32. 'l' is
- * size of the input array. Returns the new size of the array.
- */
-static int
-uniqueint(int32 *a, int32 l)
-{
-	int32	   *ptr,
-			   *res;
-
-	if (l <= 1)
-		return l;
-
-	ptr = res = a;
-
-	qsort((void *) a, l, sizeof(int32), compareint);
-
-	while (ptr - a < l)
-		if (*ptr != *res)
-			*(++res) = *ptr++;
-		else
-			ptr++;
-	return res + 1 - a;
-}
-
 static void
 makesign(BITVECP sign, SignTSVector *a)
 {
@@ -193,7 +169,8 @@ gtsvector_compress(PG_FUNCTION_ARGS)
 			ptr++;
 		}
 
-		len = uniqueint(GETARR(res), val->size);
+		qsort(GETARR(res), val->size, sizeof(int), compareint);
+		len = qunique(GETARR(res), val->size, sizeof(int), compareint);
 		if (len != val->size)
 		{
 			/*
diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c
index 1f63d9b6a9..e6816940ad 100644
--- a/src/backend/utils/adt/tsquery_op.c
+++ b/src/backend/utils/adt/tsquery_op.c
@@ -14,6 +14,7 @@
 
 #include "postgres.h"
 
+#include "lib/qunique.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
 
@@ -302,29 +303,6 @@ cmp_string(const void *a, const void *b)
 	return strcmp(sa, sb);
 }
 
-static int
-remove_duplicates(char **strings, int n)
-{
-	if (n <= 1)
-		return n;
-	else
-	{
-		int			i;
-		char	   *prev = strings[0];
-		int			new_n = 1;
-
-		for (i = 1; i < n; i++)
-		{
-			if (strcmp(strings[i], prev) != 0)
-			{
-				strings[new_n++] = strings[i];
-				prev = strings[i];
-			}
-		}
-		return new_n;
-	}
-}
-
 Datum
 tsq_mcontains(PG_FUNCTION_ARGS)
 {
@@ -342,9 +320,10 @@ tsq_mcontains(PG_FUNCTION_ARGS)
 
 	/* Sort and remove duplicates from both arrays */
 	qsort(query_values, query_nvalues, sizeof(char *), cmp_string);
-	query_nvalues = remove_duplicates(query_values, query_nvalues);
+	query_nvalues = qunique(query_values, query_nvalues, sizeof(char *),
+							cmp_string);
 	qsort(ex_values, ex_nvalues, sizeof(char *), cmp_string);
-	ex_nvalues = remove_duplicates(ex_values, ex_nvalues);
+	ex_nvalues = qunique(ex_values, ex_nvalues, sizeof(char *), cmp_string);
 
 	if (ex_nvalues > query_nvalues)
 		result = false;
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index ccfc4147fa..098eaed3e5 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -41,8 +41,9 @@ compareWordEntryPos(const void *a, const void *b)
 }
 
 /*
- * Removes duplicate pos entries. If there's two entries with same pos
- * but different weight, the higher weight is retained.
+ * Removes duplicate pos entries. If there's two entries with same pos but
+ * different weight, the higher weight is retained, so we can't use
+ * qunique here.
  *
  * Returns new length.
  */
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 28d042273e..f483c4b228 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -21,6 +21,7 @@
 #include "commands/trigger.h"
 #include "executor/spi.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "parser/parse_coerce.h"
@@ -475,16 +476,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	 */
 	if (indices_count > 1)
 	{
-		int			kp;
-
 		qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
-		kp = 0;
-		for (k = 1; k < indices_count; k++)
-		{
-			if (indices_to_delete[k] != indices_to_delete[kp])
-				indices_to_delete[++kp] = indices_to_delete[k];
-		}
-		indices_count = ++kp;
+		indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
+								compare_int);
 	}
 
 	/*
@@ -761,7 +755,6 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	bool	   *nulls;
 	int			nitems,
 				i,
-				j,
 				tslen,
 				datalen = 0;
 	char	   *cur;
@@ -781,13 +774,8 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	if (nitems > 1)
 	{
 		qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
-		j = 0;
-		for (i = 1; i < nitems; i++)
-		{
-			if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
-				dlexemes[++j] = dlexemes[i];
-		}
-		nitems = ++j;
+		nitems = qunique(dlexemes, nitems, sizeof(Datum),
+						 compare_text_lexemes);
 	}
 
 	/* Calculate space needed for surviving lexemes. */
@@ -1270,39 +1258,6 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 	return result;
 }
 
-/*
- * Removes duplicate pos entries. We can't use uniquePos() from
- * tsvector.c because array might be longer than MAXENTRYPOS
- *
- * Returns new length.
- */
-static int
-uniqueLongPos(WordEntryPos *pos, int npos)
-{
-	WordEntryPos *pos_iter,
-			   *result;
-
-	if (npos <= 1)
-		return npos;
-
-	qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
-
-	result = pos;
-	pos_iter = pos + 1;
-	while (pos_iter < pos + npos)
-	{
-		if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result))
-		{
-			result++;
-			*result = WEP_GETPOS(*pos_iter);
-		}
-
-		pos_iter++;
-	}
-
-	return result + 1 - pos;
-}
-
 /*
  * is there value 'val' in array or not ?
  */
@@ -1397,7 +1352,9 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 		{
 			/* Sort and make unique array of found positions */
 			data->pos = allpos;
-			data->npos = uniqueLongPos(allpos, npos);
+			qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
+			data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
+								 compareWordEntryPos);
 			data->allocated = true;
 		}
 	}
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 16297a52a1..36aee74ab0 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -74,6 +74,7 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
+#include "lib/qunique.h"
 #include "utils/rel.h"
 #include "utils/catcache.h"
 #include "utils/syscache.h"
@@ -1010,8 +1011,6 @@ void
 InitCatalogCache(void)
 {
 	int			cacheId;
-	int			i,
-				j;
 
 	StaticAssertStmt(SysCacheSize == (int) lengthof(cacheinfo),
 					 "SysCacheSize does not match syscache.c's array");
@@ -1048,21 +1047,15 @@ InitCatalogCache(void)
 	/* Sort and de-dup OID arrays, so we can use binary search. */
 	pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheRelationOidSize; i++)
-	{
-		if (SysCacheRelationOid[i] != SysCacheRelationOid[j])
-			SysCacheRelationOid[++j] = SysCacheRelationOid[i];
-	}
-	SysCacheRelationOidSize = j + 1;
+	SysCacheRelationOidSize =
+		qunique(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid),
+				oid_compare);
 
 	pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheSupportingRelOidSize; i++)
-	{
-		if (SysCacheSupportingRelOid[i] != SysCacheSupportingRelOid[j])
-			SysCacheSupportingRelOid[++j] = SysCacheSupportingRelOid[i];
-	}
-	SysCacheSupportingRelOidSize = j + 1;
+	SysCacheSupportingRelOidSize =
+		qunique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
+				sizeof(Oid), oid_compare);
 
 	CacheInitialized = true;
 }
diff --git a/src/include/lib/qunique.h b/src/include/lib/qunique.h
new file mode 100644
index 0000000000..4d620f82ee
--- /dev/null
+++ b/src/include/lib/qunique.h
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * qunique.h
+ *		inline array unique functions
+ * Portions Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/include/lib/qunique.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef QUNIQUE_H
+#define QUNIQUE_H
+
+/*
+ * Remove duplicates from a pre-sorted array, according to a user-supplied
+ * comparator.  Usually the array should have been sorted with qsort() using
+ * the same arguments.  Return the new size.
+ */
+static inline size_t
+qunique(void *array, size_t elements, size_t width,
+		int (*compare) (const void *, const void *))
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+/*
+ * Like qunique(), but takes a comparator with an extra user data argument
+ * which is passed through, for compatibility with qsort_arg().
+ */
+static inline size_t
+qunique_arg(void *array, size_t elements, size_t width,
+			int (*compare) (const void *, const void *, void *),
+			void *arg)
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width, arg) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+#endif							/* QUNIQUE_H */
-- 
2.22.0

Reply via email to