Re: [PATCH] Introduce array_shuffle() and array_sample()

Martin Kalcher Thu, 21 Jul 2022 10:29:35 -0700

Am 21.07.22 um 10:41 schrieb Dean Rasheed:


It's important to mark these new functions as VOLATILE, not IMMUTABLE,
otherwise they won't work as expected in queries. See
https://www.postgresql.org/docs/current/xfunc-volatility.html

It would be better to use pg_prng_uint64_range() rather than rand() to
pick elements. Partly, that's because it uses a higher quality PRNG,
with a larger internal state, and it ensures that the results are
unbiased across the range. But more importantly, it interoperates with
setseed(), allowing predictable sequences of "random" numbers to be
generated -- something that's useful in writing repeatable regression
tests.

Assuming these new functions are made to interoperate with setseed(),
which I think they should be, then they also need to be marked as
PARALLEL RESTRICTED, rather than PARALLEL SAFE. See
https://www.postgresql.org/docs/current/parallel-safety.html, which
explains why setseed() and random() are parallel restricted.

Here is an updated patch that marks the functions VOLATILE PARALLELRESTRICTED and uses pg_prng_uint64_range() rather than rand().

From 26676802f05d00c31e0b2d5997f61734aa421fca Mon Sep 17 00:00:00 2001
From: Martin Kalcher <martin.kalc...@aboutsource.net>
Date: Sun, 17 Jul 2022 18:06:04 +0200
Subject: [PATCH] Introduce array_shuffle() and array_sample()

* array_shuffle() shuffles the elements of an array.
* array_sample() chooses n elements from an array by random.

Shuffling of arrays can already be accomplished with SQL
using unnest() and array_agg(order by random()). But that is
very slow compared to the new functions. In addition the new functions
are dimension aware.
---
 doc/src/sgml/func.sgml               |  35 +++++
 src/backend/utils/adt/arrayfuncs.c   | 189 ++++++++++++++++++++++++++-
 src/include/catalog/pg_proc.dat      |   6 +
 src/test/regress/expected/arrays.out |  60 +++++++++
 src/test/regress/sql/arrays.sql      |  17 +++
 5 files changed, 306 insertions(+), 1 deletion(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b6783b7ad0..6b96897244 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -19395,6 +19395,41 @@ SELECT NULLIF(value, '(none)') ...
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>array_sample</primary>
+        </indexterm>
+        <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> )
+        <returnvalue>anyarray</returnvalue>
+       </para>
+       <para>
+        Returns <parameter>n</parameter> randomly chosen elements from <parameter>array</parameter>.
+        The order of the elements in resulting array is unspecified.
+       </para>
+       <para>
+        <literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal>
+        <returnvalue>{{5,6},{3,4}}</returnvalue>
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>array_shuffle</primary>
+        </indexterm>
+        <function>array_shuffle</function> ( <type>anyarray</type> )
+        <returnvalue>anyarray</returnvalue>
+       </para>
+       <para>
+        Shuffles the first dimension of the array.
+       </para>
+       <para>
+        <literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal>
+        <returnvalue>{{5,6},{3,4},{1,2}}</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm id="function-array-to-string">
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index fb167f226a..64da980348 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -34,7 +34,7 @@
 #include "utils/memutils.h"
 #include "utils/selfuncs.h"
 #include "utils/typcache.h"
-
+#include "common/pg_prng.h"
 
 /*
  * GUC parameter
@@ -6872,3 +6872,190 @@ trim_array(PG_FUNCTION_ARGS)
 
 	PG_RETURN_DATUM(result);
 }
+
+/*
+ * Produce array with max n random items from given array in random order.
+ *
+ * array: array object to examine (must not be NULL)
+ * n: max number of items
+ * elmtyp, elmlen, elmbyval, elmalign: info for the datatype of the items
+ *
+ * NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info
+ * from the system catalogs, given the elmtype.  However, the caller is
+ * in a better position to cache this info across multiple uses, or even
+ * to hard-wire values if the element type is hard-wired.
+ */
+static ArrayType *
+array_shuffle_n(ArrayType *array, int n,
+				Oid elmtyp, int elmlen, bool elmbyval, char elmalign)
+{
+	ArrayType  *result;
+	int			ndim,
+			   *dims,
+			   *lbs,
+				rdims[MAXDIM],
+				nelm,
+				nitem,
+				i,
+				j,
+				k;
+	Datum		elm,
+			   *elms,
+			   *relms;
+	bool		nul,
+			   *nuls,
+			   *rnuls;
+
+	ndim = ARR_NDIM(array);
+	dims = ARR_DIMS(array);
+	lbs = ARR_LBOUND(array);
+
+	if (ndim < 1 || dims[0] < 1 || n < 1)
+		return construct_empty_array(elmtyp);
+
+	deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+					  &relms, &rnuls, &nelm);
+
+	/* Calculate number of elements per item. */
+	nelm = (ndim > 1) ? ArrayGetNItems(ndim - 1, dims + 1) : 1;
+	elms = relms;
+	nuls = rnuls;
+	nitem = dims[0];
+	n = Min(n, nitem);
+
+	/*
+	 * Shuffle array using Fisher-Yates algorithm. Swap head with an randomly
+	 * chosen item and increment head.
+	 */
+	for (i = 0; i < n; i++)
+	{
+		k = (int) pg_prng_uint64_range(&pg_global_prng_state, 0, nitem - i - 1) * nelm;
+
+		/* Swap all elements in head with elements in item k. */
+		for (j = 0; j < nelm; j++)
+		{
+			elm = *elms;
+			nul = *nuls;
+			*elms = elms[k];
+			*nuls = nuls[k];
+			elms[k] = elm;
+			nuls[k] = nul;
+			elms++;
+			nuls++;
+		}
+	}
+
+	memcpy(rdims, dims, ndim * sizeof(int));
+	rdims[0] = n;
+
+	result = construct_md_array(relms, rnuls, ndim, rdims, lbs,
+								elmtyp, elmlen, elmbyval, elmalign);
+
+	pfree(relms);
+	pfree(rnuls);
+
+	return result;
+}
+
+/*
+ * Shuffle the elements of an array.
+ */
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+	ArrayType  *array,
+			   *result;
+	int16		elmlen;
+	bool		elmbyval;
+	char		elmalign;
+	Oid			elmtyp;
+	TypeCacheEntry *typentry;
+	int			n;
+
+	array = PG_GETARG_ARRAYTYPE_P(0);
+
+	/* Return empty array immediately. */
+	if (ARR_NDIM(array) < 1)
+		PG_RETURN_ARRAYTYPE_P(array);
+
+	n = ARR_DIMS(array)[0];
+
+	/* There is no point in shuffling arrays with less then two items. */
+	if (n < 2)
+		PG_RETURN_ARRAYTYPE_P(array);
+
+	elmtyp = ARR_ELEMTYPE(array);
+	typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+	if (typentry == NULL || typentry->type_id != elmtyp)
+	{
+		typentry = lookup_type_cache(elmtyp, 0);
+		fcinfo->flinfo->fn_extra = (void *) typentry;
+	}
+	elmlen = typentry->typlen;
+	elmbyval = typentry->typbyval;
+	elmalign = typentry->typalign;
+
+	result = array_shuffle_n(array, n,
+							 elmtyp, elmlen, elmbyval, elmalign);
+
+	PG_FREE_IF_COPY(array, 0);
+
+	PG_RETURN_ARRAYTYPE_P(result);
+}
+
+/*
+ * Choose N random elements from an array.
+ */
+Datum
+array_sample(PG_FUNCTION_ARGS)
+{
+	ArrayType  *array,
+			   *result;
+	int16		elmlen;
+	bool		elmbyval;
+	char		elmalign;
+	Oid			elmtyp;
+	TypeCacheEntry *typentry;
+	int			n;
+
+	array = PG_GETARG_ARRAYTYPE_P(0);
+	n = PG_GETARG_INT32(1);
+
+	if (n < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("second parameter must not be negative")));
+
+	elmtyp = ARR_ELEMTYPE(array);
+
+	/* Return an empty array if the requested sample size is zero. */
+	if (n == 0)
+	{
+		PG_FREE_IF_COPY(array, 0);
+		PG_RETURN_ARRAYTYPE_P(construct_empty_array(elmtyp));
+	}
+
+	/*
+	 * Return the original array if its size is less than or equal to the
+	 * requested sample size.
+	 */
+	if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] <= n)
+		PG_RETURN_ARRAYTYPE_P(array);
+
+	typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+	if (typentry == NULL || typentry->type_id != elmtyp)
+	{
+		typentry = lookup_type_cache(elmtyp, 0);
+		fcinfo->flinfo->fn_extra = (void *) typentry;
+	}
+	elmlen = typentry->typlen;
+	elmbyval = typentry->typbyval;
+	elmalign = typentry->typalign;
+
+	result = array_shuffle_n(array, n,
+							 elmtyp, elmlen, elmbyval, elmalign);
+
+	PG_FREE_IF_COPY(array, 0);
+
+	PG_RETURN_ARRAYTYPE_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 2e41f4d9e8..9933d10f6d 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1681,6 +1681,12 @@
   proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8',
   proargtypes => 'internal oid internal int2 internal',
   prosrc => 'arraycontjoinsel' },
+{ oid => '8464', descr => 'shuffle array',
+  proname => 'array_shuffle', provolatile => 'v', proparallel => 'r', proisstrict => 't',
+  prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'array_shuffle' },
+{ oid => '8465', descr => 'take samples from array',
+  proname => 'array_sample', provolatile => 'v', proparallel => 'r', proisstrict => 't',
+  prorettype => 'anyarray', proargtypes => 'anyarray int4', prosrc => 'array_sample' },
 
 { oid => '764', descr => 'large object import',
   proname => 'lo_import', provolatile => 'v', proparallel => 'u',
diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
index ce6f3a65f9..1eb7ef37cf 100644
--- a/src/test/regress/expected/arrays.out
+++ b/src/test/regress/expected/arrays.out
@@ -2445,3 +2445,63 @@ SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
 ERROR:  number of elements to trim must be between 0 and 3
 SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
 ERROR:  number of elements to trim must be between 0 and 3
+-- array_shuffle
+SELECT array_agg(a order by a)
+FROM
+(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a);
+    array_agg     
+------------------
+ {1,2,3,4,5,NULL}
+(1 row)
+
+SELECT array_dims(array_shuffle('{{1,2,3},{4,5,6}}'::int[][]));
+ array_dims 
+------------
+ [1:2][1:3]
+(1 row)
+
+SELECT array_dims(array_shuffle('[-1:1]={1,2,3}'::int[]));
+ array_dims 
+------------
+ [-1:1]
+(1 row)
+
+SELECT (array_shuffle('{{1,2},{3,4},{5,6}}'::int[][]))[1:][1] <@ '{1,3,5}'::int[];
+ ?column? 
+----------
+ t
+(1 row)
+
+-- array_sample
+SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3));
+ cardinality 
+-------------
+           3
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[];
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], 10) <@ '{1,2,3,4,5}'::int[];
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT array_dims(array_sample('{{1,2},{3,4},{4,5},{6,7}}'::int[][], 3));
+ array_dims 
+------------
+ [1:3][1:2]
+(1 row)
+
+SELECT array_dims(array_sample('[-1:1]={1,2,3}'::int[], 2));
+ array_dims 
+------------
+ [-1:0]
+(1 row)
+
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
+ERROR:  second parameter must not be negative
diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
index f774faf856..5daee9aa72 100644
--- a/src/test/regress/sql/arrays.sql
+++ b/src/test/regress/sql/arrays.sql
@@ -754,3 +754,20 @@ FROM
 
 SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail
 SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail
+
+-- array_shuffle
+SELECT array_agg(a order by a)
+FROM
+(SELECT unnest(array_shuffle('{NULL,1,2,3,4,5}'::int[]))) v(a);
+
+SELECT array_dims(array_shuffle('{{1,2,3},{4,5,6}}'::int[][]));
+SELECT array_dims(array_shuffle('[-1:1]={1,2,3}'::int[]));
+SELECT (array_shuffle('{{1,2},{3,4},{5,6}}'::int[][]))[1:][1] <@ '{1,3,5}'::int[];
+
+-- array_sample
+SELECT cardinality(array_sample('{NULL,1,2,3,4,5}'::int[], 3));
+SELECT array_sample('{1,2,3,4,5}'::int[], 3) <@ '{1,2,3,4,5}'::int[];
+SELECT array_sample('{1,2,3,4,5}'::int[], 10) <@ '{1,2,3,4,5}'::int[];
+SELECT array_dims(array_sample('{{1,2},{3,4},{4,5},{6,7}}'::int[][], 3));
+SELECT array_dims(array_sample('[-1:1]={1,2,3}'::int[], 2));
+SELECT array_sample('{1,2,3,4,5}'::int[], -1); -- fail
-- 
2.37.1

Re: [PATCH] Introduce array_shuffle() and array_sample()

Reply via email to