Am 28.09.22 um 16:18 schrieb Tom Lane:
It is seeded at process start, yes. If you don't feel a need for user control over the sequence used by these functions, then using pg_global_prng_state is fine. (Basically the point to be made here is that we need to keep a tight rein on what can be affected by setseed().)regards, tom lane
New patch: array_shuffle() and array_sample() use pg_global_prng_state now. Martin
From b9433564f925521f5f6bcebd7cd74a3e12f4f354 Mon Sep 17 00:00:00 2001 From: Martin Kalcher <martin.kalc...@aboutsource.net> Date: Sun, 17 Jul 2022 18:06:04 +0200 Subject: [PATCH v5] Introduce array_shuffle() and array_sample() * array_shuffle() shuffles the elements of an array. * array_sample() chooses n elements from an array by random. --- doc/src/sgml/func.sgml | 34 +++++ src/backend/utils/adt/array_userfuncs.c | 176 ++++++++++++++++++++++++ src/include/catalog/pg_proc.dat | 6 + src/test/regress/expected/arrays.out | 54 ++++++++ src/test/regress/sql/arrays.sql | 14 ++ 5 files changed, 284 insertions(+) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index e1fe4fec57..6b2a3d16f6 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -18468,6 +18468,40 @@ SELECT NULLIF(value, '(none)') ... </para></entry> </row> + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>array_sample</primary> + </indexterm> + <function>array_sample</function> ( <parameter>array</parameter> <type>anyarray</type>, <parameter>n</parameter> <type>integer</type> ) + <returnvalue>anyarray</returnvalue> + </para> + <para> + Returns <parameter>n</parameter> randomly chosen elements from <parameter>array</parameter> in selection order. + </para> + <para> + <literal>array_sample(ARRAY[[1,2],[3,4],[5,6]], 2)</literal> + <returnvalue>{{5,6},{1,2}}</returnvalue> + </para></entry> + </row> + + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>array_shuffle</primary> + </indexterm> + <function>array_shuffle</function> ( <type>anyarray</type> ) + <returnvalue>anyarray</returnvalue> + </para> + <para> + Shuffles the first dimension of the array. + </para> + <para> + <literal>array_shuffle(ARRAY[[1,2],[3,4],[5,6]])</literal> + <returnvalue>{{5,6},{1,2},{3,4}}</returnvalue> + </para></entry> + </row> + <row> <entry role="func_table_entry"><para role="func_signature"> <indexterm id="function-array-to-string"> diff --git a/src/backend/utils/adt/array_userfuncs.c b/src/backend/utils/adt/array_userfuncs.c index ca70590d7d..4bf28da5e4 100644 --- a/src/backend/utils/adt/array_userfuncs.c +++ b/src/backend/utils/adt/array_userfuncs.c @@ -14,6 +14,7 @@ #include "catalog/pg_type.h" #include "common/int.h" +#include "common/pg_prng.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/lsyscache.h" @@ -902,3 +903,178 @@ array_positions(PG_FUNCTION_ARGS) PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext)); } + +/* + * Produce array with n randomly chosen items from given array in random order. + * + * array: array object to examine (must not be NULL) + * n: number of items (must not be greater than the size of the arrays first dimension) + * elmtyp, elmlen, elmbyval, elmalign: info for the datatype of the items + * + * NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info + * from the system catalogs, given the elmtyp. However, the caller is + * in a better position to cache this info across multiple uses, or even + * to hard-wire values if the element type is hard-wired. + */ +static ArrayType * +array_shuffle_n(ArrayType *array, int n, + Oid elmtyp, int elmlen, bool elmbyval, char elmalign) +{ + ArrayType *result; + int ndim, + *dims, + *lbs, + rdims[MAXDIM], + nelm, + nitem, + i, + j, + k; + Datum elm, + *elms, + *ielms, + *jelms; + bool nul, + *nuls, + *inuls, + *jnuls; + + ndim = ARR_NDIM(array); + dims = ARR_DIMS(array); + lbs = ARR_LBOUND(array); + + if (ndim < 1 || dims[0] < 1 || n < 1) + return construct_empty_array(elmtyp); + + deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign, + &elms, &nuls, &nelm); + + nitem = dims[0]; /* total number of items */ + nelm /= nitem; /* number of elements per item */ + + ielms = elms; + inuls = nuls; + + /* + * Shuffle array using Fisher-Yates algorithm. Iterate array and swap head + * (ielms) with an randomly chosen item (jelms) at each iteration. + */ + for (i = 0; i < n; i++) + { + j = (int) pg_prng_uint64_range(&pg_global_prng_state, 0, nitem - i - 1) * nelm; + jelms = ielms + j; + jnuls = inuls + j; + + /* Swap all elements in item (i) with elements in item (j). */ + for (k = 0; k < nelm; k++) + { + elm = *ielms; + nul = *inuls; + *ielms++ = *jelms; + *inuls++ = *jnuls; + *jelms++ = elm; + *jnuls++ = nul; + } + } + + memcpy(rdims, dims, ndim * sizeof(int)); + rdims[0] = n; + + result = construct_md_array(elms, nuls, ndim, rdims, lbs, + elmtyp, elmlen, elmbyval, elmalign); + + pfree(elms); + pfree(nuls); + + return result; +} + +/* + * Shuffle the elements of an array. + */ +Datum +array_shuffle(PG_FUNCTION_ARGS) +{ + ArrayType *array, + *result; + int16 elmlen; + bool elmbyval; + char elmalign; + Oid elmtyp; + TypeCacheEntry *typentry; + int n; + + array = PG_GETARG_ARRAYTYPE_P(0); + + /* Return empty array immediately. */ + if (ARR_NDIM(array) < 1) + PG_RETURN_ARRAYTYPE_P(array); + + n = ARR_DIMS(array)[0]; + + /* There is no point in shuffling arrays with less then two items. */ + if (n < 2) + PG_RETURN_ARRAYTYPE_P(array); + + elmtyp = ARR_ELEMTYPE(array); + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + elmlen = typentry->typlen; + elmbyval = typentry->typbyval; + elmalign = typentry->typalign; + + result = array_shuffle_n(array, n, + elmtyp, elmlen, elmbyval, elmalign); + + PG_FREE_IF_COPY(array, 0); + + PG_RETURN_ARRAYTYPE_P(result); +} + +/* + * Choose N random elements from an array. + */ +Datum +array_sample(PG_FUNCTION_ARGS) +{ + ArrayType *array, + *result; + int16 elmlen; + bool elmbyval; + char elmalign; + Oid elmtyp; + TypeCacheEntry *typentry; + int n, + nitem; + + array = PG_GETARG_ARRAYTYPE_P(0); + n = PG_GETARG_INT32(1); + nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0]; + + if (n < 0 || n > nitem) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("sample size must be between 0 and %d", nitem))); + + elmtyp = ARR_ELEMTYPE(array); + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + elmlen = typentry->typlen; + elmbyval = typentry->typbyval; + elmalign = typentry->typalign; + + result = array_shuffle_n(array, n, + elmtyp, elmlen, elmbyval, elmalign); + + PG_FREE_IF_COPY(array, 0); + + PG_RETURN_ARRAYTYPE_P(result); +} diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index a07e737a33..00c32c58e7 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1681,6 +1681,12 @@ proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8', proargtypes => 'internal oid internal int2 internal', prosrc => 'arraycontjoinsel' }, +{ oid => '8464', descr => 'shuffle array', + proname => 'array_shuffle', provolatile => 'v', proparallel => 'r', proisstrict => 't', + prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'array_shuffle' }, +{ oid => '8465', descr => 'take samples from array', + proname => 'array_sample', provolatile => 'v', proparallel => 'r', proisstrict => 't', + prorettype => 'anyarray', proargtypes => 'anyarray int4', prosrc => 'array_sample' }, { oid => '764', descr => 'large object import', proname => 'lo_import', provolatile => 'v', proparallel => 'u', diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index 97920f38c2..9dd365f22d 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -2447,3 +2447,57 @@ SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail ERROR: number of elements to trim must be between 0 and 3 SELECT trim_array(ARRAY[]::int[], 1); -- fail ERROR: number of elements to trim must be between 0 and 0 +-- array_shuffle +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}'; + ?column? +---------- + t +(1 row) + +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}'; + ?column? +---------- + t +(1 row) + +SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[])); + array_dims +------------- + [-1:2][2:3] +(1 row) + +SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[])); + array_dims +----------------- + [1:3][1:2][1:2] +(1 row) + +-- array_sample +SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}'; + ?column? +---------- + t +(1 row) + +SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1); + array_length +-------------- + 3 +(1 row) + +SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3)); + array_dims +------------- + [-1:1][2:3] +(1 row) + +SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2)); + array_dims +----------------- + [1:2][1:2][1:2] +(1 row) + +SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail +ERROR: sample size must be between 0 and 6 +SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail +ERROR: sample size must be between 0 and 6 diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index 791af5c0ce..766798bc62 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -755,3 +755,17 @@ FROM SELECT trim_array(ARRAY[1, 2, 3], -1); -- fail SELECT trim_array(ARRAY[1, 2, 3], 10); -- fail SELECT trim_array(ARRAY[]::int[], 1); -- fail + +-- array_shuffle +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) <@ '{1,2,3,4,5,6}'; +SELECT array_shuffle('{1,2,3,4,5,6}'::int[]) @> '{1,2,3,4,5,6}'; +SELECT array_dims(array_shuffle('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[])); +SELECT array_dims(array_shuffle('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[])); + +-- array_sample +SELECT array_sample('{1,2,3,4,5,6}'::int[], 3) <@ '{1,2,3,4,5,6}'; +SELECT array_length(array_sample('{1,2,3,4,5,6}'::int[], 3), 1); +SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[], 3)); +SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2)); +SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail +SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail -- 2.37.3