intarray GiST index gets wrong answers for '{}' <@ anything

Tom Lane Tue, 06 Aug 2019 10:56:18 -0700

While looking at the pending patch for faster GIN index searches
on no-key queries, I was motivated to improve contrib/intarray's
regression test to exercise the GIN_SEARCH_MODE_ALL case, because
it didn't.  And then I thought well, let's try to bring the code
coverage of _int_gin.c up to something respectable, which led me
to the regression test additions shown in the attached.  And I
was astonished to observe that the GiST index cases mostly got
the wrong answer for the <@ query.  Sometimes they got the right
answer, but mostly not.  After some digging I saw that the problem
was that there are a number of empty arrays ('{}') in the data,
and those should surely all match the WHERE a <@ '{73,23,20}'
condition, but the GiST opclasses were not reliably finding them.


The reason appears to be that the condition for descending through a
non-leaf index key for the RTContainedBy case is incorrectly optimistic:
it supposes that we only need to descend into subtrees whose union key
overlaps the query array.  But this does not guarantee to find subtrees
that contain empty-array entries.  Worse, such entries could be anywhere
in the tree, and because of the way that the insertion penalty is
calculated, they probably are.  (We will compute a zero penalty to add
an empty array item to any subtree.)  The reason it sometimes works
seems to be that GiST randomizes its insertion decisions when there are
equal penalties (cf gistchoose()), and sometimes by luck it puts all
of the empty-array entries into subtrees that the existing rule will
search.

So as far as I can see, we have little choice but to lobotomize the
RTContainedBy case and force a whole-index search.  This applies to
both the gist__int_ops and gist__intbig_ops opclasses.  This is
pretty awful for any applications that are depending on such queries
to be fast, but it's hard to argue with "it gets the wrong answer,
and not even reproducibly so".

In the future we might think about removing <@ from these opclasses,
or making a non-backward-compatible change to segregate empty arrays
from everything else in the index.  But neither answer seems very
back-patchable, and I'm not really sure I want to put so much work
into a second-class-citizen contrib module anyway.

Comments?

                        regards, tom lane

diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c
index 13dd7ac..e5a8011 100644
--- a/contrib/intarray/_int_gist.c
+++ b/contrib/intarray/_int_gist.c
@@ -96,8 +96,13 @@ g_int_consistent(PG_FUNCTION_ARGS)
 				retval = inner_int_contains(query,
 											(ArrayType *) DatumGetPointer(entry->key));
 			else
-				retval = inner_int_overlap((ArrayType *) DatumGetPointer(entry->key),
-										   query);
+			{
+				/*
+				 * Unfortunately, because empty arrays could be anywhere in
+				 * the index, we must search the whole tree.
+				 */
+				retval = true;
+			}
 			break;
 		default:
 			retval = false;
diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c
index 8d3b3f5..2a20abe 100644
--- a/contrib/intarray/_intbig_gist.c
+++ b/contrib/intarray/_intbig_gist.c
@@ -567,7 +567,13 @@ g_intbig_consistent(PG_FUNCTION_ARGS)
 				}
 			}
 			else
-				retval = _intbig_overlap((GISTTYPE *) DatumGetPointer(entry->key), query);
+			{
+				/*
+				 * Unfortunately, because empty arrays could be anywhere in
+				 * the index, we must search the whole tree.
+				 */
+				retval = true;
+			}
 			break;
 		default:
 			retval = false;
diff --git a/contrib/intarray/expected/_int.out b/contrib/intarray/expected/_int.out
index 105c951..c92a565 100644
--- a/contrib/intarray/expected/_int.out
+++ b/contrib/intarray/expected/_int.out
@@ -431,6 +431,18 @@ SELECT count(*) from test__int WHERE a @> '{20,23}';
     12
 (1 row)
 
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+ count 
+-------
+    10
+(1 row)
+
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
+ count 
+-------
+     1
+(1 row)
+
 SELECT count(*) from test__int WHERE a @@ '50&68';
  count 
 -------
@@ -449,6 +461,19 @@ SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
     21
 (1 row)
 
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+ count 
+-------
+  6566
+(1 row)
+
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
+ count 
+-------
+  6343
+(1 row)
+
+SET enable_seqscan = off;  -- not all of these would use index by default
 CREATE INDEX text_idx on test__int using gist ( a gist__int_ops );
 SELECT count(*) from test__int WHERE a && '{23,50}';
  count 
@@ -480,6 +505,18 @@ SELECT count(*) from test__int WHERE a @> '{20,23}';
     12
 (1 row)
 
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+ count 
+-------
+    10
+(1 row)
+
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
+ count 
+-------
+     1
+(1 row)
+
 SELECT count(*) from test__int WHERE a @@ '50&68';
  count 
 -------
@@ -498,6 +535,18 @@ SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
     21
 (1 row)
 
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+ count 
+-------
+  6566
+(1 row)
+
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
+ count 
+-------
+  6343
+(1 row)
+
 DROP INDEX text_idx;
 CREATE INDEX text_idx on test__int using gist ( a gist__intbig_ops );
 SELECT count(*) from test__int WHERE a && '{23,50}';
@@ -530,6 +579,18 @@ SELECT count(*) from test__int WHERE a @> '{20,23}';
     12
 (1 row)
 
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+ count 
+-------
+    10
+(1 row)
+
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
+ count 
+-------
+     1
+(1 row)
+
 SELECT count(*) from test__int WHERE a @@ '50&68';
  count 
 -------
@@ -548,6 +609,18 @@ SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
     21
 (1 row)
 
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+ count 
+-------
+  6566
+(1 row)
+
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
+ count 
+-------
+  6343
+(1 row)
+
 DROP INDEX text_idx;
 CREATE INDEX text_idx on test__int using gin ( a gin__int_ops );
 SELECT count(*) from test__int WHERE a && '{23,50}';
@@ -580,6 +653,18 @@ SELECT count(*) from test__int WHERE a @> '{20,23}';
     12
 (1 row)
 
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+ count 
+-------
+    10
+(1 row)
+
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
+ count 
+-------
+     1
+(1 row)
+
 SELECT count(*) from test__int WHERE a @@ '50&68';
  count 
 -------
@@ -598,3 +683,16 @@ SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
     21
 (1 row)
 
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+ count 
+-------
+  6566
+(1 row)
+
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
+ count 
+-------
+  6343
+(1 row)
+
+RESET enable_seqscan;
diff --git a/contrib/intarray/sql/_int.sql b/contrib/intarray/sql/_int.sql
index 40225c6..6ca7e3c 100644
--- a/contrib/intarray/sql/_int.sql
+++ b/contrib/intarray/sql/_int.sql
@@ -85,9 +85,15 @@ SELECT count(*) from test__int WHERE a @@ '23|50';
 SELECT count(*) from test__int WHERE a @> '{23,50}';
 SELECT count(*) from test__int WHERE a @@ '23&50';
 SELECT count(*) from test__int WHERE a @> '{20,23}';
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
 SELECT count(*) from test__int WHERE a @@ '50&68';
 SELECT count(*) from test__int WHERE a @> '{20,23}' or a @> '{50,68}';
 SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
+
+SET enable_seqscan = off;  -- not all of these would use index by default
 
 CREATE INDEX text_idx on test__int using gist ( a gist__int_ops );
 
@@ -96,9 +102,13 @@ SELECT count(*) from test__int WHERE a @@ '23|50';
 SELECT count(*) from test__int WHERE a @> '{23,50}';
 SELECT count(*) from test__int WHERE a @@ '23&50';
 SELECT count(*) from test__int WHERE a @> '{20,23}';
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
 SELECT count(*) from test__int WHERE a @@ '50&68';
 SELECT count(*) from test__int WHERE a @> '{20,23}' or a @> '{50,68}';
 SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
 
 DROP INDEX text_idx;
 CREATE INDEX text_idx on test__int using gist ( a gist__intbig_ops );
@@ -108,9 +118,13 @@ SELECT count(*) from test__int WHERE a @@ '23|50';
 SELECT count(*) from test__int WHERE a @> '{23,50}';
 SELECT count(*) from test__int WHERE a @@ '23&50';
 SELECT count(*) from test__int WHERE a @> '{20,23}';
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
 SELECT count(*) from test__int WHERE a @@ '50&68';
 SELECT count(*) from test__int WHERE a @> '{20,23}' or a @> '{50,68}';
 SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
 
 DROP INDEX text_idx;
 CREATE INDEX text_idx on test__int using gin ( a gin__int_ops );
@@ -120,6 +134,12 @@ SELECT count(*) from test__int WHERE a @@ '23|50';
 SELECT count(*) from test__int WHERE a @> '{23,50}';
 SELECT count(*) from test__int WHERE a @@ '23&50';
 SELECT count(*) from test__int WHERE a @> '{20,23}';
+SELECT count(*) from test__int WHERE a <@ '{73,23,20}';
+SELECT count(*) from test__int WHERE a = '{73,23,20}';
 SELECT count(*) from test__int WHERE a @@ '50&68';
 SELECT count(*) from test__int WHERE a @> '{20,23}' or a @> '{50,68}';
 SELECT count(*) from test__int WHERE a @@ '(20&23)|(50&68)';
+SELECT count(*) from test__int WHERE a @@ '20 | !21';
+SELECT count(*) from test__int WHERE a @@ '!20 & !21';
+
+RESET enable_seqscan;

intarray GiST index gets wrong answers for '{}' <@ anything

Reply via email to