texteq, textne, byteaeq and byteane detoast their arguments, then check for
equality of length.  Unequal lengths imply the answer trivially; given equal
lengths, the functions proceed to compare the actual bytes.  We can skip
detoasting entirely when the lengths are unequal.  The attached patch implements
this.  As submitted, it applies atop my recent strncmp->memcmp patch, but
they are logically independent.  To benchmark some optimal and pessimal cases, I
used the attached "bench-skip-texteq.sql".  It uses a few datum sizes and varies
whether the length check succeeds:

bench-skip-texteq.sql, 10 MiB nomatch: 58.4s previous, 0.00664s patched
bench-skip-texteq.sql,  144 B   match: 73.0s previous, 71.9s patched
bench-skip-texteq.sql,    3 B   match: 68.8s previous, 67.3s patched
bench-skip-texteq.sql,    3 B nomatch: 45.0s previous, 46.0s patched

The timing differences in the smaller-length test cases are probably not
statistically significant.

Thanks,
nm
*** a/src/backend/utils/adt/varlena.c
--- b/src/backend/utils/adt/varlena.c
***************
*** 1451,1472 **** text_cmp(text *arg1, text *arg2)
  Datum
  texteq(PG_FUNCTION_ARGS)
  {
!       text       *arg1 = PG_GETARG_TEXT_PP(0);
!       text       *arg2 = PG_GETARG_TEXT_PP(1);
        bool            result;
  
!       /*
!        * Since we only care about equality or not-equality, we can avoid all 
the
!        * expense of strcoll() here, and just do bitwise comparison.
!        */
!       if (VARSIZE_ANY_EXHDR(arg1) != VARSIZE_ANY_EXHDR(arg2))
                result = false;
        else
!               result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2),
!                                                VARSIZE_ANY_EXHDR(arg1)) == 0);
! 
!       PG_FREE_IF_COPY(arg1, 0);
!       PG_FREE_IF_COPY(arg2, 1);
  
        PG_RETURN_BOOL(result);
  }
--- 1451,1475 ----
  Datum
  texteq(PG_FUNCTION_ARGS)
  {
!       Size            len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - 
VARHDRSZ;
!       Size            len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - 
VARHDRSZ;
        bool            result;
  
!       /* Fast path for different-length inputs: possibly skip detoast. */
!       if (len1 != len2)
                result = false;
        else
!       {
!               text       *arg1 = PG_GETARG_TEXT_PP(0);
!               text       *arg2 = PG_GETARG_TEXT_PP(1);
!               /*
!                * Since we only care about equality or not-equality, we can 
avoid all the
!                * expense of strcoll() here, and just do bitwise comparison.
!                */
!               result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) == 
0);
!               PG_FREE_IF_COPY(arg1, 0);
!               PG_FREE_IF_COPY(arg2, 1);
!       }
  
        PG_RETURN_BOOL(result);
  }
***************
*** 1474,1495 **** texteq(PG_FUNCTION_ARGS)
  Datum
  textne(PG_FUNCTION_ARGS)
  {
!       text       *arg1 = PG_GETARG_TEXT_PP(0);
!       text       *arg2 = PG_GETARG_TEXT_PP(1);
        bool            result;
  
!       /*
!        * Since we only care about equality or not-equality, we can avoid all 
the
!        * expense of strcoll() here, and just do bitwise comparison.
!        */
!       if (VARSIZE_ANY_EXHDR(arg1) != VARSIZE_ANY_EXHDR(arg2))
                result = true;
        else
!               result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2),
!                                                VARSIZE_ANY_EXHDR(arg1)) != 0);
! 
!       PG_FREE_IF_COPY(arg1, 0);
!       PG_FREE_IF_COPY(arg2, 1);
  
        PG_RETURN_BOOL(result);
  }
--- 1477,1501 ----
  Datum
  textne(PG_FUNCTION_ARGS)
  {
!       Size            len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - 
VARHDRSZ;
!       Size            len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - 
VARHDRSZ;
        bool            result;
  
!       /* Fast path for different-length inputs: possibly skip detoast. */
!       if (len1 != len2)
                result = true;
        else
!       {
!               text       *arg1 = PG_GETARG_TEXT_PP(0);
!               text       *arg2 = PG_GETARG_TEXT_PP(1);
!               /*
!                * Since we only care about equality or not-equality, we can 
avoid all the
!                * expense of strcoll() here, and just do bitwise comparison.
!                */
!               result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) != 
0);
!               PG_FREE_IF_COPY(arg1, 0);
!               PG_FREE_IF_COPY(arg2, 1);
!       }
  
        PG_RETURN_BOOL(result);
  }
***************
*** 2358,2380 **** SplitIdentifierString(char *rawstring, char separator,
  Datum
  byteaeq(PG_FUNCTION_ARGS)
  {
!       bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
!       bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
!       int                     len1,
!                               len2;
        bool            result;
  
!       len1 = VARSIZE_ANY_EXHDR(arg1);
!       len2 = VARSIZE_ANY_EXHDR(arg2);
! 
!       /* fast path for different-length inputs */
        if (len1 != len2)
                result = false;
        else
                result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) == 
0);
! 
!       PG_FREE_IF_COPY(arg1, 0);
!       PG_FREE_IF_COPY(arg2, 1);
  
        PG_RETURN_BOOL(result);
  }
--- 2364,2384 ----
  Datum
  byteaeq(PG_FUNCTION_ARGS)
  {
!       Size            len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - 
VARHDRSZ;
!       Size            len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - 
VARHDRSZ;
        bool            result;
  
!       /* Fast path for different-length inputs: possibly skip detoast. */
        if (len1 != len2)
                result = false;
        else
+       {
+               bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
+               bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
                result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) == 
0);
!               PG_FREE_IF_COPY(arg1, 0);
!               PG_FREE_IF_COPY(arg2, 1);
!       }
  
        PG_RETURN_BOOL(result);
  }
***************
*** 2382,2404 **** byteaeq(PG_FUNCTION_ARGS)
  Datum
  byteane(PG_FUNCTION_ARGS)
  {
!       bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
!       bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
!       int                     len1,
!                               len2;
        bool            result;
  
!       len1 = VARSIZE_ANY_EXHDR(arg1);
!       len2 = VARSIZE_ANY_EXHDR(arg2);
! 
!       /* fast path for different-length inputs */
        if (len1 != len2)
                result = true;
        else
                result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) != 
0);
! 
!       PG_FREE_IF_COPY(arg1, 0);
!       PG_FREE_IF_COPY(arg2, 1);
  
        PG_RETURN_BOOL(result);
  }
--- 2386,2406 ----
  Datum
  byteane(PG_FUNCTION_ARGS)
  {
!       Size            len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - 
VARHDRSZ;
!       Size            len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - 
VARHDRSZ;
        bool            result;
  
!       /* Fast path for different-length inputs: possibly skip detoast. */
        if (len1 != len2)
                result = true;
        else
+       {
+               bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
+               bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
                result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) != 
0);
!               PG_FREE_IF_COPY(arg1, 0);
!               PG_FREE_IF_COPY(arg2, 1);
!       }
  
        PG_RETURN_BOOL(result);
  }
\timing on
-- Benchmark "texteq".

BEGIN;

-- Fixture: a temp table with one "text" column holding 100 rows of roughly
-- 10 MiB each (a 10-byte pattern repeated 1024 * 1024 times, followed by a
-- short per-row numeric suffix).
CREATE TEMP TABLE t (c text);

INSERT INTO t (c)
SELECT repeat('foobarbazz', 1024 * 1024) || to_char(ser.n, '000000')
FROM generate_series(1, 100) AS ser (n);

-- Report the on-disk footprint of the fixture.
SELECT pg_size_pretty(pg_total_relation_size('t'));

-- Every comparison fails at the length check: the probe value is built from
-- 1024 * 1025 repetitions of the pattern, so it is longer than any stored row.
-- With the patch, all detoasts are skipped, which makes this the best case for
-- the patch.
CREATE FUNCTION pg_temp.try()
RETURNS void
LANGUAGE plpgsql
AS $$
BEGIN
    -- 30 full scans of t per call.
    FOR scan_no IN 1..30 LOOP
        PERFORM count(*)
        FROM t
        WHERE c = repeat('foobarbazz', 1024 * 1025);
    END LOOP;
END
$$;

-- Five separately timed runs.
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();

-- Discard the temp table and function.
ROLLBACK;


BEGIN;

-- 144-byte probe string (a 36-character alphabet repeated four times).
-- The tripled quotes make the psql variable's value include its own enclosing
-- single quotes, so :text144 expands to a complete SQL string literal.
-- NOTE: a psql meta-command ends at the newline, so the value must stay on the
-- same line as \set; splitting it leaves text144 empty and breaks the script.
\set text144 '''abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789'''

-- Table having a single "text" column and 3M rows @ 144 B each.
CREATE TEMP TABLE t (c text);
INSERT INTO t (c) SELECT :text144 FROM generate_series(1, 3000000);

-- Report the on-disk footprint of the fixture.
SELECT pg_size_pretty(pg_total_relation_size('t'));

-- Compare each row successfully.  No detoasting would ever be involved.  This
-- should reveal the simple-case overhead of the patch.
--
-- Parameter $1: the text value every row of t is compared against.
-- Each call performs 30 full scans of t and discards the counts.
CREATE FUNCTION pg_temp.try(text) RETURNS void LANGUAGE plpgsql AS $$
BEGIN
        FOR i IN 1..30 LOOP
                PERFORM count(*) FROM t WHERE c = $1;
        END LOOP;
END
$$;

-- Five separately timed runs.
SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);

-- Discard the temp table and function.
ROLLBACK;


BEGIN;

-- Fixture: a temp table with one "text" column and 5M tiny rows, each holding
-- the 3-byte value 'foo'.
CREATE TEMP TABLE t (c text);

INSERT INTO t (c)
SELECT 'foo'
FROM generate_series(1, 5000000);

-- Report the on-disk footprint of the fixture.
SELECT pg_size_pretty(pg_total_relation_size('t'));

-- Matching case: every row compares equal to the probe.  This gives another
-- view of the overhead the patch adds.
CREATE FUNCTION pg_temp.try()
RETURNS void
LANGUAGE plpgsql
AS $$
BEGIN
    -- 30 full scans of t per call.
    FOR scan_no IN 1..30 LOOP
        PERFORM count(*)
        FROM t
        WHERE c = 'foo';
    END LOOP;
END
$$;

-- Five separately timed runs of the matching case.
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();

-- Non-matching case: every comparison fails at the length check.  As patched,
-- detoasting the packed varlena is avoided, so the patch might win here.
CREATE FUNCTION pg_temp.tryfail()
RETURNS void
LANGUAGE plpgsql
AS $$
BEGIN
    -- 30 full scans of t per call.
    FOR scan_no IN 1..30 LOOP
        PERFORM count(*)
        FROM t
        WHERE c = 'notfoo';
    END LOOP;
END
$$;

-- Five separately timed runs of the non-matching case.
SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();

-- Discard the temp table and functions.
ROLLBACK;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to