texteq, textne, byteaeq and byteane detoast their arguments, then check for equality of length. Unequal lengths imply the answer trivially; given equal lengths, the functions proceed to compare the actual bytes. Because the raw length of a toasted datum can be obtained without detoasting it, we can skip detoasting entirely when the lengths are unequal. The attached patch implements this. As submitted, it applies atop my recent strncmp->memcmp patch, but the two are logically independent. To benchmark some optimal and pessimal cases, I used the attached "bench-skip-texteq.sql". It uses a few datum sizes and varies whether the length check succeeds:
bench-skip-texteq.sql, 10 MiB nomatch: 58.4s previous, 0.00664s patched
bench-skip-texteq.sql, 144 B match: 73.0s previous, 71.9s patched
bench-skip-texteq.sql, 3 B match: 68.8s previous, 67.3s patched
bench-skip-texteq.sql, 3 B nomatch: 45.0s previous, 46.0s patched

The timing differences in the smaller-length test cases are probably not statistically significant.

Thanks,
nm
*** a/src/backend/utils/adt/varlena.c --- b/src/backend/utils/adt/varlena.c *************** *** 1451,1472 **** text_cmp(text *arg1, text *arg2) Datum texteq(PG_FUNCTION_ARGS) { ! text *arg1 = PG_GETARG_TEXT_PP(0); ! text *arg2 = PG_GETARG_TEXT_PP(1); bool result; ! /* ! * Since we only care about equality or not-equality, we can avoid all the ! * expense of strcoll() here, and just do bitwise comparison. ! */ ! if (VARSIZE_ANY_EXHDR(arg1) != VARSIZE_ANY_EXHDR(arg2)) result = false; else ! result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), ! VARSIZE_ANY_EXHDR(arg1)) == 0); ! ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); PG_RETURN_BOOL(result); } --- 1451,1475 ---- Datum texteq(PG_FUNCTION_ARGS) { ! Size len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ; ! Size len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - VARHDRSZ; bool result; ! /* Fast path for different-length inputs: possibly skip detoast. */ ! if (len1 != len2) result = false; else ! { ! text *arg1 = PG_GETARG_TEXT_PP(0); ! text *arg2 = PG_GETARG_TEXT_PP(1); ! /* ! * Since we only care about equality or not-equality, we can avoid all the ! * expense of strcoll() here, and just do bitwise comparison. ! */ ! result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) == 0); ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); ! } PG_RETURN_BOOL(result); } *************** *** 1474,1495 **** texteq(PG_FUNCTION_ARGS) Datum textne(PG_FUNCTION_ARGS) { ! text *arg1 = PG_GETARG_TEXT_PP(0); ! text *arg2 = PG_GETARG_TEXT_PP(1); bool result; ! /* ! * Since we only care about equality or not-equality, we can avoid all the ! * expense of strcoll() here, and just do bitwise comparison. ! */ ! if (VARSIZE_ANY_EXHDR(arg1) != VARSIZE_ANY_EXHDR(arg2)) result = true; else ! result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), ! VARSIZE_ANY_EXHDR(arg1)) != 0); ! ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); PG_RETURN_BOOL(result); } --- 1477,1501 ---- Datum textne(PG_FUNCTION_ARGS) { ! 
Size len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ; ! Size len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - VARHDRSZ; bool result; ! /* Fast path for different-length inputs: possibly skip detoast. */ ! if (len1 != len2) result = true; else ! { ! text *arg1 = PG_GETARG_TEXT_PP(0); ! text *arg2 = PG_GETARG_TEXT_PP(1); ! /* ! * Since we only care about equality or not-equality, we can avoid all the ! * expense of strcoll() here, and just do bitwise comparison. ! */ ! result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) != 0); ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); ! } PG_RETURN_BOOL(result); } *************** *** 2358,2380 **** SplitIdentifierString(char *rawstring, char separator, Datum byteaeq(PG_FUNCTION_ARGS) { ! bytea *arg1 = PG_GETARG_BYTEA_PP(0); ! bytea *arg2 = PG_GETARG_BYTEA_PP(1); ! int len1, ! len2; bool result; ! len1 = VARSIZE_ANY_EXHDR(arg1); ! len2 = VARSIZE_ANY_EXHDR(arg2); ! ! /* fast path for different-length inputs */ if (len1 != len2) result = false; else result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) == 0); ! ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); PG_RETURN_BOOL(result); } --- 2364,2384 ---- Datum byteaeq(PG_FUNCTION_ARGS) { ! Size len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ; ! Size len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - VARHDRSZ; bool result; ! /* Fast path for different-length inputs: possibly skip detoast. */ if (len1 != len2) result = false; else + { + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) == 0); ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); ! } PG_RETURN_BOOL(result); } *************** *** 2382,2404 **** byteaeq(PG_FUNCTION_ARGS) Datum byteane(PG_FUNCTION_ARGS) { ! bytea *arg1 = PG_GETARG_BYTEA_PP(0); ! bytea *arg2 = PG_GETARG_BYTEA_PP(1); ! int len1, ! len2; bool result; ! len1 = VARSIZE_ANY_EXHDR(arg1); ! 
len2 = VARSIZE_ANY_EXHDR(arg2); ! ! /* fast path for different-length inputs */ if (len1 != len2) result = true; else result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) != 0); ! ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); PG_RETURN_BOOL(result); } --- 2386,2406 ---- Datum byteane(PG_FUNCTION_ARGS) { ! Size len1 = toast_raw_datum_size(PG_GETARG_DATUM(0)) - VARHDRSZ; ! Size len2 = toast_raw_datum_size(PG_GETARG_DATUM(1)) - VARHDRSZ; bool result; ! /* Fast path for different-length inputs: possibly skip detoast. */ if (len1 != len2) result = true; else + { + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); result = (memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), len1) != 0); ! PG_FREE_IF_COPY(arg1, 0); ! PG_FREE_IF_COPY(arg2, 1); ! } PG_RETURN_BOOL(result); }
\timing on

-- Benchmark for the "texteq" operator (text = text), run through psql.
--
-- Each scenario lives in its own transaction that is rolled back at the end,
-- so the temp table and the pg_temp helper functions never outlive it.  Every
-- helper runs the same count(*) scan 30 times, and each helper is invoked five
-- times so that \timing prints five samples per scenario.

------------------------------------------------------------------------------
-- Scenario 1: 10 MiB values, length check fails.
------------------------------------------------------------------------------
BEGIN;

-- Single "text" column, 100 rows of roughly 10 MiB each; every row gets a
-- distinct numeric suffix.
CREATE TEMP TABLE t (c text);

INSERT INTO t (c)
SELECT repeat('foobarbazz', 1024 * 1024) || to_char(idx, '000000')
FROM generate_series(1,100) AS seq(idx);

SELECT pg_size_pretty(pg_total_relation_size('t'));

-- The probe value (1024 * 1025 repetitions) is longer than every stored row,
-- so each comparison is decided by the length check alone.  With the patch
-- no detoasting happens at all: this is the patch's best case.
CREATE FUNCTION pg_temp.try() RETURNS void AS $body$
BEGIN
    FOR iter IN 1..30 LOOP
        PERFORM count(*)
        FROM t
        WHERE c = repeat('foobarbazz', 1024 * 1025);
    END LOOP;
END
$body$ LANGUAGE plpgsql;

SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();

ROLLBACK;

------------------------------------------------------------------------------
-- Scenario 2: 144 B values, every comparison matches.
------------------------------------------------------------------------------
BEGIN;

-- psql variable holding a quoted 144-character literal (4 x 36 characters).
\set text144 '''abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789'''

-- Single "text" column, 3M rows of 144 B each.
CREATE TEMP TABLE t (c text);

INSERT INTO t (c)
SELECT :text144
FROM generate_series(1,3000000);

SELECT pg_size_pretty(pg_total_relation_size('t'));

-- Every row equals the probe, so the full byte comparison always runs and
-- no detoasting is ever involved.  This exposes the patch's overhead in the
-- simple case.
CREATE FUNCTION pg_temp.try(text) RETURNS void AS $body$
BEGIN
    FOR iter IN 1..30 LOOP
        PERFORM count(*)
        FROM t
        WHERE c = $1;
    END LOOP;
END
$body$ LANGUAGE plpgsql;

SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);
SELECT pg_temp.try(:text144);

ROLLBACK;

------------------------------------------------------------------------------
-- Scenario 3: tiny (3 B) values, first matching and then failing.
------------------------------------------------------------------------------
BEGIN;

-- Single "text" column, 5M rows each holding 'foo'.
CREATE TEMP TABLE t (c text);

INSERT INTO t (c)
SELECT 'foo'
FROM generate_series(1,5000000);

SELECT pg_size_pretty(pg_total_relation_size('t'));

-- Matching probe: a second look at the patch's overhead when the bytes must
-- be compared anyway.
CREATE FUNCTION pg_temp.try() RETURNS void AS $body$
BEGIN
    FOR iter IN 1..30 LOOP
        PERFORM count(*)
        FROM t
        WHERE c = 'foo';
    END LOOP;
END
$body$ LANGUAGE plpgsql;

SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();
SELECT pg_temp.try();

-- Non-matching probe of a different length: the comparison fails at the
-- length check.  As patched, the packed varlena is not detoasted, so the
-- patch might come out ahead here.
CREATE FUNCTION pg_temp.tryfail() RETURNS void AS $body$
BEGIN
    FOR iter IN 1..30 LOOP
        PERFORM count(*)
        FROM t
        WHERE c = 'notfoo';
    END LOOP;
END
$body$ LANGUAGE plpgsql;

SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();
SELECT pg_temp.tryfail();

ROLLBACK;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers