On 02.07.2011 21:07, Tom Lane wrote:
> I wrote:
>> I tweaked the patch a bit further (don't pessimize the boundary case
>> where there's exactly 4n+1 trigrams, avoid forcing trg1 into memory,
>> improve the comment) and attach that version below. This is a little
>> bit faster but I still wonder if it's worth the price of adding an
>> obscure dependency on the header size.
> It occurred to me that we could very easily remove that objection by
> making the code dynamically detect when it's reached a suitably aligned
> trigram. The attached version of the patch does it that way. It seems
> to be a percent or so slower than my previous version, but I think that
> making the function robust against header changes is probably well worth
> that price.
Ah, that opens the door to something I considered earlier but rejected
because of alignment: instead of three 32-bit word fetches, we could
fetch one 64-bit word and one 32-bit word. Might shave a few more
cycles...
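Something like this, perhaps (an untested sketch of the little-endian
case only, using a hypothetical char * cursor p; note that with a
12-byte stride the 64-bit load would be 8-byte aligned only on every
other iteration, so this presumes the platform tolerates unaligned
64-bit fetches):

	uint64		w64;
	uint32		w32;

	/* fetch 12 bytes = four trigrams with two loads instead of three */
	w64 = *(uint64 *) p;
	w32 = *(uint32 *) (p + sizeof(uint64));
	p += 3 * sizeof(uint32);

	trg1 = (uint32) w64 & 0x00FFFFFF;
	trg2 = ((uint32) (w64 >> 24)) & 0x00FFFFFF;
	trg3 = ((uint32) (w64 >> 48)) | ((w32 & 0x000000FF) << 16);
	trg4 = w32 >> 8;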
Meanwhile, I experimented with optimizing the other part of the loop:
the HASH() macro invocations that set the right bits in the signature.
With the default compile-time settings, the signature array is 95 bits.
Currently, it's stored in a BITVEC, which is a byte array, but we could
store it in two 64-bit integers, which makes it possible to write SETBIT
differently. I tried a few approaches; the first was essentially this:
+ /* Set the nth bit in the signature, in s1 or s2 */
+ #define HASH_S(h) \
+ do { \
+ 	unsigned int n = HASHVAL(h); \
+ 	if (n < 64) \
+ 		s1 |= (uint64) 1 << n; \
+ 	else \
+ 		s2 |= (uint64) 1 << (n - 64); \
+ } while(0)
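In that variant, the inner loop then just calls HASH_S() on each
extracted trigram, with s1 and s2 in local variables, and the two words
are copied back into the caller's BITVEC at the end. Roughly (sketch
only; the copy-back as written matches the BITVEC's bit layout only on
little-endian hardware, which is what led to the endianness-aware
lookup tables in the attached patch):

	HASH_S(trg1);
	HASH_S(trg2);
	HASH_S(trg3);
	HASH_S(trg4);
	...
	/* at the end, copy the two words into the byte-array signature */
	memcpy(sign, &s1, sizeof(uint64));
	memcpy(((char *) sign) + sizeof(uint64), &s2, sizeof(uint64));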
That was a bit faster on my x64 laptop, but slightly slower on my ia64
HP-UX box. My second try was to use lookup tables, patch attached. That
was yet faster on x64, and a small win on the ia64 box too. I'm not sure
it's worth the added code complexity, though.
Here's a summary of the timings I got with different versions:
ia64 HP-UX (anole):
  unpatched:           11194.038 ms
  fast_makesign-tom:   10064.980 ms
  fast_makesign-2int:  10649.726 ms
  fast_makesign-tbl:    9951.547 ms

x64 laptop:
  unpatched:            4595.209 ms
  fast_makesign-tom:    3346.548 ms
  fast_makesign-2int:   3102.874 ms
  fast_makesign-tbl:    2997.854 ms
I used the same "REINDEX INDEX i_words" test I used earlier, repeated
each run a couple of times, and took the lowest number.
fast_makesign-tom is the first patch you posted; I haven't tested your
latest one. fast_makesign-2int is with the HASH_S() macro above, and
fast_makesign-tbl is with the attached patch.
PS. In case you wonder why the HP-UX box is so much slower than my
laptop: it isn't really meant for performance testing. It's just
something I happen to have access to; I think it's a virtual machine of
some sort. The numbers are very repeatable, however.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 82,88 **** gtrgm_out(PG_FUNCTION_ARGS)
}
static void
! makesign(BITVECP sign, TRGM *a)
{
int4 k,
len = ARRNELEM(a);
--- 82,88 ----
}
static void
! origmakesign(BITVECP sign, TRGM *a)
{
int4 k,
len = ARRNELEM(a);
***************
*** 98,103 **** makesign(BITVECP sign, TRGM *a)
--- 98,316 ----
}
}
+ /*
+ * Lookup tables for setting nth bit in a 128-bit bit array, where the
+ * bit array is stored in two 64-bit integers. That representation is used
+ * inside makesign(); elsewhere the signature is stored in a BITVEC, i.e.
+ * a byte array, in little-endian order. To be compatible with the byte
+ * array representation, the 64-bit integers are in little-endian order
+ * regardless of the native endianness of the platform.
+ */
+ #ifdef WORDS_BIGENDIAN
+ #define SET_NTH_BIT_LOOKUP \
+ 1LL<<0x38, 1LL<<0x39, 1LL<<0x3A, 1LL<<0x3B, \
+ 1LL<<0x3C, 1LL<<0x3D, 1LL<<0x3E, 1LL<<0x3F, \
+ \
+ 1LL<<0x30, 1LL<<0x31, 1LL<<0x32, 1LL<<0x33, \
+ 1LL<<0x34, 1LL<<0x35, 1LL<<0x36, 1LL<<0x37, \
+ \
+ 1LL<<0x28, 1LL<<0x29, 1LL<<0x2A, 1LL<<0x2B, \
+ 1LL<<0x2C, 1LL<<0x2D, 1LL<<0x2E, 1LL<<0x2F, \
+ \
+ 1LL<<0x20, 1LL<<0x21, 1LL<<0x22, 1LL<<0x23, \
+ 1LL<<0x24, 1LL<<0x25, 1LL<<0x26, 1LL<<0x27, \
+ \
+ 1LL<<0x18, 1LL<<0x19, 1LL<<0x1A, 1LL<<0x1B, \
+ 1LL<<0x1C, 1LL<<0x1D, 1LL<<0x1E, 1LL<<0x1F, \
+ \
+ 1LL<<0x10, 1LL<<0x11, 1LL<<0x12, 1LL<<0x13, \
+ 1LL<<0x14, 1LL<<0x15, 1LL<<0x16, 1LL<<0x17, \
+ \
+ 1LL<<0x08, 1LL<<0x09, 1LL<<0x0A, 1LL<<0x0B, \
+ 1LL<<0x0C, 1LL<<0x0D, 1LL<<0x0E, 1LL<<0x0F, \
+ \
+ 1LL<<0x00, 1LL<<0x01, 1LL<<0x02, 1LL<<0x03, \
+ 1LL<<0x04, 1LL<<0x05, 1LL<<0x06, 1LL<<0x07
+ #else
+ #define SET_NTH_BIT_LOOKUP \
+ 1LL<<0x00, 1LL<<0x01, 1LL<<0x02, 1LL<<0x03, \
+ 1LL<<0x04, 1LL<<0x05, 1LL<<0x06, 1LL<<0x07, \
+ \
+ 1LL<<0x08, 1LL<<0x09, 1LL<<0x0A, 1LL<<0x0B, \
+ 1LL<<0x0C, 1LL<<0x0D, 1LL<<0x0E, 1LL<<0x0F, \
+ \
+ 1LL<<0x10, 1LL<<0x11, 1LL<<0x12, 1LL<<0x13, \
+ 1LL<<0x14, 1LL<<0x15, 1LL<<0x16, 1LL<<0x17, \
+ \
+ 1LL<<0x18, 1LL<<0x19, 1LL<<0x1A, 1LL<<0x1B, \
+ 1LL<<0x1C, 1LL<<0x1D, 1LL<<0x1E, 1LL<<0x1F, \
+ \
+ 1LL<<0x20, 1LL<<0x21, 1LL<<0x22, 1LL<<0x23, \
+ 1LL<<0x24, 1LL<<0x25, 1LL<<0x26, 1LL<<0x27, \
+ \
+ 1LL<<0x28, 1LL<<0x29, 1LL<<0x2A, 1LL<<0x2B, \
+ 1LL<<0x2C, 1LL<<0x2D, 1LL<<0x2E, 1LL<<0x2F, \
+ \
+ 1LL<<0x30, 1LL<<0x31, 1LL<<0x32, 1LL<<0x33, \
+ 1LL<<0x34, 1LL<<0x35, 1LL<<0x36, 1LL<<0x37, \
+ \
+ 1LL<<0x38, 1LL<<0x39, 1LL<<0x3A, 1LL<<0x3B, \
+ 1LL<<0x3C, 1LL<<0x3D, 1LL<<0x3E, 1LL<<0x3F
+ #endif
+
+ /* Lookup tables for setting the low and high 64 bits of the signature. */
+ static const uint64 lowsbox[128] =
+ {
+ /* 0-63 */
+ SET_NTH_BIT_LOOKUP,
+
+ /* 64-SIGLENBIT are zeros */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+
+ static const uint64 highsbox[128] =
+ {
+ /* 0-63 are zeros. */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ /* 64-SIGLENBIT */
+ SET_NTH_BIT_LOOKUP,
+ };
+
+ /* Set the nth bit in the signature, stored in two uint64s. */
+ #define SETBIT_S(high, low, hash) \
+ do { \
+ unsigned int n = (hash); \
+ high |= highsbox[n]; \
+ low |= lowsbox[n]; \
+ } while(0)
+
+ static void
+ makesign(BITVECP sign, TRGM *a)
+ {
+ int4 len = ARRNELEM(a);
+ trgm *ptr = GETARR(a);
+ char *p;
+ char *endptr;
+ uint32 w1,
+ w2,
+ w3;
+ uint32 trg0 = 0,
+ trg1,
+ trg2,
+ trg3,
+ trg4;
+ uint32 *p32;
+
+ /*
+ * highsig and lowsig contain the signature we're calculating. Together
+ * they form one bit-array of 128 bits.
+ */
+ uint64 highsig = 0;
+ uint64 lowsig = 0;
+
+ SETBIT_S(highsig, lowsig, SIGLENBIT); /* set last unused bit */
+
+ if (len <= 0)
+ goto end;
+
+ /*----------
+ * We have to extract each trigram into a uint32, and calculate the HASH.
+ * This would be a lot easier if the trigrams were aligned on 4-byte
+ * boundaries, but they're not. The simple way would be to copy each
+ * trigram byte-by-byte, but that is quite slow, and this function is a
+ * hotspot in penalty calculations.
+ *
+ * The first trigram in the array doesn't begin at a 4-byte boundary, as
+ * the flags byte comes first; but the next one does. So we fetch the
+ * first trigram as a special case, and after that each four trigrams fall
+ * onto 4-byte words like this:
+ *
+ * w1 w2 w3
+ * AAAB BBCC CDDD
+ *
+ * As long as there are at least four trigrams left to process, we fetch
+ * the next three words and extract the trigrams from them with bit
+ * operations, per the above diagram. The last few trigrams are handled
+ * one at a time with byte-by-byte fetching.
+ *
+ * Note that this code yields different results on big-endian and
+ * little-endian machines, because the bytes of each trigram are loaded
+ * into a uint32 in memory order and left-justified. That's probably
+ * undesirable, but changing this behavior would break existing indexes.
+ *----------
+ */
+ endptr = (char *) (ptr + len);
+ p32 = (uint32 *) (((char *) ptr) - 1);
+
+ /* Fetch and extract the initial word */
+ w1 = *(p32++);
+ #ifdef WORDS_BIGENDIAN
+ trg1 = w1 << 8;
+ #else
+ trg1 = w1 >> 8;
+ #endif
+ SETBIT_S(highsig, lowsig, HASHVAL(trg1));
+
+ while ((char *) p32 <= endptr - 3 * sizeof(uint32))
+ {
+ w1 = *(p32++);
+ w2 = *(p32++);
+ w3 = *(p32++);
+
+ #ifdef WORDS_BIGENDIAN
+ trg1 = w1 & 0xFFFFFF00;
+ trg2 = (w1 << 24) | ((w2 & 0xFFFF0000) >> 8);
+ trg3 = ((w2 & 0x0000FFFF) << 16) | ((w3 & 0xFF000000) >> 16);
+ trg4 = w3 << 8;
+ #else
+ trg1 = w1 & 0x00FFFFFF;
+ trg2 = (w1 >> 24) | ((w2 & 0x0000FFFF) << 8);
+ trg3 = ((w2 & 0xFFFF0000) >> 16) | ((w3 & 0x000000FF) << 16);
+ trg4 = w3 >> 8;
+ #endif
+
+ SETBIT_S(highsig, lowsig, HASHVAL(trg1));
+ SETBIT_S(highsig, lowsig, HASHVAL(trg2));
+ SETBIT_S(highsig, lowsig, HASHVAL(trg3));
+ SETBIT_S(highsig, lowsig, HASHVAL(trg4));
+ }
+
+ /* Handle the remaining 0-3 trigrams the slow way */
+ p = (char *) p32;
+ while (p < endptr)
+ {
+ CPTRGM(((char *) &trg0), p);
+ SETBIT_S(highsig, lowsig, HASHVAL(trg0));
+ p += 3;
+ }
+
+ end:
+ memcpy((char *) sign, &lowsig, sizeof(uint64));
+ memcpy(((char *) sign) + sizeof(uint64), &highsig, sizeof(uint64));
+
+ #ifdef VERIFY_SIG
+ {
+ BITVEC osign;
+ origmakesign(osign, a);
+
+ if (memcmp(sign, osign, sizeof(BITVEC)) != 0)
+ {
+ elog(LOG, "mismatch: %lx %lx", highsig, lowsig);
+ elog(LOG, "orig: %lx %lx", ((uint64 *)osign)[0], ((uint64 *)osign)[1]);
+ }
+ }
+ #endif
+ }
+
Datum
gtrgm_compress(PG_FUNCTION_ARGS)
{