On 02.07.2011 21:07, Tom Lane wrote:
> I wrote:
>> I tweaked the patch a bit further (don't pessimize the boundary case
>> where there's exactly 4n+1 trigrams, avoid forcing trg1 into memory,
>> improve the comment) and attach that version below. This is a little
>> bit faster but I still wonder if it's worth the price of adding an
>> obscure dependency on the header size.
> It occurred to me that we could very easily remove that objection by
> making the code dynamically detect when it's reached a suitably aligned
> trigram. The attached version of the patch does it that way. It seems
> to be a percent or so slower than my previous version, but I think that
> making the function robust against header changes is probably well worth
> that price.
Ah, that opens the door to something I considered earlier but rejected
because of alignment: instead of three 32-bit word fetches, we could
fetch one 64-bit word and one 32-bit word. Might shave a few more
cycles...
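Something like this, perhaps (an untested sketch of the little-endian
case only, using a hypothetical char * cursor p; note that with a
12-byte stride the 64-bit load would be 8-byte aligned only on every
other iteration, so this presumes the platform tolerates unaligned
64-bit fetches):

	uint64		w64;
	uint32		w32;

	/* fetch 12 bytes = four trigrams with two loads instead of three */
	w64 = *(uint64 *) p;
	w32 = *(uint32 *) (p + sizeof(uint64));
	p += 3 * sizeof(uint32);

	trg1 = (uint32) w64 & 0x00FFFFFF;
	trg2 = ((uint32) (w64 >> 24)) & 0x00FFFFFF;
	trg3 = ((uint32) (w64 >> 48)) | ((w32 & 0x000000FF) << 16);
	trg4 = w32 >> 8;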
Meanwhile, I experimented with optimizing the other part of the loop:
the HASH() macro invocations that set the right bits in the signature.
With the default compile-time settings, the signature array is 95 bits.
Currently, it's stored in a BITVEC, which is a byte array, but we could
store it in two 64-bit integers, which makes it possible to write SETBIT
differently. I tried a few approaches; the first was essentially this:
+ /* Set the nth bit in the signature, in s1 or s2 */
+ #define HASH_S(h) \
+ do { \
+ 	unsigned int n = HASHVAL(h); \
+ 	if (n < 64) \
+ 		s1 |= (uint64) 1 << n; \
+ 	else \
+ 		s2 |= (uint64) 1 << (n - 64); \
+ } while(0)
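In that variant, the inner loop then just calls HASH_S() on each
extracted trigram, with s1 and s2 in local variables, and the two words
are copied back into the caller's BITVEC at the end. Roughly (sketch
only; the copy-back as written matches the BITVEC's bit layout only on
little-endian hardware, which is what led to the endianness-aware
lookup tables in the attached patch):

	HASH_S(trg1);
	HASH_S(trg2);
	HASH_S(trg3);
	HASH_S(trg4);
	...
	/* at the end, copy the two words into the byte-array signature */
	memcpy(sign, &s1, sizeof(uint64));
	memcpy(((char *) sign) + sizeof(uint64), &s2, sizeof(uint64));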
That was a bit faster on my x64 laptop, but slightly slower on my ia64
HP-UX box. My second try was to use lookup tables, patch attached. That
was yet faster on x64, and a small win on the ia64 box too. I'm not sure
it's worth the added code complexity, though.
Here's a summary of the timings I got with different versions:
ia64 HP-UX (anole):
  unpatched:           11194.038 ms
  fast_makesign-tom:   10064.980 ms
  fast_makesign-2int:  10649.726 ms
  fast_makesign-tbl:    9951.547 ms

x64 laptop:
  unpatched:            4595.209 ms
  fast_makesign-tom:    3346.548 ms
  fast_makesign-2int:   3102.874 ms
  fast_makesign-tbl:    2997.854 ms
I used the same "REINDEX INDEX i_words" test I used earlier, repeated
each run a couple of times, and took the lowest number.
fast_makesign-tom is the first patch you posted; I haven't tested your
latest one. fast_makesign-2int is with the HASH_S() macro above, and
fast_makesign-tbl is with the attached patch.
PS. In case you wonder why the HP-UX box is so much slower than my
laptop: it isn't really meant for performance testing. It's just
something I happen to have access to; I think it's a virtual machine of
some sort. The numbers are very repeatable, however.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 82,88 **** gtrgm_out(PG_FUNCTION_ARGS)
}
static void
! makesign(BITVECP sign, TRGM *a)
{
int4 k,
len = ARRNELEM(a);
--- 82,88 ----
}
static void
! origmakesign(BITVECP sign, TRGM *a)
{
int4 k,
len = ARRNELEM(a);
***************
*** 98,103 **** makesign(BITVECP sign, TRGM *a)
--- 98,316 ----
}
}
+ /*
+ * Lookup tables for setting nth bit in a 128-bit bit array, where the
+ * bit array is stored in two 64-bit integers. That representation is used
+ * inside makesign(); elsewhere the signature is stored in a BITVEC, i.e.
+ * a byte array, in little-endian order. To be compatible with the byte
+ * array representation, the 64-bit integers are in little-endian order
+ * regardless of the native endianness of the platform.
+ */
+ #ifdef WORDS_BIGENDIAN
+ #define SET_NTH_BIT_LOOKUP \
+ 1LL<<0x38, 1LL<<0x39, 1LL<<0x3A, 1LL<<0x3B, \
+ 1LL<<0x3C, 1LL<<0x3D, 1LL<<0x3E, 1LL<<0x3F, \
+ \
+ 1LL<<0x30, 1LL<<0x31, 1LL<<0x32, 1LL<<0x33, \
+ 1LL<<0x34, 1LL<<0x35, 1LL<<0x36, 1LL<<0x37, \
+ \
+ 1LL<<0x28, 1LL<<0x29, 1LL<<0x2A, 1LL<<0x2B, \
+ 1LL<<0x2C, 1LL<<0x2D, 1LL<<0x2E, 1LL<<0x2F, \
+ \
+ 1LL<<0x20, 1LL<<0x21, 1LL<<0x22, 1LL<<0x23, \
+ 1LL<<0x24, 1LL<<0x25, 1LL<<0x26, 1LL<<0x27, \
+ \
+ 1LL<<0x18, 1LL<<0x19, 1LL<<0x1A, 1LL<<0x1B, \
+ 1LL<<0x1C, 1LL<<0x1D, 1LL<<0x1E, 1LL<<0x1F, \
+ \
+ 1LL<<0x10, 1LL<<0x11, 1LL<<0x12, 1LL<<0x13, \
+ 1LL<<0x14, 1LL<<0x15, 1LL<<0x16, 1LL<<0x17, \
+ \
+ 1LL<<0x08, 1LL<<0x09, 1LL<<0x0A, 1LL<<0x0B, \
+ 1LL<<0x0C, 1LL<<0x0D, 1LL<<0x0E, 1LL<<0x0F, \
+ \
+ 1LL<<0x00, 1LL<<0x01, 1LL<<0x02, 1LL<<0x03, \
+ 1LL<<0x04, 1LL<<0x05, 1LL<<0x06, 1LL<<0x07
+ #else
+ #define SET_NTH_BIT_LOOKUP \
+ 1LL<<0x00, 1LL<<0x01, 1LL<<0x02, 1LL<<0x03, \
+ 1LL<<0x04, 1LL<<0x05, 1LL<<0x06, 1LL<<0x07, \
+ \
+ 1LL<<0x08, 1LL<<0x09, 1LL<<0x0A, 1LL<<0x0B, \
+ 1LL<<0x0C, 1LL<<0x0D, 1LL<<0x0E, 1LL<<0x0F, \
+ \
+ 1LL<<0x10, 1LL<<0x11, 1LL<<0x12, 1LL<<0x13, \
+ 1LL<<0x14, 1LL<<0x15, 1LL<<0x16, 1LL<<0x17, \
+ \
+ 1LL<<0x18, 1LL<<0x19, 1LL<<0x1A, 1LL<<0x1B, \
+ 1LL<<0x1C, 1LL<<0x1D, 1LL<<0x1E, 1LL<<0x1F, \
+ \
+ 1LL<<0x20, 1LL<<0x21, 1LL<<0x22, 1LL<<0x23, \
+ 1LL<<0x24, 1LL<<0x25, 1LL<<0x26, 1LL<<0x27, \
+ \
+ 1LL<<0x28, 1LL<<0x29, 1LL<<0x2A, 1LL<<0x2B, \
+ 1LL<<0x2C, 1LL<<0x2D, 1LL<<0x2E, 1LL<<0x2F, \
+ \
+ 1LL<<0x30, 1LL<<0x31, 1LL<<0x32, 1LL<<0x33, \
+ 1LL<<0x34, 1LL<<0x35, 1LL<<0x36, 1LL<<0x37, \
+ \
+ 1LL<<0x38, 1LL<<0x39, 1LL<<0x3A, 1LL<<0x3B, \
+ 1LL<<0x3C, 1LL<<0x3D, 1LL<<0x3E, 1LL<<0x3F
+ #endif
+
+ /* Lookup tables for setting the low and high 64 bits of the signature. */
+ static const uint64 lowsbox[128] =
+ {
+ /* 0-63 */
+ SET_NTH_BIT_LOOKUP,
+
+ /* 64-SIGLENBIT are zeros */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+
+ static const uint64 highsbox[128] =
+ {
+ /* 0-63 are zeros. */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+ /* 64-SIGLENBIT */
+ SET_NTH_BIT_LOOKUP,
+ };
+
+ /* Set the nth bit in the signature, stored in two uint64s. */
+ #define SETBIT_S(high, low, hash) \
+ do { \
+ unsigned int n = (hash); \
+ high |= highsbox[n]; \
+ low |= lowsbox[n]; \
+ } while(0)
+
+ static void
+ makesign(BITVECP sign, TRGM *a)
+ {
+ int4 len = ARRNELEM(a);
+ trgm *ptr = GETARR(a);
+ char *p;
+ char *endptr;
+ uint32 w1,
+ w2,
+ w3;
+ uint32 trg0 = 0,
+ trg1,
+ trg2,
+ trg3,
+ trg4;
+ uint32 *p32;
+
+ /*
+ * highsig and lowsig contain the signature we're calculating. Together
+ * they form one bit-array of 128 bits.
+ */
+ uint64 highsig = 0;
+ uint64 lowsig = 0;
+
+ SETBIT_S(highsig, lowsig, SIGLENBIT); /* set last unused bit */
+
+ if (len <= 0)
+ goto end;
+
+ /*----------
+ * We have to extract each trigram into a uint32, and calculate the HASH.
+ * This would be a lot easier if the trigrams were aligned on 4-byte
+ * boundaries, but they're not. The simple way would be to copy each
+ * trigram byte-by-byte, but that is quite slow, and this function is a
+ * hotspot in penalty calculations.
+ *
+ * The first trigram in the array doesn't begin at a 4-byte boundary, as
+ * the flags byte comes first; but the next one does. So we fetch the
+ * first trigram as a special case, and after that each four trigrams fall
+ * onto 4-byte words like this:
+ *
+ * w1 w2 w3
+ * AAAB BBCC CDDD
+ *
+ * As long as there are at least four trigrams left to process, we fetch
+ * the next three words and extract the trigrams from them with bit
+ * operations, per the above diagram. The last few trigrams are handled
+ * one at a time with byte-by-byte fetching.
+ *
+ * Note that this code yields different results on big-endian and
+ * little-endian machines, because the bytes of each trigram are loaded
+ * into a uint32 in memory order and left-justified. That's probably
+ * undesirable, but changing this behavior would break existing indexes.
+ *----------
+ */
+ endptr = (char *) (ptr + len);
+ p32 = (uint32 *) (((char *) ptr) - 1);
+
+ /* Fetch and extract the initial word */
+ w1 = *(p32++);
+ #ifdef WORDS_BIGENDIAN
+ trg1 = w1 << 8;
+ #else
+ trg1 = w1 >> 8;
+ #endif
+ SETBIT_S(highsig, lowsig, HASHVAL(trg1));
+
+ while ((char *) p32 <= endptr - 3 * sizeof(uint32))
+ {
+ w1 = *(p32++);
+ w2 = *(p32++);
+ w3 = *(p32++);
+
+ #ifdef WORDS_BIGENDIAN
+ trg1 = w1 & 0xFFFFFF00;
+ trg2 = (w1 << 24) | ((w2 & 0xFFFF0000) >> 8);
+ trg3 = ((w2 & 0x0000FFFF) << 16) | ((w3 & 0xFF000000) >> 16);
+ trg4 = w3 << 8;
+ #else
+ trg1 = w1 & 0x00FFFFFF;
+ trg2 = (w1 >> 24) | ((w2 & 0x0000FFFF) << 8);
+ trg3 = ((w2 & 0xFFFF0000) >> 16) | ((w3 & 0x000000FF) << 16);
+ trg4 = w3 >> 8;
+ #endif
+
+ SETBIT_S(highsig, lowsig, HASHVAL(trg1));
+ SETBIT_S(highsig, lowsig, HASHVAL(trg2));
+ SETBIT_S(highsig, lowsig, HASHVAL(trg3));
+ SETBIT_S(highsig, lowsig, HASHVAL(trg4));
+ }
+
+ /* Handle the remaining 0-3 trigrams the slow way */
+ p = (char *) p32;
+ while (p < endptr)
+ {
+ CPTRGM(((char *) &trg0), p);
+ SETBIT_S(highsig, lowsig, HASHVAL(trg0));
+ p += 3;
+ }
+
+ end:
+ memcpy((char *) sign, &lowsig, sizeof(uint64));
+ memcpy(((char *) sign) + sizeof(uint64), &highsig, sizeof(uint64));
+
+ #ifdef VERIFY_SIG
+ {
+ BITVEC osign;
+ origmakesign(osign, a);
+
+ if (memcmp(sign, osign, sizeof(BITVEC)) != 0)
+ {
+ elog(LOG, "mismatch: %lx %lx", highsig, lowsig);
+ elog(LOG, "orig: %lx %lx", ((uint64 *)osign)[0], ((uint64 *)osign)[1]);
+ }
+ }
+ #endif
+ }
+
Datum
gtrgm_compress(PG_FUNCTION_ARGS)
{