On 09/13/2014 11:28 PM, Peter Geoghegan wrote:
Anyway, attached rough test program implements what you outline. This
is for 30,000 32 byte strings (where just the final two bytes differ).
On my laptop, output looks like this (edited to only show median
duration in each case):
Got to be careful to not let the compiler optimize away microbenchmarks
like this. At least with my version of gcc, the strcoll calls get
optimized away, as do the memcmp calls, if you don't use the result for
anything. Clang was even more aggressive; it ran both comparisons in 0.0
seconds. Apparently it optimizes away the loops altogether.
Also, there should be a setlocale(LC_ALL, "") call somewhere. Otherwise
it runs in C locale, and we don't use strcoll() at all for C locale.
After fixing those, it runs much slower, so I had to reduce the number
of strings. Here's a fixed program. I'm now getting numbers like this:
(baseline) duration of comparisons without useless memcmp()s: 6.007368
seconds
duration of comparisons with useless memcmp()s: 6.079826 seconds
Both values vary in range 5.9 - 6.1 s, so it's fair to say that the
useless memcmp() is free with these parameters.
Is this the worst case scenario?
- Heikki
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <locale.h>
/* STRING_SIZE does not include NUL byte */
#define STRING_SIZE 32
#define N_STRINGS 2000
#define LAST_N_DIFFER 2
#define INSTR_TIME_SUBTRACT(x,y) \
do { \
(x).tv_sec -= (y).tv_sec; \
(x).tv_usec -= (y).tv_usec; \
/* Normalize */ \
while ((x).tv_usec < 0) \
{ \
(x).tv_usec += 1000000; \
(x).tv_sec--; \
} \
} while (0)
#define INSTR_TIME_GET_DOUBLE(t) \
(((double) (t).tv_sec) + ((double) (t).tv_usec) / 1000000.0)
#define Min(x, y) ((x) < (y) ? (x) : (y))
/*
* Generate single random alphabetical ASCII char. Uses an unseeded rand()
* call, to ensure inter-run determinism.
*/
static inline char
generate_prandom_printable_char(void)
{
for (;;)
{
char crand = rand() & 0x7F;
if ((crand >= 'A' && crand <= 'Z') || (crand >= 'a' && crand <= 'z'))
return crand;
}
}
/*
* Generate a random "payload" string. The returned string is pre-calcuated
* once, to form the bulk of each distinct string.
*/
static inline char *
generate_prandom_payload()
{
char *payload = malloc(STRING_SIZE + 1);
int i;
/*
* The final LAST_N_DIFFER bytes will ultimately be clobbered with distinct
* bytes, but that occurs separately, per string.
*/
for (i = 0; i < STRING_SIZE; i++)
payload[i] = generate_prandom_printable_char();
/*
* Add NUL here, even though complete strings are not NULL terminated (or,
* rather, are copied into a buffer on the stack in order to add a NUL once
* per comparison)
*/
payload[STRING_SIZE] = '\0';
return payload;
}
int
main(int argc, const char *argv[])
{
char **strings = malloc(N_STRINGS * sizeof(char *));
char *payload = generate_prandom_payload();
int i, j;
int nmatches = 0;
setlocale(LC_ALL, "");
/* Initialize strings */
for (i = 0; i < N_STRINGS; i++)
{
int j;
strings[i] = malloc(STRING_SIZE);
memcpy(strings[i], payload, STRING_SIZE);
/* Last LAST_N_DIFFER characters randomly vary */
for (j = LAST_N_DIFFER; j > 0; j--)
{
char n_last = generate_prandom_printable_char();
strings[i][STRING_SIZE - j] = n_last;
}
/* Don't terminate -- no room in buffer */
}
printf("Strings generated - beginning tests\n");
for (;;)
{
struct timeval before, after;
/*
******************************************************************
* Baseline -- no wasted memcmp().
*
* Compare each string to each other string using only strcoll()
******************************************************************
*/
gettimeofday(&before, NULL);
for (i = 0; i < N_STRINGS; i++)
{
char *str = strings[i];
/* Quadratic string comparison */
for (j = 0; j < N_STRINGS; j++)
{
int cmp;
char buf1[STRING_SIZE + 1];
char buf2[STRING_SIZE + 1];
char *other;
other = strings[j];
memcpy(buf1, str, STRING_SIZE);
memcpy(buf2, other, STRING_SIZE);
buf1[STRING_SIZE] = '\0';
buf2[STRING_SIZE] = '\0';
cmp = strcoll(buf1, buf2);
if (cmp == 0)
nmatches++;
/*
* memcmp() tie-breaker (equivalent to the one that currently
* appears within varstr_cmp()) isn't considered. It's only
* relevant to a small number of locales, like hu_HU, where
* strcoll() might (rarely) indicate equality in respect of a
* pair of non-identical strings. If strcoll() did return 0,
* then for most locales it's certain that the first memcmp()
* would have worked out.
*/
}
}
/* Report no-wasted-memcmp case duration */
gettimeofday(&after, NULL);
INSTR_TIME_SUBTRACT(after, before);
printf("(baseline) duration of comparisons without useless memcmp()s: %f seconds\n\n",
INSTR_TIME_GET_DOUBLE(after));
/*
******************************************************************
* Compare each string to each other string using useless memcmp(),
* followed by tie-breaker strcoll()
******************************************************************
*/
gettimeofday(&before, NULL);
for (i = 0; i < N_STRINGS; i++)
{
char *str = strings[i];
/* Quadratic string comparison */
for (j = 0; j < N_STRINGS; j++)
{
int cmp;
char buf1[STRING_SIZE + 1];
char buf2[STRING_SIZE + 1];
char *other;
other = strings[j];
/*
* waste some cycles.
*
* Don't give the implementation the benefit of sometimes
* getting away with a memcmp(), even though ordinarily that's
* perfectly representative/legitimate.
*
* Need to NUL terminate each buffer.
*/
cmp = memcmp(str, other, STRING_SIZE);
if (cmp == 0)
nmatches++;
memcpy(buf1, str, STRING_SIZE);
memcpy(buf2, other, STRING_SIZE);
buf1[STRING_SIZE] = '\0';
buf2[STRING_SIZE] = '\0';
cmp = strcoll(buf1, buf2);
if (cmp == 0)
nmatches++;
}
}
/* Report wasted-memcmp case duration */
gettimeofday(&after, NULL);
INSTR_TIME_SUBTRACT(after, before);
printf("duration of comparisons with useless memcmp()s: %f seconds\n\n",
INSTR_TIME_GET_DOUBLE(after));
/*
* Print the number of cmp == 0 lines. This is completely useless
* information, but printing it ensures that the compiler doesn't
* optimize away the comparisons
*/
printf("%d equal comparisons\n", nmatches);
} /* Loop forever */
return 0;
}
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers