-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 I've finished my implementation of a Two-Way plus Boyer-Moore hybrid string search algorithm for the memmmem module. This patch has passed everything I've thrown at it so far, and it has the nice properties of avoiding alloca/malloc (hence it is async-safe), as well as allowing sub-linear complexity for long needles (the added test in test-memmem.c fails with the KMP algorithm but passes with my Two-Way+BM hybrid). It guarantees fewer than 2N comparisons, and can determine misses as quickly as N/M. I chose a threshold of 128 bytes in the needle before attempting the BM bad-character shift table, since constructing a 256-entry table is expensive up front with little gain for short needles.
In my web searches, I've never seen this hybrid approach documented, so I may well have written the fastest string search algorithm to date. I'd appreciate any reviews before checking it in. - -- Don't work too hard, make some time for fun as well! Eric Blake [EMAIL PROTECTED] -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.5 (Cygwin) Comment: Public key at home.comcast.net/~ericblake/eblake.gpg Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org iD8DBQFHgC2V84KuGfSFAYARAioOAJsF8ww7BZwEzah1ebKWG5cCqVMrJACgnlWm N1cTTDrPqrQM2HGnCEDLpXI= =95ja -----END PGP SIGNATURE-----
>From 16c4539a08e1427f831745c6809064daf3b03c64 Mon Sep 17 00:00:00 2001 From: Eric Blake <[EMAIL PROTECTED]> Date: Sat, 5 Jan 2008 14:09:11 -0700 Subject: [PATCH] Rewrite memmem to guarantee linear complexity without malloc. * lib/memmem.c (memmem): Use Two-Way rather than Knuth-Morris-Pratt, to allow O(1) space usage. (critical_factorization, two_way_short_needle) (two_way_long_needle): New functions. (knuth_morris_pratt): Delete. * modules/memmem (Depends-on): No longer need malloca or stdbool. Add stdint. * tests/test-memmem.c (main): Add test for sublinear performance. * doc/functions/memmem.texi (memmem): Document cygwin deficiency. Signed-off-by: Eric Blake <[EMAIL PROTECTED]> --- ChangeLog | 11 + doc/functions/memmem.texi | 2 +- lib/memmem.c | 523 +++++++++++++++++++++++++++++---------------- modules/memmem | 3 +- tests/test-memmem.c | 27 +++ 5 files changed, 374 insertions(+), 192 deletions(-) diff --git a/ChangeLog b/ChangeLog index bdc357a..4c6db8e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,16 @@ 2008-01-05 Eric Blake <[EMAIL PROTECTED]> + Rewrite memmem to guarantee linear complexity without malloc. + * lib/memmem.c (memmem): Use Two-Way rather than + Knuth-Morris-Pratt, to allow O(1) space usage. + (critical_factorization, two_way_short_needle) + (two_way_long_needle): New functions. + (knuth_morris_pratt): Delete. + * modules/memmem (Depends-on): No longer need malloca or stdbool. + Add stdint. + * tests/test-memmem.c (main): Add test for sublinear performance. + * doc/functions/memmem.texi (memmem): Document cygwin deficiency. + Avoid quadratic system memmem. * m4/memmem.m4 (gl_FUNC_MEMMEM): Check for quadratic memmem. Reported by Ralf Wildenhues. diff --git a/doc/functions/memmem.texi b/doc/functions/memmem.texi index 50d73fd..e51b640 100644 --- a/doc/functions/memmem.texi +++ b/doc/functions/memmem.texi @@ -16,7 +16,7 @@ Cygwin 1.5.x @item This function has quadratic instead of linear complexity on some platforms: -glibc <= 2.6.1 +glibc <= 2.6.1, cygwin @item This function is missing on some platforms: diff --git a/lib/memmem.c b/lib/memmem.c index 5aa704c..fb65ca7 100644 --- a/lib/memmem.c +++ b/lib/memmem.c @@ -1,4 +1,5 @@ -/* Copyright (C) 1991,92,93,94,96,97,98,2000,2004,2007,2008 Free Software Foundation, Inc. +/* Copyright (C) 1991,92,93,94,96,97,98,2000,2004,2007,2008 Free Software + Foundation, Inc. This file is part of the GNU C Library. This program is free software; you can redistribute it and/or modify @@ -19,136 +20,341 @@ # include <config.h> #endif -#include <stddef.h> +/* Specification of memmem. */ #include <string.h> -#include <stdbool.h> -#include "malloca.h" +#include <limits.h> +#include <stddef.h> +#include <stdint.h> #ifndef _LIBC # define __builtin_expect(expr, val) (expr) #endif -/* Knuth-Morris-Pratt algorithm. - See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm - Return a boolean indicating success. */ +/* We use the Two-Way string matching algorithm, which guarantees + linear complexity with constant space. Additionally, for long + needles, we also use a bad character shift table similar to the + Boyer-Moore algorithm to acheive sub-linear performance. -static bool -knuth_morris_pratt (const unsigned char *haystack, - const unsigned char *last_haystack, - const unsigned char *needle, size_t m, - const unsigned char **resultp) -{ - /* Allocate the table. */ - size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); - if (table == NULL) - return false; - /* Fill the table. - For 0 < i < m: - 0 < table[i] <= i is defined such that - forall 0 < x < table[i]: needle[x..i-1] != needle[0..i-1-x], - and table[i] is as large as possible with this property. - This implies: - 1) For 0 < i < m: - If table[i] < i, - needle[table[i]..i-1] = needle[0..i-1-table[i]]. - 2) For 0 < i < m: - rhaystack[0..i-1] == needle[0..i-1] - and exists h, i <= h < m: rhaystack[h] != needle[h] - implies - forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1]. - table[0] remains uninitialized. */ - { - size_t i, j; + See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm +*/ + +/* Point at which computing a bad-byte shift table is likely to be + worthwhile. */ +#define LONG_NEEDLE_THRESHOLD (1 << (CHAR_BIT - 1)) - /* i = 1: Nothing to verify for x = 0. */ - table[1] = 1; - j = 0; +#define MAX(a, b) ((a < b) ? (b) : (a)) - for (i = 2; i < m; i++) - { - /* Here: j = i-1 - table[i-1]. - The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold - for x < table[i-1], by induction. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - unsigned char b = needle[i - 1]; +/* Peform a critical factorization of NEEDLE, of length NEEDLE_LEN. + Return the index of the first byte in the right half, and set + *PERIOD to the global period of the right half. - for (;;) - { - /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] - is known to hold for x < i-1-j. - Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - if (b == needle[j]) - { - /* Set table[i] := i-1-j. */ - table[i] = i - ++j; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for x = i-1-j, because - needle[i-1] != needle[j] = needle[i-1-x]. */ - if (j == 0) - { - /* The inequality holds for all possible x. */ - table[i] = i; - break; - } - /* The inequality needle[x..i-1] != needle[0..i-1-x] also holds - for i-1-j < x < i-1-j+table[j], because for these x: - needle[x..i-2] - = needle[x-(i-1-j)..j-1] - != needle[0..j-1-(x-(i-1-j))] (by definition of table[j]) - = needle[0..i-2-x], - hence needle[x..i-1] != needle[0..i-1-x]. - Furthermore - needle[i-1-j+table[j]..i-2] - = needle[table[j]..j-1] - = needle[0..j-1-table[j]] (by definition of table[j]). */ - j = j - table[j]; - } - /* Here: j = i - table[i]. */ - } - } + The global period of a string is the smallest index (possibly its + length) at which all remaining bytes in the string are repetitions + of the prefix (the last repetition may be a subset of the prefix). - /* Search, using the table to accelerate the processing. */ - { - size_t j; - const unsigned char *rhaystack; - const unsigned char *phaystack; + When NEEDLE is factored into two halves, a local period is the + length of the smallest word that shares a suffix with the left half + and shares a prefix with the right half. All factorizations of a + non-empty NEEDLE have a local period of at least 1 and no greater + than NEEDLE_LEN. - *resultp = NULL; - j = 0; - rhaystack = haystack; - phaystack = haystack; - /* Invariant: phaystack = rhaystack + j. */ - while (phaystack != last_haystack) - if (needle[j] == *phaystack) + A critical factorization has the property that the local period + equals the global period. All strings have at least one critical + factorization with the left half smaller than the global period. + + Given an ordered alphabet, a critical factorization can be computed + in linear time, with 2 * NEEDLE_LEN comparisons, by computing the + larger of two ordered maximal suffixes. The ordered maximal + suffixes are determined by lexicographic comparison of + periodicity. */ +static size_t +critical_factorization (const unsigned char *needle, size_t needle_len, + size_t *period) +{ + /* Index of last byte of left half, or SIZE_MAX. */ + size_t max_suffix, max_suffix_rev; + size_t j, k; /* Indices for iteration over NEEDLE. */ + unsigned char a, b; /* Current comparison bytes. */ + size_t p; /* Intermediate period. */ + + /* Perform lexicographic search. */ + max_suffix = SIZE_MAX; + j = 0; + k = p = 1; + while (j + k < needle_len) + { + a = needle[j + k]; + b = needle[max_suffix + k]; + if (a < b) + { + /* Suffix is smaller, period is entire prefix so far. */ + j += k; + k = 1; + p = j - max_suffix; + } + else if (a == b) { - j++; - phaystack++; - if (j == m) + /* Advance through repetition of the current period. */ + if (k != p) + ++k; + else { - /* The entire needle has been found. */ - *resultp = rhaystack; - break; + j += p; + k = 1; } } - else if (j > 0) + else /* b < a */ + { + /* Suffix is larger, start over from current location. */ + max_suffix = j++; + k = p = 1; + } + } + *period = p; + + /* Perform reverse lexicographic search. */ + max_suffix_rev = SIZE_MAX; + j = 0; + k = p = 1; + while (j + k < needle_len) + { + a = needle[j + k]; + b = needle[max_suffix_rev + k]; + if (b < a) + { + /* Suffix is smaller, period is entire prefix so far. */ + j += k; + k = 1; + p = j - max_suffix_rev; + } + else if (a == b) { - /* Found a match of needle[0..j-1], mismatch at needle[j]. */ - rhaystack += table[j]; - j -= table[j]; + /* Advance through repetition of the current period. */ + if (k != p) + ++k; + else + { + j += p; + k = 1; + } } - else + else /* a < b */ { - /* Found a mismatch at needle[0] already. */ - rhaystack++; - phaystack++; + /* Suffix is larger, start over from current location. */ + max_suffix_rev = j++; + k = p = 1; + } + } + + /* Choose the longer suffix. */ + if (max_suffix_rev + 1 < max_suffix + 1) + return max_suffix + 1; + *period = p; + return max_suffix_rev + 1; +} + +/* Return the first location of NEEDLE within HAYSTACK, or NULL. This + method requires 0 < NEEDLE_LEN <= HAYSTACK_LEN, and is optimized + for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. Performance is linear, + with 2 * NEEDLE_LEN comparisons in preparation, and at most 2 * + HAYSTACK_LEN - NEEDLE_LEN comparisons in searching. */ +static void * +two_way_short_needle (const unsigned char *haystack, size_t haystack_len, + const unsigned char *needle, size_t needle_len) +{ + size_t i, j; /* Indices into haystack. */ + size_t period; /* The period of the right half of needle. */ + size_t suffix; /* The index of the right half of needle. */ + + /* Factor the needle into two halves, such that the left half is + smaller than the global period, and the right half is + periodic. */ + suffix = critical_factorization (needle, needle_len, &period); + + /* Perform the search. Each iteration compares the right half + first. */ + if (memcmp (needle, needle + period, suffix) == 0) + { + /* Needle is periodic; a mismatch can only advance by the + period, so use memory to avoid rescanning known occurrences + of the period. */ + size_t memory = 0; + j = 0; + while (j <= haystack_len - needle_len) + { + /* Scan for matches in right half. */ + i = suffix < memory ? memory : suffix; + while (i < needle_len && needle[i] == haystack[i + j]) + ++i; + if (needle_len <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (memory < i + 1 && needle[i] == haystack[i + j]) + --i; + if (i + 1 < memory + 1) + return (void *) (haystack + j); + /* No match, so remember how many repetitions of period + on the right half were scanned. */ + j += period; + memory = needle_len - period; + } + else + { + j += i - suffix + 1; + memory = 0; + } + } + } + else + { + /* The two halves of needle are distinct; no extra memory is + required, and any mismatch results in a maximal shift. */ + period = MAX (suffix, needle_len - suffix) + 1; + j = 0; + while (j <= haystack_len - needle_len) + { + /* Scan for matches in right half. */ + i = suffix; + while (i < needle_len && needle[i] == haystack[i + j]) + ++i; + if (needle_len <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (i != SIZE_MAX && needle[i] == haystack[i + j]) + --i; + if (i == SIZE_MAX) + return (void *) (haystack + j); + j += period; + } + else + j += i - suffix + 1; } - } + } + return NULL; +} + +/* Return the first location of NEEDLE within HAYSTACK, or NULL. This + method requires 0 < NEEDLE_LEN <= HAYSTACK_LEN, and is optimized + for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. Performance is linear, + with 3 * NEEDLE_LEN operations in preparation, and at most 2 * + HAYSTACK_LEN - NEEDLE_LEN comparisons in searching. The extra + initialization cost allows for potential sublinear performance + O(HAYSTACK_LEN / NEEDLE_LEN). */ +static void * +two_way_long_needle (const unsigned char *haystack, size_t haystack_len, + const unsigned char *needle, size_t needle_len) +{ + size_t i, j; /* Indices into haystack. */ + size_t period; /* The period of the right half of needle. */ + size_t suffix; /* The index of the right half of needle. */ + size_t shift_table[1 << CHAR_BIT]; + + /* Factor the needle into two halves, such that the left half is + smaller than the global period, and the right half is + periodic. */ + suffix = critical_factorization (needle, needle_len, &period); - freea (table); - return true; + /* Populate shift_table. For each possible byte value c, + shift_table[c] is the distance from the last occurrence of c to + the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. + shift_table[needle[needle_len - 1]] contains the only 0. */ + for (i = 0; i < 1 << CHAR_BIT; i++) + shift_table[i] = needle_len; + for (i = 0; i < needle_len; i++) + shift_table[needle[i]] = needle_len - i - 1; + + /* Perform the search. Each iteration compares the right half + first. */ + if (memcmp (needle, needle + period, suffix) == 0) + { + /* Needle is periodic; a mismatch can only advance by the + period, so use memory to avoid rescanning known occurrences + of the period. */ + size_t memory = 0; + j = 0; + while (j <= haystack_len - needle_len) + { + /* Check the last byte first; if it does not match, then + shift to the next possible match location. */ + size_t shift = shift_table[haystack[j + needle_len - 1]]; + if (0 < shift) + { + if (memory && shift < period) + { + /* Since needle is periodic, but the last period has + a byte out of place, there can be no match until + after the mismatch. */ + shift = needle_len - period; + memory = 0; + } + j += shift; + continue; + } + /* Scan for matches in right half. The last byte has + already been matched, by virtue of the shift table. */ + i = suffix < memory ? memory : suffix; + while (i < needle_len - 1 && needle[i] == haystack[i + j]) + ++i; + if (needle_len - 1 <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (memory < i + 1 && needle[i] == haystack[i + j]) + --i; + if (i + 1 < memory + 1) + return (void *) (haystack + j); + /* No match, so remember how many repetitions of period + on the right half were scanned. */ + j += period; + memory = needle_len - period; + } + else + { + j += i - suffix + 1; + memory = 0; + } + } + } + else + { + /* The two halves of needle are distinct; no extra memory is + required, and any mismatch results in a maximal shift. */ + period = MAX (suffix, needle_len - suffix) + 1; + j = 0; + while (j <= haystack_len - needle_len) + { + /* Check the last byte first; if it does not match, then + shift to the next possible match location. */ + size_t shift = shift_table[haystack[j + needle_len - 1]]; + if (0 < shift) + { + j += shift; + continue; + } + /* Scan for matches in right half. The last byte has + already been matched, by virtue of the shift table. */ + i = suffix; + while (i < needle_len - 1 && needle[i] == haystack[i + j]) + ++i; + if (needle_len - 1 <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (i != SIZE_MAX && needle[i] == haystack[i + j]) + --i; + if (i == SIZE_MAX) + return (void *) (haystack + j); + j += period; + } + else + j += i - suffix + 1; + } + } + return NULL; } /* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK @@ -162,8 +368,6 @@ memmem (const void *haystack_start, size_t haystack_len, not an array of 'char' values. See ISO C 99 section 6.2.6.1. */ const unsigned char *haystack = (const unsigned char *) haystack_start; const unsigned char *needle = (const unsigned char *) needle_start; - const unsigned char *last_haystack = haystack + haystack_len; - const unsigned char *last_needle = needle + needle_len; if (needle_len == 0) /* The first occurrence of the empty string is deemed to occur at @@ -175,82 +379,23 @@ memmem (const void *haystack_start, size_t haystack_len, if (__builtin_expect (haystack_len < needle_len, 0)) return NULL; - /* Use optimizations in memchr when possible. */ - if (__builtin_expect (needle_len == 1, 0)) - return memchr (haystack, *needle, haystack_len); - - /* Minimizing the worst-case complexity: - Let n = haystack_len, m = needle_len. - The naïve algorithm is O(n*m) worst-case. - The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a - memory allocation. - To achieve linear complexity and yet amortize the cost of the - memory allocation, we activate the Knuth-Morris-Pratt algorithm - only once the naïve algorithm has already run for some time; more - precisely, when - - the outer loop count is >= 10, - - the average number of comparisons per outer loop is >= 5, - - the total number of comparisons is >= m. - But we try it only once. If the memory allocation attempt failed, - we don't retry it. */ - { - bool try_kmp = true; - size_t outer_loop_count = 0; - size_t comparison_count = 0; - - /* Speed up the following searches of needle by caching its first - byte. */ - unsigned char b = *needle++; - - for (;; haystack++) - { - if (haystack == last_haystack) - /* No match. */ - return NULL; - - /* See whether it's advisable to use an asymptotically faster - algorithm. */ - if (try_kmp - && outer_loop_count >= 10 - && comparison_count >= 5 * outer_loop_count) - { - /* See if needle + comparison_count now reaches the end of - needle. */ - if (comparison_count >= needle_len) - { - /* Try the Knuth-Morris-Pratt algorithm. */ - const unsigned char *result; - if (knuth_morris_pratt (haystack, last_haystack, - needle - 1, needle_len, &result)) - return (void *) result; - try_kmp = false; - } - } - - outer_loop_count++; - comparison_count++; - if (*haystack == b) - /* The first byte matches. */ - { - const unsigned char *rhaystack = haystack + 1; - const unsigned char *rneedle = needle; - - for (;; rhaystack++, rneedle++) - { - if (rneedle == last_needle) - /* Found a match. */ - return (void *) haystack; - if (rhaystack == last_haystack) - /* No match. */ - return NULL; - comparison_count++; - if (*rhaystack != *rneedle) - /* Nothing in this round. */ - break; - } - } - } - } - - return NULL; + /* Use optimizations in memchr when possible, to reduce the search + size of haystack using a linear algorithm with a smaller + coefficient. However, avoid memchr for long needles, since we + can often acheive sublinear performance. */ + if (needle_len < LONG_NEEDLE_THRESHOLD) + { + haystack = memchr (haystack, *needle, haystack_len); + if (!haystack || __builtin_expect (needle_len == 1, 0)) + return (void *) haystack; + haystack_len -= haystack - (const unsigned char *) haystack_start; + if (haystack_len < needle_len) + return NULL; + return two_way_short_needle (haystack, haystack_len, needle, needle_len); + } + else + return two_way_long_needle (haystack, haystack_len, needle, needle_len); } + +#undef LONG_NEEDLE_THRESHOLD +#undef MAX diff --git a/modules/memmem b/modules/memmem index 738cd6a..2c02be9 100644 --- a/modules/memmem +++ b/modules/memmem @@ -8,8 +8,7 @@ m4/memmem.m4 Depends-on: extensions string -stdbool -malloca +stdint memchr memcmp diff --git a/tests/test-memmem.c b/tests/test-memmem.c index 976f293..e900e1c 100644 --- a/tests/test-memmem.c +++ b/tests/test-memmem.c @@ -152,5 +152,32 @@ main (int argc, char *argv[]) free (haystack); } + /* Check that a some long needles not present in a haystack can be + handled with sublinear speed. */ + { + size_t repeat = 10000; + size_t m = 1000000; + size_t n = 1000; + char *haystack = (char *) malloc (m); + char *needle = (char *) malloc (n); + if (haystack != NULL && needle != NULL) + { + const char *result; + + memset (haystack, 'A', m); + memset (needle, 'B', n); + + for (; repeat > 0; repeat--) + { + result = memmem (haystack, m, needle, n); + ASSERT (result == NULL); + } + } + if (haystack != NULL) + free (haystack); + if (needle != NULL) + free (needle); + } + return 0; } -- 1.5.3.5