Now here's the fix that brings the worst-case complexity down from O(n*m) to O(n). One could use either the Knuth-Morris-Pratt or the Boyer-Moore algorithm. I'm using the former one, because Boyer-Moore works "backwards", which is harder to generalize to the mbsstr case.
2007-02-11 Bruno Haible <[EMAIL PROTECTED]> Ensure O(n) worst-case complexity of c_strstr. * lib/c-strstr.c: Include stdbool.h, string.h. (knuth_morris_pratt): New function. (c_strstr): Add some bookkeeping. Invoke knuth_morris_pratt when the bookkeeping indicates that it's worth it. * modules/c-strstr (Depends-on): Add stdbool, strnlen. *** lib/c-strstr.c 11 Feb 2007 13:58:43 -0000 1.4 --- lib/c-strstr.c 11 Feb 2007 14:01:35 -0000 *************** *** 21,27 **** /* Specification. */ #include "c-strstr.h" ! #include <stddef.h> /* Find the first occurrence of NEEDLE in HAYSTACK. */ char * --- 21,117 ---- /* Specification. */ #include "c-strstr.h" ! #include <stdbool.h> ! #include <stdlib.h> ! #include <string.h> ! ! /* Knuth-Morris-Pratt algorithm. ! See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm ! Return a boolean indicating success. */ ! static bool ! knuth_morris_pratt (const char *haystack, const char *needle, ! const char **resultp) ! { ! size_t m = strlen (needle); ! ! /* Allocate the table. */ ! size_t *table = (size_t *) malloc (m * sizeof (size_t)); ! if (table == NULL) ! return false; ! /* Fill the table. ! For 0 < i < m: ! 0 < table[i] <= i is defined such that ! rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i] ! implies ! forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1], ! and table[i] is as large as possible with this property. ! table[0] remains uninitialized. */ ! { ! size_t i, j; ! ! table[1] = 1; ! j = 0; ! for (i = 2; i < m; i++) ! { ! unsigned char b = (unsigned char) needle[i - 1]; ! ! for (;;) ! { ! if (b == (unsigned char) needle[j]) ! { ! table[i] = i - ++j; ! break; ! } ! if (j == 0) ! { ! table[i] = i; ! break; ! } ! j = j - table[j]; ! } ! } ! } ! ! /* Search, using the table to accelerate the processing. */ ! { ! size_t j; ! const char *rhaystack; ! const char *phaystack; ! ! *resultp = NULL; ! j = 0; ! rhaystack = haystack; ! phaystack = haystack; ! /* Invariant: phaystack = rhaystack + j. */ ! while (*phaystack != '\0') ! if ((unsigned char) needle[j] == (unsigned char) *phaystack) ! { ! j++; ! phaystack++; ! if (j == m) ! { ! /* The entire needle has been found. */ ! *resultp = rhaystack; ! break; ! } ! } ! else if (j > 0) ! { ! /* Found a match of needle[0..j-1], mismatch at needle[j]. */ ! rhaystack += table[j]; ! j -= table[j]; ! } ! else ! { ! /* Found a mismatch at needle[0] already. */ ! rhaystack++; ! phaystack++; ! } ! } ! ! free (table); ! return true; ! } /* Find the first occurrence of NEEDLE in HAYSTACK. */ char * *************** *** 34,39 **** --- 124,149 ---- needle may be found in haystack. */ if (*needle != '\0') { + /* Minimizing the worst-case complexity: + Let n = strlen(haystack), m = strlen(needle). + The naïve algorithm is O(n*m) worst-case. + The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a + memory allocation. + To achieve linear complexity and yet amortize the cost of the memory + allocation, we activate the Knuth-Morris-Pratt algorithm only once + the naïve algorithm has already run for some time; more precisely, + when + - the outer loop count is >= 10, + - the average number of comparisons per outer loop is >= 5, + - the total number of comparisons is >= m. + But we try it only once. If the memory allocation attempt failed, + we don't retry it. */ + bool try_kmp = true; + size_t outer_loop_count = 0; + size_t comparison_count = 0; + size_t last_ccount = 0; /* last comparison count */ + const char *needle_last_ccount = needle; /* = needle + last_ccount */ + /* Speed up the following searches of needle by caching its first character. */ unsigned char b = (unsigned char) *needle; *************** *** 44,49 **** --- 154,190 ---- if (*haystack == '\0') /* No match. */ return NULL; + + /* See whether it's advisable to use an asymptotically faster + algorithm. */ + if (try_kmp + && outer_loop_count >= 10 + && comparison_count >= 5 * outer_loop_count) + { + /* See if needle + comparison_count now reaches the end of + needle. */ + if (needle_last_ccount != NULL) + { + needle_last_ccount += + strnlen (needle_last_ccount, comparison_count - last_ccount); + if (*needle_last_ccount == '\0') + needle_last_ccount = NULL; + last_ccount = comparison_count; + } + if (needle_last_ccount == NULL) + { + /* Try the Knuth-Morris-Pratt algorithm. */ + const char *result; + bool success = + knuth_morris_pratt (haystack, needle - 1, &result); + if (success) + return (char *) result; + try_kmp = false; + } + } + + outer_loop_count++; + comparison_count++; if ((unsigned char) *haystack == b) /* The first character matches. */ { *************** *** 58,63 **** --- 199,205 ---- if (*rhaystack == '\0') /* No match. */ return NULL; + comparison_count++; if ((unsigned char) *rhaystack != (unsigned char) *rneedle) /* Nothing in this round. */ break; *** modules/c-strstr 28 Aug 2006 13:04:33 -0000 1.1 --- modules/c-strstr 11 Feb 2007 14:01:35 -0000 *************** *** 6,11 **** --- 6,13 ---- lib/c-strstr.c Depends-on: + stdbool + strnlen configure.ac: