Hi, Searching multiple fixed words, grep uses Commentz-Walter algorithm, but it is very slow for a worst case e.g. as following. It is O(m*n).
- input: yes `printf %040d` | head -10000000 - word1: x0000000000000000000 - word2: x This change uses Aho-Corasick algorithm instead of Commentz-Walter algorithm to search multiple fixed words. It uses a function to build tries which has been already defined for Commentz-Walter algorithm in kwset.c and which has been already high quality. I see 7x speed-up even for a typical case on Fedora 21 with a 3.2GHz i5 by this change. First without this change (best-of-5 trials): find /usr/share/doc/ -type f | LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null real 11.37 user 11.03 sys 0.24 Next with this change (best-of-5 trials): find /usr/share/doc/ -type f | LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null real 1.49 user 1.31 sys 0.15 I also wrote two additional patches. Second, If search multiple fixed words, grep immediately returns without longest match if not needed. Without this change, grep tries longest match for multiple words even if not needed. Third, use memchr2 for two patterns of a character. Thanks, Norihiro
From 755f39d4c2e2822d4c241e357d258e5b25ba0d04 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Sat, 22 Nov 2014 14:47:45 +0900 Subject: [PATCH 1/3] grep: use Aho-Corasick algorithm to search multiple fixed words Searching multiple fixed words, grep uses Commentz-Walter algorithm, but it is very slow for a worst case e.g. as following. It is O(m*n). - input: yes `printf %040d` | head -10000000 - word1: x0000000000000000000 - word2: x This change uses Aho-Corasick algorithm instead of Commentz-Walter algorithm to search multiple fixed words. It uses a function to build tries which has been already defined for Commentz-Walter algorithm in kwset.c and which has been already high quality. I see 7x speed-up even for a typical case on Fedora 21 with a 3.2GHz i5 by this change. First without this change (best-of-5 trials): find /usr/share/doc/ -type f | LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null real 11.37 user 11.03 sys 0.24 Next with this change (best-of-5 trials): find /usr/share/doc/ -type f | LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null real 1.49 user 1.31 sys 0.15 * src/kwset.c (struct kwset): Add a new member 'mode'. (kwsalloc): Use it. All callers are changed. (kwsincr): Using Aho-Corasick algorithm, build tries in normal order. (acexec_trans, acexec): Add a new function. (kwsexec): Use it. * src/kwset.h (kwsalloc): Update a prototype. * NEWS (Improvements): Mention it. --- NEWS | 5 + src/kwset.c | 326 +++++++++++++++++++++++++++++++++++++++++++++--------- src/kwset.h | 3 +- src/searchutils.c | 4 +- 4 files changed, 282 insertions(+), 56 deletions(-) diff --git a/NEWS b/NEWS index 6138c4e..653213a 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Improvements + + grep -F is now typically 7 times faster for a lot of words. The + improvement replace Commentz-Walter algorithm which has been used for + a long time to Aho-Corasick algorithm. * Noteworthy changes in release 2.21 (2014-11-23) [stable] diff --git a/src/kwset.c b/src/kwset.c index 6d21893..9585f08 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -35,7 +35,6 @@ #include "kwset.h" -#include <stdbool.h> #include <stdint.h> #include <sys/types.h> #include "system.h" @@ -67,7 +66,7 @@ struct tree char balance; /* Difference in depths of subtrees. */ }; -/* Node of a trie representing a set of reversed keywords. */ +/* Node of a trie representing a set of keywords. */ struct trie { size_t accepting; /* Word index of accepted word, or zero. */ @@ -111,6 +110,13 @@ struct kwset more such matches; e.g., Greek has three sigma characters that all match when case-folding. */ int gc1help; + + /* If true, prefer Aho-Corasick algorithm to Beate Commentz-Walter + algorithm in multiple words. */ + bool reverse; + + /* kwsexec() implementation. */ + size_t (*kwsexec) (kwset_t, char const *, size_t, struct kwsmatch *); }; /* Use TRANS to transliterate C. A null TRANS does no transliteration. */ @@ -120,10 +126,14 @@ tr (char const *trans, char c) return trans ? trans[U(c)] : c; } +static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *); +static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *); +static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *); + /* Allocate and initialize a keyword set object, returning an opaque pointer to it. */ kwset_t -kwsalloc (char const *trans) +kwsalloc (char const *trans, bool const reverse) { struct kwset *kwset = xmalloc (sizeof *kwset); @@ -141,6 +151,11 @@ kwsalloc (char const *trans) kwset->maxd = -1; kwset->target = NULL; kwset->trans = trans; + kwset->reverse = reverse; + if (reverse) + kwset->kwsexec = cwexec; + else + kwset->kwsexec = acexec; return kwset; } @@ -156,13 +171,14 @@ kwsincr (kwset_t kwset, char const *text, size_t len) struct trie *trie = kwset->trie; char const *trans = kwset->trans; - text += len; + if (kwset->reverse) + text += len; - /* Descend the trie (built of reversed keywords) character-by-character, + /* Descend the trie (built of keywords) character-by-character, installing new nodes when necessary. */ while (len--) { - unsigned char uc = *--text; + unsigned char uc = kwset->reverse ? *--text : *text++; unsigned char label = trans ? trans[uc] : uc; /* Descend the tree of outgoing links for this trie node, @@ -311,15 +327,15 @@ enqueue (struct tree *tree, struct trie **last) well as a last resort failure node. */ static void treefails (struct tree const *tree, struct trie const *fail, - struct trie *recourse) + struct trie *recourse, bool reverse) { struct tree *link; if (!tree) return; - treefails(tree->llink, fail, recourse); - treefails(tree->rlink, fail, recourse); + treefails(tree->llink, fail, recourse, reverse); + treefails(tree->rlink, fail, recourse, reverse); /* Find, in the chain of fails going back to the root, the first node that has a descendant on the current label. */ @@ -334,6 +350,8 @@ treefails (struct tree const *tree, struct trie const *fail, if (link) { tree->trie->fail = link->trie; + if (!reverse && link->trie->accepting && !tree->trie->accepting) + tree->trie->accepting = -1; return; } fail = fail->fail; @@ -396,6 +414,34 @@ kwsprep (kwset_t kwset) int i; unsigned char deltabuf[NCHAR]; unsigned char *delta = trans ? deltabuf : kwset->delta; + struct trie *curr, *last; + + if (kwset->words == 1) + { + if (!kwset->reverse) + { + kwset_t new_kwset; + + /* Enqueue the immediate descendants in the level order queue. */ + for (curr = last = kwset->trie; curr; curr = curr->next) + enqueue (curr->links, &last); + + /* Looking for just one string. Extract it from the trie. */ + kwset->target = obstack_alloc (&kwset->obstack, kwset->mind); + for (i = 0, curr = kwset->trie; i < kwset->mind; ++i) + { + kwset->target[i] = curr->links->label; + curr = curr->next; + } + + new_kwset = kwsalloc (kwset->trans, true); + kwsincr (new_kwset, kwset->target, kwset->mind); + obstack_free (&kwset->obstack, NULL); + *kwset = *new_kwset; + free (new_kwset); + } + kwset->kwsexec = bmexec; + } /* Initial values for the delta table; will be changed later. The delta entry for a given character is the smallest depth of any @@ -404,49 +450,54 @@ kwsprep (kwset_t kwset) /* Traverse the nodes of the trie in level order, simultaneously computing the delta table, failure function, and shift function. */ - struct trie *curr, *last; for (curr = last = kwset->trie; curr; curr = curr->next) { /* Enqueue the immediate descendants in the level order queue. */ enqueue (curr->links, &last); - curr->shift = kwset->mind; - curr->maxshift = kwset->mind; - /* Update the delta table for the descendants of this node. */ treedelta (curr->links, curr->depth, delta); /* Compute the failure function for the descendants of this node. */ - treefails (curr->links, curr->fail, kwset->trie); + treefails (curr->links, curr->fail, kwset->trie, kwset->reverse); - /* Update the shifts at each node in the current node's chain - of fails back to the root. */ - struct trie *fail; - for (fail = curr->fail; fail; fail = fail->fail) + if (kwset->reverse) { - /* If the current node has some outgoing edge that the fail - doesn't, then the shift at the fail should be no larger - than the difference of their depths. */ - if (!hasevery (fail->links, curr->links)) - if (curr->depth - fail->depth < fail->shift) - fail->shift = curr->depth - fail->depth; - - /* If the current node is accepting then the shift at the - fail and its descendants should be no larger than the - difference of their depths. */ - if (curr->accepting && fail->maxshift > curr->depth - fail->depth) - fail->maxshift = curr->depth - fail->depth; + curr->shift = kwset->mind; + curr->maxshift = kwset->mind; + + /* Update the shifts at each node in the current node's chain + of fails back to the root. */ + struct trie *fail; + for (fail = curr->fail; fail; fail = fail->fail) + { + /* If the current node has some outgoing edge that the fail + doesn't, then the shift at the fail should be no larger + than the difference of their depths. */ + if (!hasevery (fail->links, curr->links)) + if (curr->depth - fail->depth < fail->shift) + fail->shift = curr->depth - fail->depth; + + /* If the current node is accepting then the shift at the + fail and its descendants should be no larger than the + difference of their depths. */ + if (curr->accepting && fail->maxshift > curr->depth - fail->depth) + fail->maxshift = curr->depth - fail->depth; + } } } - /* Traverse the trie in level order again, fixing up all nodes whose - shift exceeds their inherited maxshift. */ - for (curr = kwset->trie->next; curr; curr = curr->next) + if (kwset->reverse) { - if (curr->maxshift > curr->parent->maxshift) - curr->maxshift = curr->parent->maxshift; - if (curr->shift > curr->maxshift) - curr->shift = curr->maxshift; + /* Traverse the trie in level order again, fixing up all nodes whose + shift exceeds their inherited maxshift. */ + for (curr = kwset->trie->next; curr; curr = curr->next) + { + if (curr->maxshift > curr->parent->maxshift) + curr->maxshift = curr->parent->maxshift; + if (curr->shift > curr->maxshift) + curr->shift = curr->maxshift; + } } /* Create a vector, indexed by character code, of the outgoing links @@ -470,6 +521,7 @@ kwsprep (kwset_t kwset) kwset->target[i] = curr->links->label; curr = curr->next; } + /* Looking for the delta2 shift that we might make after a backwards match has failed. Extract it from the trie. */ if (kwset->mind > 1) @@ -669,16 +721,26 @@ bmexec_trans (kwset_t kwset, char const *text, size_t size) /* Fast Boyer-Moore search. */ static size_t -bmexec (kwset_t kwset, char const *text, size_t size) +bmexec (kwset_t kwset, char const *text, size_t size, + struct kwsmatch *kwsmatch) { /* Help the compiler inline bmexec_trans in two ways, depending on whether kwset->trans is null. */ - return (kwset->trans + size_t ret = (kwset->trans ? bmexec_trans (kwset, text, size) : bmexec_trans (kwset, text, size)); + + if (ret != (size_t) -1) + { + kwsmatch->index = 0; + kwsmatch->offset[0] = ret; + kwsmatch->size[0] = kwset->mind; + } + + return ret; } -/* Hairy multiple string search. */ +/* Hairy multiple string search with Commentz-Walter algorithm. */ static size_t _GL_ARG_NONNULL ((4)) cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) { @@ -834,6 +896,176 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) return mch - text; } +/* Hairy multiple string search with Aho-Corasick algorithm. + (inlinable version) */ +static inline size_t _GL_ARG_NONNULL ((4)) +acexec_trans (kwset_t kwset, char const *text, size_t len, + struct kwsmatch *kwsmatch) +{ + struct trie * const *next; + struct trie const *trie, *accept; + char const *tp, *left, *lim; + unsigned char c; + struct tree const *tree; + char const *trans; + +#ifdef lint + accept = NULL; +#endif + + /* Initialize register copies and look for easy ways out. */ + if (len < kwset->mind) + return -1; + + next = kwset->next; + trans = kwset->trans; + lim = text + len; + tp = text; + + if (kwset->trie->accepting) + { + trie = kwset->trie; + goto match; + } + + trie = next[U(tr (trans, *tp++))]; + + while (true) + { + if (tp < lim - 8) + { + while (!trie) + { + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + if (tp >= lim - 8) + break; + trie = next[U(tr (trans, *tp++))]; + } + } + + while (!trie) + { + if (tp >= lim) + return -1; + trie = next[U(tr (trans, *tp++))]; + } + + if (trie->accepting) + goto match; + if (tp >= lim) + return -1; + c = tr (trans, *tp++); + tree = trie->links; + + while (true) + { + if (c == tree->label) + { + trie = tree->trie; + if (trie->accepting) + goto match; + if (tp >= lim) + return -1; + c = tr (trans, *tp++); + tree = trie->links; + continue; + } + else if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (tree) + continue; + trie = trie->fail; + if (!trie) + break; + if (trie->accepting) + { + --tp; + goto match; + } + tree = trie->links; + } + + trie = next[c]; + } + + match: + accept = trie; + while (accept->accepting == (size_t) -1) + accept = accept->fail; + left = tp - accept->depth; + + /* Try left-most longest match. */ + while (tp < lim) + { + struct trie const *accept1; + char const *left1; + c = tr (trans, *tp++); + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (!tree) + break; + trie = tree->trie; + if (!trie->accepting) + continue; + accept1 = trie; + while (accept1->accepting == (size_t) -1) + accept1 = accept1->fail; + left1 = tp - accept1->depth; + if (left1 <= left) + { + left = left1; + accept = accept1; + } + } + + kwsmatch->index = accept->accepting / 2; + kwsmatch->offset[0] = left - text; + kwsmatch->size[0] = accept->depth; + + return left - text; +} + +/* Hairy multiple string search with Aho-Corasick algorithm. */ +static size_t +acexec (kwset_t kwset, char const *text, size_t size, + struct kwsmatch *kwsmatch) +{ + /* Help the compiler inline bmexec_trans in two ways, depending on + whether kwset->trans is null. */ + return (kwset->trans + ? acexec_trans (kwset, text, size, kwsmatch) + : acexec_trans (kwset, text, size, kwsmatch)); +} + /* Search TEXT for a match of any member of KWSET. Return the offset (into TEXT) of the first byte of the matching substring, or (size_t) -1 if no match is found. Upon a match, store details in @@ -843,19 +1075,7 @@ size_t kwsexec (kwset_t kwset, char const *text, size_t size, struct kwsmatch *kwsmatch) { - if (kwset->words == 1) - { - size_t ret = bmexec (kwset, text, size); - if (ret != (size_t) -1) - { - kwsmatch->index = 0; - kwsmatch->offset[0] = ret; - kwsmatch->size[0] = kwset->mind; - } - return ret; - } - else - return cwexec (kwset, text, size, kwsmatch); + return kwset->kwsexec (kwset, text, size, kwsmatch); } /* Free the components of the given keyword set. */ diff --git a/src/kwset.h b/src/kwset.h index 12afb8e..2476db3 100644 --- a/src/kwset.h +++ b/src/kwset.h @@ -22,6 +22,7 @@ or (US mail) as Mike Haertel c/o Free Software Foundation. */ #include <stddef.h> +#include <stdbool.h> struct kwsmatch { @@ -38,7 +39,7 @@ typedef struct kwset *kwset_t; /* Return an opaque pointer to a newly allocated keyword set. A nonnull arg specifies a table of character translations to be applied to all pattern and search text. */ -extern kwset_t kwsalloc (char const *); +extern kwset_t kwsalloc (char const *, bool const); /* Incrementally extend the keyword set to include the given string. Remember an index number for each keyword included in the set. */ diff --git a/src/searchutils.c b/src/searchutils.c index 9edc785..314ab8e 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -39,10 +39,10 @@ kwsinit (kwset_t *kwset) for (i = 0; i < NCHAR; ++i) trans[i] = toupper (i); - *kwset = kwsalloc (trans); + *kwset = kwsalloc (trans, false); } else - *kwset = kwsalloc (NULL); + *kwset = kwsalloc (NULL, false); if (!*kwset) xalloc_die (); -- 2.2.0
From e9496f8d82ea9ba260d337a2d8fae32161633637 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Sat, 22 Nov 2014 15:16:55 +0900 Subject: [PATCH 2/3] grep immediately return without longest match to search multiple fixed words if not needed Searching multiple fixed words, grep immediately returns without longest match if not needed. Without this change, grep tries longest match for multiple words even if not needed. * src/kwset.c (kwsexec, acexec, cwexec, bmexec): Add an argument which is bool whether needed longest match or not. All callers are changed. * src/kwset.h (kwsexec): Update prototype. --- src/dfasearch.c | 2 +- src/kwsearch.c | 18 +++++-- src/kwset.c | 165 ++++++++++++++++++++++++++++++-------------------------- src/kwset.h | 4 +- 4 files changed, 105 insertions(+), 84 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 77b4e3e..1eb24ec 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -239,7 +239,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, /* Find a possible match using the KWset matcher. */ size_t offset = kwsexec (kwset, beg - begline, - buflim - beg + begline, &kwsm); + buflim - beg + begline, &kwsm, true); if (offset == (size_t) -1) goto failure; match = beg + offset; diff --git a/src/kwsearch.c b/src/kwsearch.c index 1335a26..c44ec08 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -111,6 +111,8 @@ Fexecute (char const *buf, size_t size, size_t *match_size, struct kwsmatch kwsmatch; size_t ret_val; mb_len_map_t *map = NULL; + bool mb_check; + bool longest; if (MB_CUR_MAX > 1) { @@ -123,15 +125,23 @@ Fexecute (char const *buf, size_t size, size_t *match_size, } } + if (match_lines) + mb_check = longest = false; + else + { + mb_check = MB_CUR_MAX > 1 && !using_utf8 (); + longest = mb_check || start_ptr || match_words; + } + for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) { size_t offset = kwsexec (kwset, beg - match_lines, - buf + size - beg + match_lines, &kwsmatch); + buf + size - beg + match_lines, &kwsmatch, + longest); if (offset == (size_t) -1) goto failure; len = kwsmatch.size[0] - 2 * match_lines; - if (!match_lines && MB_CUR_MAX > 1 && !using_utf8 () - && mb_goback (&mb_start, beg + offset, buf + size) != 0) + if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0) { /* We have matched a single byte that is not at the beginning of a multibyte character. mb_goback has advanced MB_START past that @@ -166,7 +176,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size, { if (!len) break; - offset = kwsexec (kwset, beg, --len, &kwsmatch); + offset = kwsexec (kwset, beg, --len, &kwsmatch, true); if (offset == (size_t) -1) break; try = beg + offset; diff --git a/src/kwset.c b/src/kwset.c index 9585f08..a02bade 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -116,7 +116,8 @@ struct kwset bool reverse; /* kwsexec() implementation. */ - size_t (*kwsexec) (kwset_t, char const *, size_t, struct kwsmatch *); + size_t (*kwsexec) (kwset_t, char const *, size_t, struct kwsmatch *, + bool); }; /* Use TRANS to transliterate C. A null TRANS does no transliteration. */ @@ -126,9 +127,12 @@ tr (char const *trans, char c) return trans ? trans[U(c)] : c; } -static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *); -static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *); -static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *); +static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool); +static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool); +static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool); /* Allocate and initialize a keyword set object, returning an opaque pointer to it. */ @@ -722,7 +726,7 @@ bmexec_trans (kwset_t kwset, char const *text, size_t size) /* Fast Boyer-Moore search. */ static size_t bmexec (kwset_t kwset, char const *text, size_t size, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { /* Help the compiler inline bmexec_trans in two ways, depending on whether kwset->trans is null. */ @@ -742,7 +746,8 @@ bmexec (kwset_t kwset, char const *text, size_t size, /* Hairy multiple string search with Commentz-Walter algorithm. */ static size_t _GL_ARG_NONNULL ((4)) -cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) +cwexec (kwset_t kwset, char const *text, size_t len, + struct kwsmatch *kwsmatch, bool longest) { struct trie * const *next; struct trie const *trie; @@ -837,56 +842,59 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) /* Given a known match, find the longest possible match anchored at or before its starting point. This is nearly a verbatim copy of the preceding main search loops. */ - if (lim - mch > kwset->maxd) - lim = mch + kwset->maxd; - lmch = 0; - d = 1; - while (lim - end >= d) + if (longest) { - if ((d = delta[c = (end += d)[-1]]) != 0) - continue; - beg = end - 1; - if (!(trie = next[c])) - { - d = 1; - continue; - } - if (trie->accepting && beg <= mch) - { - lmch = beg; - accept = trie; - } - d = trie->shift; - while (beg > text) + if (lim - mch > kwset->maxd) + lim = mch + kwset->maxd; + lmch = 0; + d = 1; + while (lim - end >= d) { - unsigned char uc = *--beg; - c = trans ? trans[uc] : uc; - tree = trie->links; - while (tree && c != tree->label) - if (c < tree->label) - tree = tree->llink; - else - tree = tree->rlink; - if (tree) + if ((d = delta[c = (end += d)[-1]]) != 0) + continue; + beg = end - 1; + if (!(trie = next[c])) { - trie = tree->trie; - if (trie->accepting && beg <= mch) + d = 1; + continue; + } + if (trie->accepting && beg <= mch) + { + lmch = beg; + accept = trie; + } + d = trie->shift; + while (beg > text) + { + unsigned char uc = *--beg; + c = trans ? trans[uc] : uc; + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (tree) { - lmch = beg; - accept = trie; + trie = tree->trie; + if (trie->accepting && beg <= mch) + { + lmch = beg; + accept = trie; + } } + else + break; + d = trie->shift; } - else - break; - d = trie->shift; - } - if (lmch) - { - mch = lmch; - goto match; + if (lmch) + { + mch = lmch; + goto match; + } + if (!d) + d = 1; } - if (!d) - d = 1; } kwsmatch->index = accept->accepting / 2; @@ -900,7 +908,7 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) (inlinable version) */ static inline size_t _GL_ARG_NONNULL ((4)) acexec_trans (kwset_t kwset, char const *text, size_t len, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { struct trie * const *next; struct trie const *trie, *accept; @@ -1020,30 +1028,33 @@ acexec_trans (kwset_t kwset, char const *text, size_t len, left = tp - accept->depth; /* Try left-most longest match. */ - while (tp < lim) + if (longest) { - struct trie const *accept1; - char const *left1; - c = tr (trans, *tp++); - tree = trie->links; - while (tree && c != tree->label) - if (c < tree->label) - tree = tree->llink; - else - tree = tree->rlink; - if (!tree) - break; - trie = tree->trie; - if (!trie->accepting) - continue; - accept1 = trie; - while (accept1->accepting == (size_t) -1) - accept1 = accept1->fail; - left1 = tp - accept1->depth; - if (left1 <= left) + while (tp < lim) { - left = left1; - accept = accept1; + struct trie const *accept1; + char const *left1; + c = tr (trans, *tp++); + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (!tree) + break; + trie = tree->trie; + if (!trie->accepting) + continue; + accept1 = trie; + while (accept1->accepting == (size_t) -1) + accept1 = accept1->fail; + left1 = tp - accept1->depth; + if (left1 <= left) + { + left = left1; + accept = accept1; + } } } @@ -1057,13 +1068,13 @@ acexec_trans (kwset_t kwset, char const *text, size_t len, /* Hairy multiple string search with Aho-Corasick algorithm. */ static size_t acexec (kwset_t kwset, char const *text, size_t size, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { /* Help the compiler inline bmexec_trans in two ways, depending on whether kwset->trans is null. */ return (kwset->trans - ? acexec_trans (kwset, text, size, kwsmatch) - : acexec_trans (kwset, text, size, kwsmatch)); + ? acexec_trans (kwset, text, size, kwsmatch, longest) + : acexec_trans (kwset, text, size, kwsmatch, longest)); } /* Search TEXT for a match of any member of KWSET. @@ -1073,9 +1084,9 @@ acexec (kwset_t kwset, char const *text, size_t size, value), and length. */ size_t kwsexec (kwset_t kwset, char const *text, size_t size, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { - return kwset->kwsexec (kwset, text, size, kwsmatch); + return kwset->kwsexec (kwset, text, size, kwsmatch, longest); } /* Free the components of the given keyword set. */ diff --git a/src/kwset.h b/src/kwset.h index 2476db3..e36c860 100644 --- a/src/kwset.h +++ b/src/kwset.h @@ -54,8 +54,8 @@ extern void kwsprep (kwset_t); the matching substring in the integer it points to. Similarly, if foundindex is non-NULL, store the index of the particular keyword found therein. */ -extern size_t kwsexec (kwset_t, char const *, size_t, struct kwsmatch *) - _GL_ARG_NONNULL ((4)); +extern size_t kwsexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool const) _GL_ARG_NONNULL ((4)); /* Deallocate the given keyword set and all its associated storage. */ extern void kwsfree (kwset_t); -- 2.2.0
From 8f3e71ba78d932678947e63f96888d0281c8476b Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka <nori...@kcn.ne.jp> Date: Sat, 22 Nov 2014 15:21:35 +0900 Subject: [PATCH 3/3] use memchr2 for two patterns of a character * src/kwset.c (memchr2_kwset): Add a new function. grep uses memchr2 to search Just two letters. (cwexec, acexec_trans): Use it. --- src/kwset.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/kwset.c b/src/kwset.c index a02bade..2351a90 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -640,6 +640,31 @@ memchr_kwset (char const *s, size_t n, kwset_t kwset) return n == 0 ? NULL : memchr2 (s, kwset->gc1, kwset->gc1help, n); } +/* Return the address of the first byte in the buffer S (of size N) + that matches the last byte specified by KWSET, a singleton. */ +static size_t +memchr2_kwset (char const *s, size_t n, kwset_t kwset, + struct kwsmatch *kwsmatch) +{ + struct tree const *link = kwset->trie->links; + struct tree const *clink = link->llink ? link->llink : link->rlink; + + char const *mch = memchr2 (s, link->label, clink->label, n); + if (mch) + { + size_t off = mch - s; + if (*mch == link->label) + kwsmatch->index = link->trie->accepting / 2; + else + kwsmatch->index = clink->trie->accepting / 2; + kwsmatch->offset[0] = off; + kwsmatch->size[0] = 1; + return off; + } + else + return -1; +} + /* Fast Boyer-Moore search (inlinable version). */ static inline size_t _GL_ATTRIBUTE_PURE bmexec_trans (kwset_t kwset, char const *text, size_t size) @@ -767,6 +792,8 @@ cwexec (kwset_t kwset, char const *text, size_t len, /* Initialize register copies and look for easy ways out. */ if (len < kwset->mind) return -1; + if (!kwset->trans && kwset->maxd == 1 && kwset->words == 2) + return memchr2_kwset (text, len, kwset, kwsmatch); next = kwset->next; delta = kwset->delta; trans = kwset->trans; @@ -924,6 +951,8 @@ acexec_trans (kwset_t kwset, char const *text, size_t len, /* Initialize register copies and look for easy ways out. */ if (len < kwset->mind) return -1; + if (!kwset->trans && kwset->maxd == 1 && kwset->words == 2) + return memchr2_kwset (text, len, kwset, kwsmatch); next = kwset->next; trans = kwset->trans; -- 2.2.0