Hello I here attach compiled fnmatch I wrote as gnulib module. It always should give same answer as fnmatch. Still I assume UTF8 or singlebyte encoding otherwise I fallback. I could generalize this to wide characters but if I dont know any character is single wchar I cannot do much (UTF16 wchar in windows). If I focus on UTF8 I could add additional optimalizations.
Here is output of my benchmark: *abc* en_US.UTF-8 user: 2.71 user: 0.23 C user: 0.86 user: 0.23 This should be behaviour for most patterns. I assumed that matches are rare so made failing as fast as possible. *[abc]bc* en_US.UTF-8 user: 4.03 user: 2.80 UTF8 is slow because I call mblen. Adding inlined UTF8 char size would greatly speed things up. C user: 2.27 user: 0.76 This belongs to worst pattern type we can get because * optimalization is ineffective. I could speed it up but IMO leading *[ is too rare to care.(1) *ab[a-z]* en_US.UTF-8 user: 2.85 user: 3.32 C user: 0.86 user: 1.25 here we must fallback to fnmatch. Note additional cost same for all cases casefold *abc* en_US.UTF-8 user: 4.29 user: 4.85 fallback didnt added u8fold yet. C user: 1.07 user: 0.68 I could add following optimalization: speed up (1) then transform pattern like *ab[de] -> *[aA][bB][dDeE] in singlebyte encoding. Not sure with this one --- MODULES.html.sh | 2 + lib/fnmatchcomp.c | 458 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/fnmatchcomp.h | 39 ++++ lib/fnmatchcomp_loop.h | 160 +++++++++++++++++ lib/fnmatchcomp_star.h | 67 +++++++ modules/fnmatchcomp | 33 ++++ 6 files changed, 759 insertions(+), 0 deletions(-) create mode 100644 lib/fnmatchcomp.c create mode 100644 lib/fnmatchcomp.h create mode 100644 lib/fnmatchcomp_loop.h create mode 100644 lib/fnmatchcomp_star.h create mode 100644 modules/fnmatchcomp diff --git a/MODULES.html.sh b/MODULES.html.sh index 06afa2d..03d308e 100755 --- a/MODULES.html.sh +++ b/MODULES.html.sh @@ -61,6 +61,7 @@ fenv float fmtmsg fnmatch +fnmatchcomp ftw glob grp @@ -418,6 +419,7 @@ fmodf fmodl fmtmsg fnmatch +fnmatchcomp fopen fork fpathconf diff --git a/lib/fnmatchcomp.c b/lib/fnmatchcomp.c new file mode 100644 index 0000000..e22074a --- /dev/null +++ b/lib/fnmatchcomp.c @@ -0,0 +1,458 @@ +/* Copyright (C) 2009 + Ondrej Bilka <nel...@seznam.cz> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#include <stdint.h> + +#define _GNU_SOURCE +#include <fnmatch.h> +#include "fnmatchcomp.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <nl_types.h> +#include <langinfo.h> +#include <locale.h> +#include <ctype.h> + +#define UINT4 uint32_t +#define UINT2 uint16_t +#define CHARSPERINT 4 +#define STREQ(a, b) (strcmp(a, b) == 0) +#define MAX_PATTERN_SIZE (2*sizeof (bracket_pattern)) +typedef struct +{ + char tp; + char chr; +} chr_pattern; +typedef struct +{ + char tp; + char chars[CHARSPERINT]; + short sizerest; + short charno; +} star_pattern; +typedef struct +{ + char tp; +} end_pattern; +typedef struct +{ + char tp; +} endstar_pattern; +typedef struct +{ + char tp; + char flags; + char *chars; + int sizerest; + uint32_t hash[8]; +} bracket_pattern; +typedef struct +{ + char tp; +} begindot_pattern; +typedef struct +{ + char tp; +} slashend_pattern; +typedef struct +{ + char tp; + char *ptr; + char *prefix; + int flags; +} fallback_pattern; +typedef struct +{ + char tp; + void *states; +} fold_pattern; +typedef star_pattern starslash_pattern; +enum PATTERNS +{ + PATTERN_chr, + PATTERN_star, + PATTERN_starslash, + PATTERN_end, + PATTERN_endstar, + PATTERN_bracket, + PATTERN_begindot, + PATTERN_slashend, + PATTERN_fallback, + PATTERN_fold +}; +union _patterns +{ + chr_pattern chr; + star_pattern star; + starslash_pattern starslash; + end_pattern end; + endstar_pattern endstar; + bracket_pattern bracket; + begindot_pattern begindot; + fallback_pattern fallback; + fold_pattern fold; +}; +typedef union _patterns patterns; +void freepatterns (patterns * p); + +struct states +{ + patterns *p; + struct states *next; +}; +struct _fnmatch_state +{ + struct states *states; + int flags; + int *refcount; + patterns *start; +}; + +int +ascii_compatible_encoding (char *c) +{ + return (MB_CUR_MAX == 1 || STREQ (c, "UTF-8")); +} + +void +strfold (const char *str, char *buf) +{ + while (*str) + *buf++ = tolower (*str++); + *buf = 0; +} + +fnmatch_state * +initfnmatchstate () +{ + fnmatch_state *st = (fnmatch_state *) malloc (sizeof (fnmatch_state)); + st->states = malloc (sizeof (struct states)); + st->states->next = NULL; + st->refcount = malloc (sizeof (int)); + *st->refcount = 1; + return st; +} + +fnmatch_state * +fnmatch_fallback (const char *str, int flags) +{ + fnmatch_state *st = initfnmatchstate (); + st->start = st->states->p = malloc (sizeof (fallback_pattern)); + st->states->p->fallback.tp = PATTERN_fallback; + st->states->p->fallback.ptr = strdup (str); + st->states->p->fallback.flags = flags; + st->states->p->fallback.prefix = strdup (""); + return st; +} + +fnmatch_state * +fold_fallback (fnmatch_state * s) +{ + fnmatch_state *st = initfnmatchstate (); + st->start = st->states->p = malloc (sizeof (fold_pattern)); + st->states->p->fold.tp = PATTERN_fold; + st->states->p->fold.states = (void *) s; + return st; +} + +int +parse_bracket (const char *str, patterns * pat, int flags) +{ + char *chr; + const char *s = str; + pat->bracket.tp = PATTERN_bracket; + if (*s == '!' || (getenv ("POSIXLY_CORRECT") != NULL && *s == '^')) + { + pat->bracket.flags = 1; + s++; + } + int mlen; + + if (!pat->bracket.chars) + { + pat->bracket.chars = malloc (2 * strlen (s) + 1); + } + chr = pat->bracket.chars; + do + { + if (*s == 0 || *s == '-' || *s == '\\' || + (*s == '[' + && (*(s + 1) == ':' || *(s + 1) == '=' || *(s + 1) == '.'))) + return -1; + mlen = mblen (s, MB_CUR_MAX); + char lastbyte = s[mlen - 1]; + pat->bracket.hash[lastbyte / 32] |= 1 << (lastbyte % 32); + memcpy (chr, s, mlen); + s += mlen; + chr += mlen + 1; + } + while (*s && *s != ']'); + *chr = 0; + return s - str; +} + +#define NEXTPATTERN(type) pat->chr.tp = PATTERN_##type; pat = (patterns *)(((void *) pat)+sizeof (type##_pattern)); +#define FALLBACK freepatterns(ret); free(ret); +fnmatch_state * +fnmatch_compile (const char *str, int flags) +{ + const char *s; + patterns *pat, *ret; + int i, pass; + + char *codeset = nl_langinfo (CODESET); + if (!ascii_compatible_encoding (codeset)) + { + return fnmatch_fallback (str, flags); + } + if (flags & FNM_CASEFOLD) + { + if (MB_CUR_MAX != 1) + return fnmatch_fallback (str, flags); + char *buf = malloc (2 * strlen (str)); + strfold (str, buf); + fnmatch_state *st = + fold_fallback (fnmatch_compile (buf, flags & (!FNM_CASEFOLD))); + free (buf); + if (((fnmatch_state *) st->start->fold.states)->start->chr.tp + == PATTERN_fallback) + { + fnmatch_free (st); + return fnmatch_fallback (str, flags); + } + return st; + } + + ret = (patterns *) calloc (1, strlen (str) * MAX_PATTERN_SIZE); + + int size = 0, patsize; // compute minimal string size + + + for (pass = 0; pass < 2; pass++) + { // two passes in first we compute values we use + patsize = size; // in second to optimalize. + size = 0; + pat = ret; + if (flags & FNM_PERIOD && *str != '.') + { + NEXTPATTERN (begindot)} + for (s = str; *s; s++) + { + if (flags & FNM_EXTMATCH && *(s + 1) == '(' && + (*s == '*' || *s == '+' || *s == '@' || *s == '!')) + { + FALLBACK; + return fnmatch_fallback (str, flags); + } + switch (*s) + { + case '*': + while (*(s + 1) == '?') + { + parse_bracket ("!", pat, flags); + size++; + pat->bracket.sizerest = patsize - size; + NEXTPATTERN (bracket); + s++; + } + if (*(s + 1)) + { + if (pass) + { + patterns *tmppat = pat; + pat->star.sizerest = patsize - size; + NEXTPATTERN (star); + for (i = 0; pat->chr.tp == PATTERN_chr && pat->chr.chr + && i < CHARSPERINT; i++) + { + tmppat->star.chars[i] = pat->chr.chr; + NEXTPATTERN (chr); + } + if (i == 3) + i = 2; + tmppat->star.charno = i; + pat = tmppat; + } + if (flags & FNM_PATHNAME) + { + NEXTPATTERN (starslash); + } + else + { + NEXTPATTERN (star); + } + } + else + { + NEXTPATTERN (endstar); + } + break; + case '?': + parse_bracket ("!", pat, flags); // by specification leading ] is ordinary character + size++; + pat->bracket.sizerest = patsize - size; + NEXTPATTERN (bracket); + break; + case '[':; + int siz = parse_bracket (s + 1, pat, flags); + if (siz < 0) + { + FALLBACK; + return fnmatch_fallback (str, flags); + } + size++; + s += siz + 1; + pat->bracket.sizerest = patsize - size; + NEXTPATTERN (bracket); + break; + default: + if (*s == '\\' && (!(flags & FNM_NOESCAPE))) + s++; + pat->chr.chr = *s; + size++; + NEXTPATTERN (chr); + if (*s == '/' && (flags & FNM_PERIOD) && (flags & FNM_PATHNAME) + && *(s + 1) != '.') + { + NEXTPATTERN (begindot)} + + break; + } + } + } + if (flags & FNM_LEADING_DIR) + { + NEXTPATTERN (slashend) NEXTPATTERN (endstar)} + else + { + NEXTPATTERN (end)} + + fnmatch_state *st = initfnmatchstate (); + st->states->p = ret; + st->start = ret; + return st; +} + +#undef NEXTPATTERN +#define NEXTPATTERN(type) p = (patterns *)(((void *) p)+sizeof (type##_pattern)); + +inline int +fnmatch2_internal (patterns * p, const char *str, int len) +{ +#include "fnmatchcomp_loop.h" +} + +int +fnmatch_exec (fnmatch_state * p, const char *str) +{ + struct states *s; + int len = strlen (str); + for (s = p->states; s; s = s->next) + { + if (!fnmatch2_internal (s->p, str, len)) + return 0; + } + return FNM_NOMATCH; +} + +#define FNMATCH_PREFIX +int +fnmatch2_prefix_internal (patterns * p, const char *str, int len, + fnmatch_state * ret) +{ +#include "fnmatchcomp_loop.h" +} + +fnmatch_state * +fnmatch_prefix (fnmatch_state * p, const char *str) +{ + struct states *s; + int len = strlen (str); + fnmatch_state *ret = (fnmatch_state *) malloc (sizeof (fnmatch_state)); + memcpy ((void *) ret, (void *) p, sizeof (fnmatch_state)); + ret->states = NULL; + (*ret->refcount)++; + for (s = p->states; s; s = s->next) + { + fnmatch2_prefix_internal (s->p, str, len, ret); + } + return ret; +} + +void +freepatterns (patterns * p) +{ +#define SKIPPATTERN(type) case PATTERN_##type: NEXTPATTERN(type); break; + while (p->chr.tp != PATTERN_end && p->chr.tp != PATTERN_endstar) + { + switch (p->chr.tp) + { + case PATTERN_chr: + if (!p->chr.chr) + return; + NEXTPATTERN (chr); + break; + SKIPPATTERN (star); + SKIPPATTERN (starslash); + SKIPPATTERN (end); + SKIPPATTERN (endstar); + case PATTERN_bracket: + free (p->bracket.chars); + NEXTPATTERN (bracket); + break; + SKIPPATTERN (begindot); + SKIPPATTERN (slashend); + case PATTERN_fallback: + free (p->fallback.ptr); + return; + case PATTERN_fold: + return; + } + } + +} + +void +fnmatch_free (fnmatch_state * s) +{ + struct states *st, *st2; + for (st = s->states; st;) + { + if (st->p->chr.tp == PATTERN_fallback) + { + free (st->p->fallback.prefix); + } + if (st->p->chr.tp == PATTERN_fold) + { + fnmatch_free ((fnmatch_state *) st->p->fold.states); + } + + st2 = st->next; + free (st); + st = st2; + } + (*s->refcount)--; + if (!*s->refcount) + { + free (s->refcount); + freepatterns (s->start); + free (s->start); + } + free (s); +} diff --git a/lib/fnmatchcomp.h b/lib/fnmatchcomp.h new file mode 100644 index 0000000..2392182 --- /dev/null +++ b/lib/fnmatchcomp.h @@ -0,0 +1,39 @@ +/* Copyright (C) 2009 + Ondrej Bilka <nel...@seznam.cz> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + + +struct _fnmatch_state; +typedef struct _fnmatch_state fnmatch_state; + + /* compile pattern */ +fnmatch_state *fnmatch_compile (const char *pattern, int flags); + + /* add prefix to matched strings. + suppose you want match foo againist /home/foo, /home/bar/foo, + /home/bar/bar + fnmatch_state* s = fnmatch_compile("foo", 0); + fnmatch_state* home = fnmatch_prefix(s, "/home"); + fnmatch_state* bar = fnmatch_prefix(home, "/bar"); + fnmatch_exec(home, "/foo"); + fnmatch_exec(bar, "/foo"); + fnmatch_exec(bar, "/bar"); + */ +fnmatch_state *fnmatch_prefix (fnmatch_state * pattern, const char *prefix); + +int fnmatch_exec (fnmatch_state * pattern, const char *string); + +void fnmatch_free (fnmatch_state * pattern); diff --git a/lib/fnmatchcomp_loop.h b/lib/fnmatchcomp_loop.h new file mode 100644 index 0000000..1eb3af6 --- /dev/null +++ b/lib/fnmatchcomp_loop.h @@ -0,0 +1,160 @@ +/* Copyright (C) 2009 + Ondrej Bilka <nel...@seznam.cz> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + + +const char *s = str; +int i; +int sizerest, charno; +char *chars; +patterns *pat; +while (1) + { +#ifndef FNMATCH_PREFIX +#undef ADDSTATE +#define ADDSTATE +#else + struct states *state; +#undef ADDSTATE +#define ADDSTATE state = calloc(1, sizeof (struct states)); \ + state->next = ret->states; \ + ret->states = state; \ + state->p = p; + if (!*s) + { + ADDSTATE; + return 0; + } +#endif + switch (p->chr.tp) + { + case PATTERN_chr: + if (*s != p->chr.chr) + return FNM_NOMATCH; + NEXTPATTERN (chr); + s++; + break; + case PATTERN_starslash: +#undef SLASHTEST +#define SLASHTEST if (str[i] == '/') return FNM_NOMATCH; +#include "fnmatchcomp_star.h" + break; + case PATTERN_star: +#undef SLASHTEST +#define SLASHTEST +#include "fnmatchcomp_star.h" + break; + + case PATTERN_end: + return (*s) ? FNM_NOMATCH : 0; + break; + case PATTERN_slashend: + if (*s == '/' || *s == 0) + { + NEXTPATTERN (slashend); + } + break; + case PATTERN_endstar: + ADDSTATE; + return 0; + break; + case PATTERN_bracket: + chars = p->bracket.chars; + int mat; + int mlen, mlen2; + unsigned char lastbyte; + if (MB_CUR_MAX == 1) + { + mlen = 1; + lastbyte = s[mlen - 1]; + mat = (p->bracket.hash[lastbyte / 32] & + (1 << (lastbyte % 32))) ? 1 : 0; + } + else + { + mlen = mblen (s, MB_CUR_MAX); + mat = 1; + if (mlen < 0) + return FNM_NOMATCH; + lastbyte = s[mlen - 1]; + if (!(p->bracket.hash[lastbyte / 32] & (1 << (lastbyte % 32)))) + { + mat = 0; + goto match; + } + while (*chars) + { + mlen2 = strlen (chars); + if (!strncmp (chars, s, mlen) && mlen == mlen2) + goto match; + chars += mlen2 + 1; + } + mat = 0; + } + match: + if (mat ^ (p->bracket.flags & 1)) + { + if (p->bracket.flags & 2 && *s == '/') + return FNM_NOMATCH; + s += mlen; + if (len - (str - s) < p->bracket.sizerest) + return FNM_NOMATCH; + } + else + { + return FNM_NOMATCH; + } + NEXTPATTERN (bracket); + break; + case PATTERN_begindot: + if (*s == '.') + return FNM_NOMATCH; + NEXTPATTERN (begindot); + break; + case PATTERN_fallback: + ; + char *stri = alloca (strlen (p->fallback.prefix) + len + 1); + strcpy (stri, p->fallback.prefix); + strcat (stri, s); +#ifndef FNMATCH_PREFIX + int ret = fnmatch (p->fallback.ptr, stri, p->fallback.flags); + return ret; +#else + patterns *pat2 = malloc (sizeof (fallback_pattern)); + memcpy (pat2, p, sizeof (fallback_pattern)); + pat2->fallback.prefix = stri; + p = (patterns *) pat2; + ADDSTATE; + return FNM_NOMATCH; +#endif + break; + case PATTERN_fold:; + char *buf = alloca (len + 1); + strfold (str, buf); +#ifndef FNMATCH_PREFIX + return fnmatch_exec ((fnmatch_state *) p->fold.states, buf); +#else + pat2 = malloc (sizeof (fallback_pattern)); + pat2->fold.tp = PATTERN_fold; + pat2->fold.states = + (void *) fold_fallback (fnmatch_prefix (p->fold.states, buf)); + p = (patterns *) pat2; + ADDSTATE; + return FNM_NOMATCH; +#endif + break; + } + } diff --git a/lib/fnmatchcomp_star.h b/lib/fnmatchcomp_star.h new file mode 100644 index 0000000..27b29ae --- /dev/null +++ b/lib/fnmatchcomp_star.h @@ -0,0 +1,67 @@ +/* Copyright (C) 2009 + Ondrej Bilka <nel...@seznam.cz> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + + +#ifndef FNMATCH_PREFIX +sizerest = p->star.sizerest; +chars = p->star.chars; +charno = p->star.charno; +NEXTPATTERN (star); +for (i = 0; i < charno; i++) + { + NEXTPATTERN (chr); + } + +switch (charno) + { + case 0: + for (i = s - str; i < len - sizerest; i++) + { + if (!fnmatch2_internal (p, str + i, len - i)) + return 0; + SLASHTEST; + } + break; +#define CASEN(type, no) \ + case no:\ + for (i = s-str; i <= len-sizerest; i++) {\ + if ((*((type*)(str+i)) == *(type *) chars)\ + && !fnmatch2_internal(p, str+i+no, len-i-no)) return 0; \ + SLASHTEST \ + } \ + break; + CASEN (char, 1); + CASEN (UINT2, 2); + case 3: + CASEN (UINT4, 4); + } + +return FNM_NOMATCH; +#else +pat = p; +NEXTPATTERN (star); +for (i = s - str; i < len; i++) + { + fnmatch2_prefix_internal (p, str + i, len - i, ret); + SLASHTEST; + } + +p = pat; +ADDSTATE; +return FNM_NOMATCH; +#endif +break; diff --git a/modules/fnmatchcomp b/modules/fnmatchcomp new file mode 100644 index 0000000..a28192c --- /dev/null +++ b/modules/fnmatchcomp @@ -0,0 +1,33 @@ +Description: + Allows you to compile fnmatch pattern. You can add to this pattern prefix + which will be added as prefix of any string you will match with this pattern +Files: +lib/fnmatchcomp.c +lib/fnmatchcomp.h +lib/fnmatchcomp_loop.h +lib/fnmatchcomp_star.h + +Depends-on: +fnmatch +extensions +alloca +stdbool +wchar +wctype +memchr +memcmp +mbsrtowcs +mbsinit + +configure.ac: + +Makefile.am: + +Include: +<fnmatchcomp.h> + +License: +LGPLv2+ + +Maintainer: +Ondrej Bilka -- 1.5.6.5