Attached are some proposed patches which should improve the performance
of grep -P when applied to binary files, among other things. I have
some other ideas for boosting performance further but thought I'd
publish these first. Please give them a try if you have the time. I
doubt that these patches will entirely "solve" the performance problem
with -P and encoding errors, but they should at least be a step in the
right direction.
From ad34b7d8556e9fc274690666ac6ded2b6576feb3 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 11:42:08 -0700
Subject: [PROPOSED PATCH 1/6] grep: remove/refactor unnecessary code about
line splitting
* src/grep.c (do_execute): Remove. Caller now uses 'execute'.
* src/pcresearch.c (Pexecute): Improve comment about this.
---
src/grep.c | 45 +--------------------------------------------
src/pcresearch.c | 7 +++++--
2 files changed, 6 insertions(+), 46 deletions(-)
diff --git a/src/grep.c b/src/grep.c
index 1f801e9..719dff1 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1048,49 +1048,6 @@ prtext (char const *beg, char const *lim)
outleft -= n;
}
-/* Invoke the matcher, EXECUTE, on buffer BUF of SIZE bytes. If there
- is no match, return (size_t) -1. Otherwise, set *MATCH_SIZE to the
- length of the match and return the offset of the start of the match. */
-static size_t
-do_execute (char const *buf, size_t size, size_t *match_size)
-{
- size_t result;
- const char *line_next;
-
- /* With the current implementation, using --ignore-case with a multi-byte
- character set is very inefficient when applied to a large buffer
- containing many matches. We can avoid much of the wasted effort
- by matching line-by-line.
-
- FIXME: this is just an ugly workaround, and it doesn't really
- belong here. Also, PCRE is always using this same per-line
- matching algorithm. Either we fix -i, or we should refactor
- this code---for example, we could add another function pointer
- to struct matcher to split the buffer passed to execute. It would
- perform the memchr if line-by-line matching is necessary, or just
- return buf + size otherwise. */
- if (! (execute == Fexecute || execute == Pexecute)
- || MB_CUR_MAX == 1 || !match_icase)
- return execute (buf, size, match_size, NULL);
-
- for (line_next = buf; line_next < buf + size; )
- {
- const char *line_buf = line_next;
- const char *line_end = memchr (line_buf, eolbyte,
- (buf + size) - line_buf);
- if (line_end == NULL)
- line_next = line_end = buf + size;
- else
- line_next = line_end + 1;
-
- result = execute (line_buf, line_next - line_buf, match_size, NULL);
- if (result != (size_t) -1)
- return (line_buf - buf) + result;
- }
-
- return (size_t) -1;
-}
-
/* Scan the specified portion of the buffer, matching lines (or
between matching lines if OUT_INVERT is true). Return a count of
lines printed. */
@@ -1104,7 +1061,7 @@ grepbuf (char const *beg, char const *lim)
for (p = beg; p < lim; p = endp)
{
size_t match_size;
- size_t match_offset = do_execute (p, lim - p, &match_size);
+ size_t match_offset = execute (p, lim - p, &match_size, NULL);
if (match_offset == (size_t) -1)
{
if (!out_invert)
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3475d4a..0c5220d 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -149,8 +149,11 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
int e = PCRE_ERROR_NOMATCH;
char const *line_end;
- /* PCRE can't limit the matching to single lines, therefore we have to
- match each line in the buffer separately. */
+ /* pcre_exec mishandles matches that cross line boundaries.
+ PCRE_MULTILINE isn't a win, partly because it's incompatible with
+ -z, and partly because it checks the entire input buffer and is
+ therefore slow on a large buffer containing many matches.
+ Avoid these problems by matching line-by-line. */
for (; p < buf + size; p = line_start = line_end + 1)
{
line_end = memchr (p, eolbyte, buf + size - p);
--
1.9.3
From b7b7711dd072c335a45dbf09115b1597fed2ae76 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 11:44:12 -0700
Subject: [PROPOSED PATCH 2/6] grep: speed up -P on files containing many
multibyte errors
* src/pcresearch.c (empty_match): New var.
(Pcompile): Set it.
(Pexecute): Use it.
---
src/pcresearch.c | 26 ++++++++++++++++++--------
1 file changed, 18 insertions(+), 8 deletions(-)
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 0c5220d..95877e3 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -33,6 +33,10 @@ static pcre *cre;
/* Additional information about the pattern. */
static pcre_extra *extra;
+/* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
+ string matches when that flag is used. */
+static int empty_match[2];
+
# ifdef PCRE_STUDY_JIT_COMPILE
static pcre_jit_stack *jit_stack;
# else
@@ -124,6 +128,10 @@ Pcompile (char const *pattern, size_t size)
_("failed to allocate memory for the PCRE JIT stack"));
pcre_assign_jit_stack (extra, NULL, jit_stack);
}
+
+ empty_match[false] = pcre_exec (cre, extra, "", 0, 0, PCRE_NOTBOL, NULL, 0);
+ empty_match[true] = pcre_exec (cre, extra, "", 0, 0, 0, NULL, 0);
+
# endif
free (re);
#endif /* HAVE_LIBPCRE */
@@ -144,7 +152,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
int sub[nsub];
char const *p = start_ptr ? start_ptr : buf;
- int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
+ bool bol = p[-1] == eolbyte;
char const *line_start = buf;
int e = PCRE_ERROR_NOMATCH;
char const *line_end;
@@ -164,23 +172,26 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
/* Treat encoding-error bytes as data that cannot match. */
for (;;)
{
+ int options = bol ? 0 : PCRE_NOTBOL;
int valid_bytes;
e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
if (e != PCRE_ERROR_BADUTF8)
break;
valid_bytes = sub[0];
- e = pcre_exec (cre, extra, p, valid_bytes, 0,
- options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
- sub, nsub);
+ e = (valid_bytes == 0
+ ? empty_match[bol]
+ : pcre_exec (cre, extra, p, valid_bytes, 0,
+ options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL,
+ sub, nsub));
if (e != PCRE_ERROR_NOMATCH)
break;
p += valid_bytes + 1;
- options = PCRE_NOTBOL;
+ bol = false;
}
if (e != PCRE_ERROR_NOMATCH)
break;
- options = 0;
+ bol = true;
}
if (e <= 0)
@@ -188,7 +199,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
switch (e)
{
case PCRE_ERROR_NOMATCH:
- return -1;
+ break;
case PCRE_ERROR_NOMEMORY:
error (EXIT_TROUBLE, 0, _("memory exhausted"));
@@ -205,7 +216,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
error (EXIT_TROUBLE, 0, _("internal PCRE error: %d"), e);
}
- /* NOTREACHED */
return -1;
}
else
--
1.9.3
From f4a95dff902840826eed69fcc7205db5b3e86573 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sat, 13 Sep 2014 17:58:53 -0700
Subject: [PROPOSED PATCH 3/6] grep: use bool for boolean in grep.c
* src/grep.c (show_version, suppress_errors, only_matching)
(align_tabs, match_icase, match_words, match_lines, errseen)
(write_error_seen, is_device_mode, usable_st_size)
(file_is_binary, skipped_file, reset, fillbuf, out_quiet)
(out_line, out_byte, count_matches, no_filenames, line_buffered)
(done_on_match, exit_on_match, print_line_head, prline, grep)
(grepdirent, grepfile, grepdesc, grep_command_line_arg)
(get_nondigit_option, main): Use bool for boolean.
(print_line_head, prline): Use char for byte.
* src/grep.h: Include <stdbool.h>, and adjust decls to match
changes in grep.c.
---
src/grep.c | 232 +++++++++++++++++++++++++++++++------------------------------
src/grep.h | 8 ++-
2 files changed, 124 insertions(+), 116 deletions(-)
diff --git a/src/grep.c b/src/grep.c
index 719dff1..1e0cc6d 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -65,20 +65,20 @@ static struct stat out_stat;
/* if non-zero, display usage information and exit */
static int show_help;
-/* If non-zero, print the version on standard output and exit. */
-static int show_version;
+/* Print the version on standard output and exit. */
+static bool show_version;
-/* If nonzero, suppress diagnostics for nonexistent or unreadable files. */
-static int suppress_errors;
+/* Suppress diagnostics for nonexistent or unreadable files. */
+static bool suppress_errors;
/* If nonzero, use color markers. */
static int color_option;
-/* If nonzero, show only the part of a line matching the expression. */
-static int only_matching;
+/* Show only the part of a line matching the expression. */
+static bool only_matching;
/* If nonzero, make sure first content char in a line is on a tab stop. */
-static int align_tabs;
+static bool align_tabs;
/* The group separator used when context is requested. */
static const char *group_separator = SEP_STR_GROUP;
@@ -347,9 +347,9 @@ static struct option const long_options[] =
};
/* Define flags declared in grep.h. */
-int match_icase;
-int match_words;
-int match_lines;
+bool match_icase;
+bool match_words;
+bool match_lines;
unsigned char eolbyte;
static char const *matcher;
@@ -358,8 +358,8 @@ static char const *matcher;
/* The input file name, or (if standard input) "-" or a --label argument. */
static char const *filename;
static size_t filename_prefix_len;
-static int errseen;
-static int write_error_seen;
+static bool errseen;
+static bool write_error_seen;
enum directories_type
{
@@ -392,22 +392,22 @@ static enum
SKIP_DEVICES
} devices = READ_COMMAND_LINE_DEVICES;
-static int grepfile (int, char const *, int, int);
-static int grepdesc (int, int);
+static bool grepfile (int, char const *, bool, bool);
+static bool grepdesc (int, bool);
static void dos_binary (void);
static void dos_unix_byte_offsets (void);
static size_t undossify_input (char *, size_t);
-static int
+static bool
is_device_mode (mode_t m)
{
return S_ISCHR (m) || S_ISBLK (m) || S_ISSOCK (m) || S_ISFIFO (m);
}
-/* Return nonzero if ST->st_size is defined. Assume the file is not a
+/* Return true if ST->st_size is defined. Assume the file is not a
symbolic link. */
-static int
+static bool
usable_st_size (struct stat const *st)
{
return S_ISREG (st->st_mode) || S_TYPEISSHM (st) || S_TYPEISTMO (st);
@@ -425,7 +425,7 @@ suppressible_error (char const *mesg, int errnum)
{
if (! suppress_errors)
error (0, errnum, "%s", mesg);
- errseen = 1;
+ errseen = true;
}
/* If there has already been a write error, don't bother closing
@@ -437,10 +437,10 @@ clean_up_stdout (void)
close_stdout ();
}
-/* Return 1 if a file is known to be binary for the purpose of 'grep'.
+/* Return true if a file is known to be binary for the purpose of 'grep'.
BUF, of size BUFSIZE, is the initial buffer read from the file with
descriptor FD and status ST. */
-static int
+static bool
file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
{
#ifndef SEEK_HOLE
@@ -455,7 +455,7 @@ file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
/* If the initial buffer contains a null byte, guess that the file
is binary. */
if (memchr (buf, '\0', bufsize))
- return 1;
+ return true;
/* If the file has holes, it must contain a null byte somewhere. */
if (SEEK_HOLE != SEEK_END && usable_st_size (st))
@@ -465,7 +465,7 @@ file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
{
cur = lseek (fd, 0, SEEK_CUR);
if (cur < 0)
- return 0;
+ return false;
}
/* Look for a hole after the current location. */
@@ -475,12 +475,12 @@ file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
if (lseek (fd, cur, SEEK_SET) < 0)
suppressible_error (filename, errno);
if (hole_start < st->st_size)
- return 1;
+ return true;
}
}
/* Guess that the file does not contain binary data. */
- return 0;
+ return false;
}
/* Convert STR to a nonnegative integer, storing the result in *OUT.
@@ -503,11 +503,11 @@ context_length_arg (char const *str, intmax_t *out)
}
}
-/* Return nonzero if the file with NAME should be skipped.
- If COMMAND_LINE is nonzero, it is a command-line argument.
- If IS_DIR is nonzero, it is a directory. */
-static int
-skipped_file (char const *name, int command_line, int is_dir)
+/* Return true if the file with NAME should be skipped.
+ If COMMAND_LINE, it is a command-line argument.
+ If IS_DIR, it is a directory. */
+static bool
+skipped_file (char const *name, bool command_line, bool is_dir)
{
return (is_dir
? (directories == SKIP_DIRECTORIES
@@ -541,9 +541,9 @@ static off_t after_last_match; /* Pointer after last matching line that
? (val) \
: (val) + ((alignment) - (size_t) (val) % (alignment)))
-/* Reset the buffer for a new file, returning zero if we should skip it.
+/* Reset the buffer for a new file, returning false if we should skip it.
Initialize on the first time through. */
-static int
+static bool
reset (int fd, struct stat const *st)
{
if (! pagesize)
@@ -569,22 +569,22 @@ reset (int fd, struct stat const *st)
if (bufoffset < 0)
{
suppressible_error (_("lseek failed"), errno);
- return 0;
+ return false;
}
}
}
- return 1;
+ return true;
}
/* Read new stuff into the buffer, saving the specified
amount of old stuff. When we're done, 'bufbeg' points
to the beginning of the buffer contents, and 'buflim'
- points just after the end. Return zero if there's an error. */
-static int
+ points just after the end. Return false if there's an error. */
+static bool
fillbuf (size_t save, struct stat const *st)
{
size_t fillsize;
- int cc = 1;
+ bool cc = true;
char *readbuf;
size_t readsize;
@@ -646,7 +646,10 @@ fillbuf (size_t save, struct stat const *st)
fillsize = safe_read (bufdesc, readbuf, readsize);
if (fillsize == SAFE_READ_ERROR)
- fillsize = cc = 0;
+ {
+ fillsize = 0;
+ cc = false;
+ }
bufoffset += fillsize;
fillsize = undossify_input (readbuf, fillsize);
buflim = readbuf + fillsize;
@@ -662,20 +665,19 @@ static enum
} binary_files; /* How to handle binary files. */
static int filename_mask; /* If zero, output nulls after filenames. */
-static int out_quiet; /* Suppress all normal output. */
+static bool out_quiet; /* Suppress all normal output. */
static bool out_invert; /* Print nonmatching stuff. */
static int out_file; /* Print filenames. */
-static int out_line; /* Print line numbers. */
-static int out_byte; /* Print byte offsets. */
+static bool out_line; /* Print line numbers. */
+static bool out_byte; /* Print byte offsets. */
static intmax_t out_before; /* Lines of leading context. */
static intmax_t out_after; /* Lines of trailing context. */
-static int count_matches; /* Count matching lines. */
+static bool count_matches; /* Count matching lines. */
static int list_files; /* List matching files. */
-static int no_filenames; /* Suppress file names. */
+static bool no_filenames; /* Suppress file names. */
static intmax_t max_count; /* Stop after outputting this many
lines from an input file. */
-static int line_buffered; /* If nonzero, use line buffering, i.e.
- fflush everyline out. */
+static bool line_buffered; /* Use line buffering. */
static char *label = NULL; /* Fake filename for stdin */
@@ -689,8 +691,8 @@ static uintmax_t totalnl; /* Total newline count before lastnl. */
static intmax_t outleft; /* Maximum number of lines to be output. */
static intmax_t pending; /* Pending lines of output.
Always kept 0 if out_quiet is true. */
-static int done_on_match; /* Stop scanning file on first match. */
-static int exit_on_match; /* Exit on first match. */
+static bool done_on_match; /* Stop scanning file on first match. */
+static bool exit_on_match; /* Exit on first match. */
#include "dosbuf.c"
@@ -768,15 +770,15 @@ print_offset (uintmax_t pos, int min_width, const char *color)
/* Print a whole line head (filename, line, byte). */
static void
-print_line_head (char const *beg, char const *lim, int sep)
+print_line_head (char const *beg, char const *lim, char sep)
{
- int pending_sep = 0;
+ bool pending_sep = false;
if (out_file)
{
print_filename ();
if (filename_mask)
- pending_sep = 1;
+ pending_sep = true;
else
fputc (0, stdout);
}
@@ -792,7 +794,7 @@ print_line_head (char const *beg, char const *lim, int sep)
if (pending_sep)
print_sep (sep);
print_offset (totalnl, 4, line_num_color);
- pending_sep = 1;
+ pending_sep = true;
}
if (out_byte)
@@ -802,7 +804,7 @@ print_line_head (char const *beg, char const *lim, int sep)
if (pending_sep)
print_sep (sep);
print_offset (pos, 6, byte_num_color);
- pending_sep = 1;
+ pending_sep = true;
}
if (pending_sep)
@@ -903,9 +905,9 @@ print_line_tail (const char *beg, const char *lim, const char *line_color)
}
static void
-prline (char const *beg, char const *lim, int sep)
+prline (char const *beg, char const *lim, char sep)
{
- int matching;
+ bool matching;
const char *line_color;
const char *match_color;
@@ -945,7 +947,7 @@ prline (char const *beg, char const *lim, int sep)
if (ferror (stdout))
{
- write_error_seen = 1;
+ write_error_seen = true;
error (EXIT_TROUBLE, 0, _("write error"));
}
@@ -1098,12 +1100,14 @@ static intmax_t
grep (int fd, struct stat const *st)
{
intmax_t nlines, i;
- int not_text;
+ bool not_text;
size_t residue, save;
char oldc;
char *beg;
char *lim;
char eol = eolbyte;
+ bool done_on_match_0 = done_on_match;
+ bool out_quiet_0 = out_quiet;
if (! reset (fd, st))
return 0;
@@ -1130,8 +1134,8 @@ grep (int fd, struct stat const *st)
&& file_is_binary (bufbeg, buflim - bufbeg, fd, st));
if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
return 0;
- done_on_match += not_text;
- out_quiet += not_text;
+ done_on_match |= not_text;
+ out_quiet |= not_text;
for (;;)
{
@@ -1208,17 +1212,18 @@ grep (int fd, struct stat const *st)
}
finish_grep:
- done_on_match -= not_text;
- out_quiet -= not_text;
+ done_on_match = done_on_match_0;
+ out_quiet = out_quiet_0;
if ((not_text & ~out_quiet) && nlines != 0)
printf (_("Binary file %s matches\n"), filename);
return nlines;
}
-static int
-grepdirent (FTS *fts, FTSENT *ent, int command_line)
+static bool
+grepdirent (FTS *fts, FTSENT *ent, bool command_line)
{
- int follow, dirdesc;
+ bool follow;
+ int dirdesc;
struct stat *st = ent->fts_statp;
command_line &= ent->fts_level == FTS_ROOTLEVEL;
@@ -1226,7 +1231,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
{
if (directories == RECURSE_DIRECTORIES && command_line)
out_file &= ~ (2 * !no_filenames);
- return 1;
+ return true;
}
if (skipped_file (ent->fts_name, command_line,
@@ -1234,7 +1239,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
|| ent->fts_info == FTS_DNR)))
{
fts_set (fts, ent, FTS_SKIP);
- return 1;
+ return true;
}
filename = ent->fts_path + filename_prefix_len;
@@ -1247,7 +1252,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
if (directories == RECURSE_DIRECTORIES)
{
out_file |= 2 * !no_filenames;
- return 1;
+ return true;
}
fts_set (fts, ent, FTS_SKIP);
break;
@@ -1256,13 +1261,13 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
if (!suppress_errors)
error (0, 0, _("warning: %s: %s"), filename,
_("recursive directory loop"));
- return 1;
+ return true;
case FTS_DNR:
case FTS_ERR:
case FTS_NS:
suppressible_error (filename, ent->fts_errno);
- return 1;
+ return true;
case FTS_DEFAULT:
case FTS_NSOK:
@@ -1279,12 +1284,12 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
if (fstatat (fts->fts_cwd_fd, ent->fts_accpath, &st1, flag) != 0)
{
suppressible_error (filename, errno);
- return 1;
+ return true;
}
st = &st1;
}
if (is_device_mode (st->st_mode))
- return 1;
+ return true;
}
break;
@@ -1294,7 +1299,7 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
case FTS_SL:
case FTS_W:
- return 1;
+ return true;
default:
abort ();
@@ -1306,24 +1311,24 @@ grepdirent (FTS *fts, FTSENT *ent, int command_line)
return grepfile (dirdesc, ent->fts_accpath, follow, command_line);
}
-static int
-grepfile (int dirdesc, char const *name, int follow, int command_line)
+static bool
+grepfile (int dirdesc, char const *name, bool follow, bool command_line)
{
int desc = openat_safer (dirdesc, name, O_RDONLY | (follow ? 0 :
O_NOFOLLOW));
if (desc < 0)
{
if (follow || (errno != ELOOP && errno != EMLINK))
suppressible_error (filename, errno);
- return 1;
+ return true;
}
return grepdesc (desc, command_line);
}
-static int
-grepdesc (int desc, int command_line)
+static bool
+grepdesc (int desc, bool command_line)
{
intmax_t count;
- int status = 1;
+ bool status = true;
struct stat st;
/* Get the file status, possibly for the second time. This catches
@@ -1339,7 +1344,7 @@ grepdesc (int desc, int command_line)
}
if (desc != STDIN_FILENO && command_line
- && skipped_file (filename, 1, S_ISDIR (st.st_mode)))
+ && skipped_file (filename, true, S_ISDIR (st.st_mode) != 0))
goto closeout;
if (desc != STDIN_FILENO
@@ -1404,7 +1409,7 @@ grepdesc (int desc, int command_line)
{
if (! suppress_errors)
error (0, 0, _("input file %s is also the output"), quote (filename));
- errseen = 1;
+ errseen = true;
goto closeout;
}
@@ -1456,18 +1461,18 @@ grepdesc (int desc, int command_line)
return status;
}
-static int
+static bool
grep_command_line_arg (char const *arg)
{
if (STREQ (arg, "-"))
{
filename = label ? label : _("(standard input)");
- return grepdesc (STDIN_FILENO, 1);
+ return grepdesc (STDIN_FILENO, true);
}
else
{
filename = arg;
- return grepfile (AT_FDCWD, arg, 1, 1);
+ return grepfile (AT_FDCWD, arg, true, true);
}
}
@@ -1721,14 +1726,15 @@ static int
get_nondigit_option (int argc, char *const *argv, intmax_t *default_context)
{
static int prev_digit_optind = -1;
- int this_digit_optind, was_digit;
+ int this_digit_optind;
+ bool was_digit;
char buf[INT_BUFSIZE_BOUND (intmax_t) + 4];
char *p = buf;
int opt;
- was_digit = 0;
+ was_digit = false;
this_digit_optind = optind;
- while (1)
+ while (true)
{
opt = getopt_long (argc, (char **) argv, short_options,
long_options, NULL);
@@ -1758,7 +1764,7 @@ get_nondigit_option (int argc, char *const *argv, intmax_t *default_context)
}
*p++ = opt;
- was_digit = 1;
+ was_digit = true;
prev_digit_optind = this_digit_optind;
this_digit_optind = optind;
}
@@ -1890,9 +1896,9 @@ main (int argc, char **argv)
{
char *keys;
size_t keycc, oldcc, keyalloc;
- int with_filenames;
+ bool with_filenames, ok;
size_t cc;
- int opt, status, prepended;
+ int opt, prepended;
int prev_optind, last_recursive;
int fread_errno;
intmax_t default_context;
@@ -1904,7 +1910,7 @@ main (int argc, char **argv)
keys = NULL;
keycc = 0;
- with_filenames = 0;
+ with_filenames = false;
eolbyte = '\n';
filename_mask = ~0;
@@ -1915,7 +1921,7 @@ main (int argc, char **argv)
/* Default before/after context: changed by -C/-NUM options */
default_context = -1;
/* Changed by -o option */
- only_matching = 0;
+ only_matching = false;
/* Internationalization. */
#if defined HAVE_SETLOCALE
@@ -1987,8 +1993,8 @@ main (int argc, char **argv)
break;
case 'H':
- with_filenames = 1;
- no_filenames = 0;
+ with_filenames = true;
+ no_filenames = false;
break;
case 'I':
@@ -1996,7 +2002,7 @@ main (int argc, char **argv)
break;
case 'T':
- align_tabs = 1;
+ align_tabs = true;
break;
case 'U':
@@ -2008,7 +2014,7 @@ main (int argc, char **argv)
break;
case 'V':
- show_version = 1;
+ show_version = true;
break;
case 'a':
@@ -2016,11 +2022,11 @@ main (int argc, char **argv)
break;
case 'b':
- out_byte = 1;
+ out_byte = true;
break;
case 'c':
- count_matches = 1;
+ count_matches = true;
break;
case 'd':
@@ -2063,13 +2069,13 @@ main (int argc, char **argv)
break;
case 'h':
- with_filenames = 0;
- no_filenames = 1;
+ with_filenames = false;
+ no_filenames = true;
break;
case 'i':
case 'y': /* For old-timers . . . */
- match_icase = 1;
+ match_icase = true;
break;
case 'L':
@@ -2095,15 +2101,15 @@ main (int argc, char **argv)
break;
case 'n':
- out_line = 1;
+ out_line = true;
break;
case 'o':
- only_matching = 1;
+ only_matching = true;
break;
case 'q':
- exit_on_match = 1;
+ exit_on_match = true;
exit_failure = 0;
break;
@@ -2116,7 +2122,7 @@ main (int argc, char **argv)
break;
case 's':
- suppress_errors = 1;
+ suppress_errors = true;
break;
case 'v':
@@ -2124,11 +2130,11 @@ main (int argc, char **argv)
break;
case 'w':
- match_words = 1;
+ match_words = true;
break;
case 'x':
- match_lines = 1;
+ match_lines = true;
break;
case 'Z':
@@ -2199,7 +2205,7 @@ main (int argc, char **argv)
break;
case LINE_BUFFERED_OPTION:
- line_buffered = 1;
+ line_buffered = true;
break;
case LABEL_OPTION:
@@ -2226,8 +2232,8 @@ main (int argc, char **argv)
list_files = 0;
if (exit_on_match | list_files)
{
- count_matches = 0;
- done_on_match = 1;
+ count_matches = false;
+ done_on_match = true;
}
out_quiet = count_matches | done_on_match;
@@ -2267,7 +2273,7 @@ main (int argc, char **argv)
{
/* No keys were specified (e.g. -f /dev/null). Match nothing. */
out_invert ^= true;
- match_lines = match_words = 0;
+ match_lines = match_words = false;
}
else
/* Strip trailing newline. */
@@ -2323,21 +2329,21 @@ main (int argc, char **argv)
if (optind < argc)
{
- status = 1;
+ ok = true;
do
- status &= grep_command_line_arg (argv[optind]);
+ ok &= grep_command_line_arg (argv[optind]);
while (++optind < argc);
}
else if (directories == RECURSE_DIRECTORIES && prepended < last_recursive)
{
/* Grep through ".", omitting leading "./" from diagnostics. */
filename_prefix_len = 2;
- status = grep_command_line_arg (".");
+ ok = grep_command_line_arg (".");
}
else
- status = grep_command_line_arg ("-");
+ ok = grep_command_line_arg ("-");
/* We register via atexit() to test stdout. */
- exit (errseen ? EXIT_TROUBLE : status);
+ exit (errseen ? EXIT_TROUBLE : ok);
}
/* vim:set shiftwidth=2: */
diff --git a/src/grep.h b/src/grep.h
index 4935872..5496eb2 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -20,11 +20,13 @@
#ifndef GREP_GREP_H
#define GREP_GREP_H 1
+#include <stdbool.h>
+
/* The following flags are exported from grep for the matchers
to look at. */
-extern int match_icase; /* -i */
-extern int match_words; /* -w */
-extern int match_lines; /* -x */
+extern bool match_icase; /* -i */
+extern bool match_words; /* -w */
+extern bool match_lines; /* -x */
extern unsigned char eolbyte; /* -z */
#endif
--
1.9.3
From f784a73a01b823109d660aa8d256535623e98971 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 13:49:18 -0700
Subject: [PROPOSED PATCH 4/6] grep: treat a file as binary if its prefix
contains encoding errors
* NEWS:
* doc/grep.texi (File and Directory Selection):
Document this.
* src/grep.c (buffer_encoding, buffer_textbin): New functions.
(file_textbin): Rename from file_is_binary. Now returns 3-way value.
All callers changed.
(file_textbin, grep): Check the input more carefully for text vs
binary data.
(contains_encoding_error): Remove; use replaced by buffer_encoding.
* tests/backref-multibyte-slow:
* tests/high-bit-range:
* tests/invalid-multibyte-infloop:
Use -a, since the input is now considered to be binary.
* tests/invalid-multibyte-infloop: Add a check for new behavior.
---
NEWS | 4 ++
doc/grep.texi | 3 +-
src/grep.c | 126 +++++++++++++++++++++++++++-------------
tests/backref-multibyte-slow | 2 +-
tests/high-bit-range | 2 +-
tests/invalid-multibyte-infloop | 14 ++++-
6 files changed, 106 insertions(+), 45 deletions(-)
diff --git a/NEWS b/NEWS
index 36bb48f..9377d7d 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,10 @@ GNU grep NEWS -*- outline -*-
Performance has improved for very long strings in patterns.
+ If a file contains data improperly encoded for the current locale,
+ and this is discovered before any of the file's contents are output,
+ grep now treats the file as binary.
+
grep -P no longer reports an error and exits when given invalid UTF-8 data.
Instead, it considers the data to be non-matching.
diff --git a/doc/grep.texi b/doc/grep.texi
index c8e4acd..14bd69e 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -592,7 +592,8 @@ this is equivalent to the @samp{--binary-files=text} option.
@item --binary-files=@var{type}
@opindex --binary-files
@cindex binary files
-If a file's allocation metadata or its first few bytes
+If a file's allocation metadata,
+or its data read before a line is selected for output,
indicate that the file contains binary data,
assume that the file is of type @var{type}.
By default, @var{type} is @samp{binary},
diff --git a/src/grep.c b/src/grep.c
index 1e0cc6d..ccba1b6 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -437,50 +437,74 @@ clean_up_stdout (void)
close_stdout ();
}
-/* Return true if a file is known to be binary for the purpose of 'grep'.
+/* Return 1 if BUF (of size SIZE) contains text, -1 if it contains
+ binary data, and 0 if the answer depends on what comes immediately
+ after BUF. */
+static int
+buffer_textbin (char const *buf, size_t size)
+{
+ mbstate_t mbs = { 0 };
+ size_t charlen;
+ char badbyte = eolbyte ? '\0' : '\200';
+ char const *p;
+
+ for (p = buf; p < buf + size; p += charlen)
+ {
+ if (*p == badbyte)
+ return -1;
+ charlen = mbrlen (p, buf + size - p, &mbs);
+ if ((size_t) -2 <= charlen)
+ return charlen == (size_t) -2 ? 0 : -1;
+ charlen += !charlen;
+ }
+
+ return 1;
+}
+
+/* Return 1 if a file is known to be text for the purpose of 'grep'.
+ Return -1 if it is known to be binary, 0 if unknown.
BUF, of size BUFSIZE, is the initial buffer read from the file with
descriptor FD and status ST. */
-static bool
-file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+static int
+file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st)
{
#ifndef SEEK_HOLE
enum { SEEK_HOLE = SEEK_END };
#endif
- /* If -z, test only whether the initial buffer contains '\200';
- knowing about holes won't help. */
- if (! eolbyte)
- return memchr (buf, '\200', bufsize) != 0;
+ int textbin = buffer_textbin (buf, bufsize);
+ if (textbin < 0)
+ return textbin;
- /* If the initial buffer contains a null byte, guess that the file
- is binary. */
- if (memchr (buf, '\0', bufsize))
- return true;
-
- /* If the file has holes, it must contain a null byte somewhere. */
- if (SEEK_HOLE != SEEK_END && usable_st_size (st))
+ if (usable_st_size (st))
{
- off_t cur = bufsize;
- if (O_BINARY || fd == STDIN_FILENO)
- {
- cur = lseek (fd, 0, SEEK_CUR);
- if (cur < 0)
- return false;
- }
+ if (st->st_size <= bufsize)
+ return 2 * textbin - 1;
- /* Look for a hole after the current location. */
- off_t hole_start = lseek (fd, cur, SEEK_HOLE);
- if (0 <= hole_start)
+ /* If the file has holes, it must contain a null byte somewhere. */
+ if (SEEK_HOLE != SEEK_END && eolbyte)
{
- if (lseek (fd, cur, SEEK_SET) < 0)
- suppressible_error (filename, errno);
- if (hole_start < st->st_size)
- return true;
+ off_t cur = bufsize;
+ if (O_BINARY || fd == STDIN_FILENO)
+ {
+ cur = lseek (fd, 0, SEEK_CUR);
+ if (cur < 0)
+ return 0;
+ }
+
+ /* Look for a hole after the current location. */
+ off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+ if (0 <= hole_start)
+ {
+ if (lseek (fd, cur, SEEK_SET) < 0)
+ suppressible_error (filename, errno);
+ if (hole_start < st->st_size)
+ return -1;
+ }
}
}
- /* Guess that the file does not contain binary data. */
- return false;
+ return 0;
}
/* Convert STR to a nonnegative integer, storing the result in *OUT.
@@ -1100,7 +1124,7 @@ static intmax_t
grep (int fd, struct stat const *st)
{
intmax_t nlines, i;
- bool not_text;
+ int textbin;
size_t residue, save;
char oldc;
char *beg;
@@ -1129,13 +1153,18 @@ grep (int fd, struct stat const *st)
return 0;
}
- not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
- || binary_files == WITHOUT_MATCH_BINARY_FILES)
- && file_is_binary (bufbeg, buflim - bufbeg, fd, st));
- if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
- return 0;
- done_on_match |= not_text;
- out_quiet |= not_text;
+ if (binary_files == TEXT_BINARY_FILES)
+ textbin = 1;
+ else
+ {
+ textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st);
+ if (textbin < 0)
+ {
+ if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+ return 0;
+ done_on_match = out_quiet = true;
+ }
+ }
for (;;)
{
@@ -1187,8 +1216,13 @@ grep (int fd, struct stat const *st)
}
/* Detect whether leading context is adjacent to previous output. */
- if (beg != lastout)
- lastout = 0;
+ if (lastout)
+ {
+ if (!textbin)
+ textbin = 1;
+ if (beg != lastout)
+ lastout = 0;
+ }
/* Handle some details and read more data to scan. */
save = residue + lim - beg;
@@ -1201,6 +1235,16 @@ grep (int fd, struct stat const *st)
suppressible_error (filename, errno);
goto finish_grep;
}
+
+ /* If the file's textbin has not been determined yet, assume
+ it's binary if the next input buffer suggests so. */
+ if (! textbin && buffer_textbin (bufbeg, buflim - bufbeg) < 0)
+ {
+ textbin = -1;
+ if (binary_files == WITHOUT_MATCH_BINARY_FILES)
+ return 0;
+ done_on_match = out_quiet = true;
+ }
}
if (residue)
{
@@ -1214,7 +1258,7 @@ grep (int fd, struct stat const *st)
finish_grep:
done_on_match = done_on_match_0;
out_quiet = out_quiet_0;
- if ((not_text & ~out_quiet) && nlines != 0)
+ if (textbin < 0 && !out_quiet && nlines != 0)
printf (_("Binary file %s matches\n"), filename);
return nlines;
}
diff --git a/tests/backref-multibyte-slow b/tests/backref-multibyte-slow
index ffebb6b..d447a4a 100755
--- a/tests/backref-multibyte-slow
+++ b/tests/backref-multibyte-slow
@@ -21,7 +21,7 @@ max_seconds=$(LC_ALL=C perl -le 'use Time::HiRes qw(time); my
$s = time();
for LOC in en_US.UTF-8; do
out=out-$LOC
- LC_ALL=$LOC timeout ${max_seconds}s grep -E '^([a-z]).\1$' in > $out 2>&1
+ LC_ALL=$LOC timeout ${max_seconds}s grep -aE '^([a-z]).\1$' in > $out 2>&1
test $? = 0 || fail=1
compare $out in || fail=1
done
diff --git a/tests/high-bit-range b/tests/high-bit-range
index 74b6e65..76c3310 100755
--- a/tests/high-bit-range
+++ b/tests/high-bit-range
@@ -21,7 +21,7 @@
fail=0
printf '\201\n' > in || framework_failure_
-grep "$(printf '[\201]')" in > out || fail=1
+grep -a "$(printf '[\201]')" in > out || fail=1
compare out in || fail=1
diff --git a/tests/invalid-multibyte-infloop b/tests/invalid-multibyte-infloop
index b28bc53..d7c6165 100755
--- a/tests/invalid-multibyte-infloop
+++ b/tests/invalid-multibyte-infloop
@@ -14,7 +14,7 @@ encode AA > input
fail=0
# Before 2.15, this would infloop.
-LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out
+LC_ALL=en_US.UTF-8 timeout 3 grep -aF $(encode A) input > out
status=$?
if test $status -eq 0; then
compare input out
@@ -24,4 +24,16 @@ else
test $status -eq 2
fi || fail=1
+echo 'Binary file input matches' >binary-file-matches
+
+LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out
+status=$?
+if test $status -eq 0; then
+ compare binary-file-matches out
+elif test $status -eq 1; then
+ compare_dev_null_ /dev/null out
+else
+ test $status -eq 2
+fi || fail=1
+
Exit $fail
--
1.9.3
From cc87d585025a2ff310b6c55097480d6e953557bd Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 14:52:56 -0700
Subject: [PROPOSED PATCH 5/6] grep: improve performance for older glibc
glibc has a bug where mbrlen and mbrtowc mishandle length-0 inputs.
Working around it in gnulib slows grep down, so disable the tests for it
and make sure grep works even if the bug is present.
* bootstrap.conf (avoided_gnulib_modules): Add mbrtowc-tests.
* configure.ac (gl_cv_func_mbrtowc_empty_input): Assume yes.
* src/searchutils.c (mb_next_wc): Don't invoke mbrtowc on empty input.
---
bootstrap.conf | 1 +
configure.ac | 5 +++++
src/searchutils.c | 3 ++-
3 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/bootstrap.conf b/bootstrap.conf
index d8171f5..50c0aab 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -17,6 +17,7 @@
avoided_gnulib_modules='
--avoid=lock-tests
+ --avoid=mbrtowc-tests
'
# gnulib modules used by this package.
diff --git a/configure.ac b/configure.ac
index 3315855..4d069b8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -83,6 +83,11 @@ AC_PROG_CC
gl_EARLY
AC_PROG_RANLIB
+# grep never invokes mbrtowc or mbrlen on empty input,
+# so don't worry about this common bug,
+# as working around it would merely slow grep down.
+gl_cv_func_mbrtowc_empty_input='assume yes'
+
dnl Checks for typedefs, structures, and compiler characteristics.
AC_TYPE_SIZE_T
AC_C_CONST
diff --git a/src/searchutils.c b/src/searchutils.c
index 5eb9a12..18dd584 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -285,5 +285,6 @@ mb_next_wc (char const *cur, char const *end)
{
wchar_t wc;
mbstate_t mbs = { 0 };
- return mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 ? wc : WEOF;
+ return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2
+ ? wc : WEOF);
}
--
1.9.3
From 04c6552a279c7a8d565aae9a7ffea0b751689052 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Sun, 14 Sep 2014 17:37:58 -0700
Subject: [PROPOSED PATCH 6/6] grep: use mbclen cache more effectively
* src/grep.c (buffer_textbin, contains_encoding_error):
Use mb_clen for speed.
(buffer_textbin): Bypass mb_clen in unibyte locales.
(main): Always initialize the cache, since it's sometimes used in
unibyte locales now. Initialize it before contains_encoding_error
might be called.
* src/search.h (SEARCH_INLINE): New macro.
(mbclen_cache): Now extern decl.
(mb_clen): New inline function.
* src/searchutils.c (SEARCH_INLINE, SYSTEM_INLINE): Define.
(mbclen_cache): Now extern.
(build_mbclen_cache): Put 1 into the cache when mbrlen returns 0.
(mb_goback): Use mb_clen for speed, and rely on it returning nonzero.
* src/system.h (SYSTEM_INLINE): New macro.
(to_uchar): Use it.
---
src/grep.c | 38 +++++++++++++++++++++-----------------
src/search.h | 19 +++++++++++++++++++
src/searchutils.c | 26 ++++++++++++++------------
src/system.h | 9 ++++++++-
4 files changed, 62 insertions(+), 30 deletions(-)
diff --git a/src/grep.c b/src/grep.c
index ccba1b6..72a811e 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -443,22 +443,27 @@ clean_up_stdout (void)
static int
buffer_textbin (char const *buf, size_t size)
{
- mbstate_t mbs = { 0 };
- size_t charlen;
char badbyte = eolbyte ? '\0' : '\200';
- char const *p;
- for (p = buf; p < buf + size; p += charlen)
+ if (MB_CUR_MAX <= 1)
+ return memchr (buf, badbyte, size) ? -1 : 1;
+ else
{
- if (*p == badbyte)
- return -1;
- charlen = mbrlen (p, buf + size - p, &mbs);
- if ((size_t) -2 <= charlen)
- return charlen == (size_t) -2 ? 0 : -1;
- charlen += !charlen;
- }
+ mbstate_t mbs = { 0 };
+ size_t clen;
+ char const *p;
- return 1;
+ for (p = buf; p < buf + size; p += clen)
+ {
+ if (*p == badbyte)
+ return -1;
+ clen = mb_clen (p, buf + size - p, &mbs);
+ if ((size_t) -2 <= clen)
+ return clen == (size_t) -2 ? 0 : -1;
+ }
+
+ return 1;
+ }
}
/* Return 1 if a file is known to be text for the purpose of 'grep'.
@@ -1887,9 +1892,9 @@ contains_encoding_error (char const *pat, size_t patlen)
mbstate_t mbs = { 0 };
size_t i, charlen;
- for (i = 0; i < patlen; i += charlen + (charlen == 0))
+ for (i = 0; i < patlen; i += charlen)
{
- charlen = mbrlen (pat + i, patlen - i, &mbs);
+ charlen = mb_clen (pat + i, patlen - i, &mbs);
if ((size_t) -2 <= charlen)
return true;
}
@@ -2332,6 +2337,8 @@ main (int argc, char **argv)
else
usage (EXIT_TROUBLE);
+ build_mbclen_cache ();
+
/* If fgrep in a multibyte locale, then use grep if either
(1) case is ignored (where grep is typically faster), or
(2) the pattern has an encoding error (where fgrep might not work). */
@@ -2349,9 +2356,6 @@ main (int argc, char **argv)
execute = EGexecute;
}
- if (MB_CUR_MAX > 1)
- build_mbclen_cache ();
-
compile (keys, keycc);
free (keys);
diff --git a/src/search.h b/src/search.h
index 14877bc..3f10a47 100644
--- a/src/search.h
+++ b/src/search.h
@@ -34,6 +34,11 @@
#include "kwset.h"
#include "xalloc.h"
+_GL_INLINE_HEADER_BEGIN
+#ifndef SEARCH_INLINE
+# define SEARCH_INLINE _GL_INLINE
+#endif
+
/* This must be a signed type. Each value is the difference in the size
of a character (in bytes) induced by converting to lower case.
The vast majority of values are 0, but a few are 1 or -1, so
@@ -45,6 +50,7 @@ extern void kwsinit (kwset_t *);
extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
extern void build_mbclen_cache (void);
+extern size_t mbclen_cache[];
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
extern wint_t mb_prev_wc (char const *, char const *, char const *);
extern wint_t mb_next_wc (char const *, char const *);
@@ -61,4 +67,17 @@ extern size_t Fexecute (char const *, size_t, size_t *, char
const *);
extern void Pcompile (char const *, size_t);
extern size_t Pexecute (char const *, size_t, size_t *, char const *);
+/* Return the number of bytes in the character at the start of S, which
+ is of size N. N must be positive. MBS is the conversion state.
+ This acts like mbrlen, except it returns 1 when mbrlen would return 0,
+ and it is typically faster because of the cache. */
+SEARCH_INLINE size_t
+mb_clen (char const *s, size_t n, mbstate_t *mbs)
+{
+ size_t len = mbclen_cache[to_uchar (*s)];
+ return len == (size_t) -2 ? mbrlen (s, n, mbs) : len;
+}
+
+_GL_INLINE_HEADER_END
+
#endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
index 18dd584..9edc785 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -17,12 +17,16 @@
02110-1301, USA. */
#include <config.h>
-#include <assert.h>
+
+#define SEARCH_INLINE _GL_EXTERN_INLINE
+#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
+#include <assert.h>
+
#define NCHAR (UCHAR_MAX + 1)
-static size_t mbclen_cache[NCHAR];
+size_t mbclen_cache[NCHAR];
void
kwsinit (kwset_t *kwset)
@@ -218,7 +222,8 @@ build_mbclen_cache (void)
char c = i;
unsigned char uc = i;
mbstate_t mbs = { 0 };
- mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+ size_t len = mbrlen (&c, 1, &mbs);
+ mbclen_cache[uc] = len ? len : 1;
}
}
@@ -244,20 +249,17 @@ mb_goback (char const **mb_start, char const *cur, char
const *end)
while (p < cur)
{
- size_t mbclen = mbclen_cache[to_uchar (*p)];
-
- if (mbclen == (size_t) -2)
- mbclen = mbrlen (p, end - p, &cur_state);
+ size_t clen = mb_clen (p, end - p, &cur_state);
- if (! (0 < mbclen && mbclen < (size_t) -2))
+ if ((size_t) -2 <= clen)
{
- /* An invalid sequence, or a truncated multibyte character, or
- a null wide character. Treat it as a single byte character. */
- mbclen = 1;
+ /* An invalid sequence, or a truncated multibyte character.
+ Treat it as a single byte character. */
+ clen = 1;
memset (&cur_state, 0, sizeof cur_state);
}
p0 = p;
- p += mbclen;
+ p += clen;
}
*mb_start = p;
diff --git a/src/system.h b/src/system.h
index 7da1d8d..bac2623 100644
--- a/src/system.h
+++ b/src/system.h
@@ -49,15 +49,22 @@ enum { EXIT_TROUBLE = 2 };
#include "unlocked-io.h"
+_GL_INLINE_HEADER_BEGIN
+#ifndef SYSTEM_INLINE
+# define SYSTEM_INLINE _GL_INLINE
+#endif
+
#define STREQ(a, b) (strcmp (a, b) == 0)
/* Convert a possibly-signed character to an unsigned character. This is
a bit safer than casting to unsigned char, since it catches some type
errors that the cast doesn't. */
-static inline unsigned char
+SYSTEM_INLINE unsigned char
to_uchar (char ch)
{
return ch;
}
+_GL_INLINE_HEADER_END
+
#endif
--
1.9.3