The root cause of this bug is the use of mbtowc in 64-egf-speedup.patch
and 67-w.patch. These patches try to use mbtowc to look at the
character before and after the match to check if the match is a whole
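word. But when a binary file is being grepped, mbtowc gets passed
arbitrary bytes rather than a valid UTF-8 character. As a consequence, its
hidden internal conversion state gets corrupted, and you get nonsense for
the following matches. The surrounding code already clears mbs with memset
when the conversion fails, but that has no effect on mbtowc, which keeps
its own state and never looks at mbs. The fix is to use mbrtowc, which
takes the state explicitly (&mbs), so the existing reset actually takes
effect. A patch is attached.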
65-dfa-optional.patch is a red herring. I guess that patch just exposes
the bug because it causes grep to use a different code path. But you
get the same bug with grep -F, which is not touched by that patch.
--
Peter De Wachter
--- a/build-tree/grep-2.5.3/src/search.c
+++ b/build-tree/grep-2.5.3/src/search.c
@@ -502,7 +502,7 @@
}
else
s = last_char;
- mr = mbtowc (&pwc, s, match - s);
+ mr = mbrtowc (&pwc, s, match - s, &mbs);
if (mr <= 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
@@ -531,8 +531,8 @@
wchar_t nwc;
int mr;
- mr = mbtowc (&nwc, buf + start + len,
- end - buf - start - len - 1);
+ mr = mbrtowc (&nwc, buf + start + len,
+ end - buf - start - len - 1, &mbs);
if (mr <= 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));
@@ -941,7 +941,7 @@
}
else
s = last_char;
- mr = mbtowc (&pwc, s, beg - s);
+ mr = mbrtowc (&pwc, s, beg - s, &mbs);
if (mr <= 0)
memset (&mbs, '\0', sizeof (mbstate_t));
else if ((iswalnum (pwc) || pwc == L'_')
@@ -959,7 +959,7 @@
wchar_t nwc;
int mr;
- mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
+ mr = mbrtowc (&nwc, beg + len, buf + size - beg - len, &mbs);
if (mr <= 0)
{
memset (&mbs, '\0', sizeof (mbstate_t));