Hi NRK, Thanks it looks good, the patches are now pushing into the dmenu repo.
I pushed a very small style patch for the recursion check also. While reviewing I noticed: in the future we might want to change the check for ellipsis_width and invalid_width. I think there might be issues with the caching if we switch the fontset/fonts (but that would also currently be an issue). After some time of crowd-testing I will sync the drw.{c,h} changes to libsl and dwm. On Fri, Jul 05, 2024 at 02:40:45AM +0000, NRK wrote: > This is a follow up to a discussion from the dev list: > https://lists.suckless.org/dev/2407/35646.html > > I've separated the patch into two halves. > > 1. Replaces the current utf8decoder with a shorter and more direct one. > It reports back errors and how much to advance on error (needed for the > next patch) > 2. Detects invalid utf8 bytes and renders them as U+FFFD instead of > passing invalid bytes to xft which cuts it off. > > If keeping the current utf8 decoder is important, I can look into how to > change it so that it can return error information back to the caller. > But IMHO the current decoder is unnecessarily overcomplicated for > libsl's need. > > The interaction between invalid utf8 sequence and ellipsis is a bit > finicky and could use some extra eyes. But so far it has worked well in > my testing. > > - NRK > From 7208684792d36efe262d2272f84e7d6e729fd3e7 Mon Sep 17 00:00:00 2001 > From: NRK <n...@disroot.org> > Date: Thu, 4 Jul 2024 21:25:37 +0000 > Subject: [PATCH 1/2] overhaul utf8decode() > > this changes the utf8decode function to: > > * report when an error occurs > * report how many bytes to advance on error > > these will be useful in the next commit to render invalid utf8 > sequences. > > the new implementation is also shorter and more direct. > --- > drw.c | 76 ++++++++++++++++++++++++----------------------------------- > 1 file changed, 31 insertions(+), 45 deletions(-) > > diff --git a/drw.c b/drw.c > index 78a2b27..eb71da7 100644 > --- a/drw.c > +++ b/drw.c > @@ -9,54 +9,40 @@ > #include "util.h" > > #define UTF_INVALID 0xFFFD > -#define UTF_SIZ 4 > > -static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, > 0xF0}; > -static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, > 0xF8}; > -static const long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, > 0x10000}; > -static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, > 0x10FFFF}; > - > -static long > -utf8decodebyte(const char c, size_t *i) > -{ > - for (*i = 0; *i < (UTF_SIZ + 1); ++(*i)) > - if (((unsigned char)c & utfmask[*i]) == utfbyte[*i]) > - return (unsigned char)c & ~utfmask[*i]; > - return 0; > -} > - > -static size_t > -utf8validate(long *u, size_t i) > +static int > +utf8decode(const char *s_in, long *u, int *err) > { > - if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) > - *u = UTF_INVALID; > - for (i = 1; *u > utfmax[i]; ++i) > - ; > - return i; > -} > - > -static size_t > -utf8decode(const char *c, long *u, size_t clen) > -{ > - size_t i, j, len, type; > - long udecoded; > - > + static const unsigned char lens[] = { > + /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > + /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0, /* invalid */ > + /* 110XX */ 2, 2, 2, 2, > + /* 1110X */ 3, 3, > + /* 11110 */ 4, > + /* 11111 */ 0, /* invalid */ > + }; > + static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 }; > + static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 }; > + > + const unsigned char *s = (const unsigned char *)s_in; > + int len = lens[*s >> 3]; > *u = UTF_INVALID; > - if (!clen) > - return 0; > - udecoded = utf8decodebyte(c[0], &len); > - if (!BETWEEN(len, 1, UTF_SIZ)) > + *err = 1; > + if (len == 0) > return 1; > - for (i = 1, j = 1; i < clen && j < len; ++i, ++j) { > - udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type); > - if (type) > - return j; > + > + long cp = s[0] & leading_mask[len - 1]; > + for (int i = 1; i < len; ++i) { > + if (s[i] == '\0' || (s[i] & 0xC0) != 0x80) > + return i; > + cp = (cp << 6) | (s[i] & 0x3F); > } > - if (j < len) > - return 0; > - *u = udecoded; > - utf8validate(u, len); > + /* out of range, surrogate, overlong encoding */ > + if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1]) > + return len; > > + *err = 0; > + *u = cp; > return len; > } > > @@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned > int h, unsigned int lp > unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1; > XftDraw *d = NULL; > Fnt *usedfont, *curfont, *nextfont; > - int utf8strlen, utf8charlen, render = x || y || w || h; > + int utf8strlen, utf8charlen, utf8err, render = x || y || w || h; > long utf8codepoint = 0; > const char *utf8str; > FcCharSet *fccharset; > @@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, > unsigned int h, unsigned int lp > if (!ellipsis_width && render) > ellipsis_width = drw_fontset_getwidth(drw, "..."); > while (1) { > - ew = ellipsis_len = utf8strlen = 0; > + ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0; > utf8str = text; > nextfont = NULL; > while (*text) { > - utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ); > + utf8charlen = utf8decode(text, &utf8codepoint, > &utf8err); > for (curfont = drw->fonts; curfont; curfont = > curfont->next) { > charexists = charexists || > XftCharExists(drw->dpy, curfont->xfont, utf8codepoint); > if (charexists) { > -- > 2.42.0 > > From 396a21976f08b3c66e521855084685ce93356756 Mon Sep 17 00:00:00 2001 > From: NRK <n...@disroot.org> > Date: Thu, 4 Jul 2024 21:27:47 +0000 > Subject: [PATCH 2/2] render invalid utf8 sequences as U+FFFD > > previously drw_text would do the width calculations as if > invalid utf8 sequences were replaced with U+FFFD but would pass > the invalid utf8 sequence to xft to render where xft would just > cut it off at the first invalid byte. > > this change makes invalid utf8 render as U+FFFD and avoids > sending invalid sequences to xft. the following can be used to > check the behavior before and after the patch: > > $ printf "0\xef1234567\ntest" | dmenu > > Ref: https://lists.suckless.org/dev/2407/35646.html > --- > drw.c | 19 +++++++++++++++---- > 1 file changed, 15 insertions(+), 4 deletions(-) > > diff --git a/drw.c b/drw.c > index eb71da7..f151ae5 100644 > --- a/drw.c > +++ b/drw.c > @@ -237,7 +237,8 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned > int h, unsigned int lp > XftResult result; > int charexists = 0, overflow = 0; > /* keep track of a couple codepoints for which we have no match. */ > - static unsigned int nomatches[128], ellipsis_width; > + static unsigned int nomatches[128], ellipsis_width, invalid_width; > + static const char invalid[] = "�"; > > if (!drw || (render && (!drw->scheme || !w)) || !text || !drw->fonts) > return 0; > @@ -257,6 +258,10 @@ drw_text(Drw *drw, int x, int y, unsigned int w, > unsigned int h, unsigned int lp > usedfont = drw->fonts; > if (!ellipsis_width && render) > ellipsis_width = drw_fontset_getwidth(drw, "..."); > + if (!invalid_width) { > + invalid_width = -1; /* stop infinite recursion */ > + invalid_width = drw_fontset_getwidth(drw, invalid); > + } > while (1) { > ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0; > utf8str = text; > @@ -284,9 +289,9 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned > int h, unsigned int lp > else > utf8strlen = > ellipsis_len; > } else if (curfont == usedfont) { > - utf8strlen += utf8charlen; > text += utf8charlen; > - ew += tmpw; > + utf8strlen += utf8err ? 0 : > utf8charlen; > + ew += utf8err ? 0 : tmpw; > } else { > nextfont = curfont; > } > @@ -294,7 +299,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned > int h, unsigned int lp > } > } > > - if (overflow || !charexists || nextfont) > + if (overflow || !charexists || nextfont || utf8err) > break; > else > charexists = 0; > @@ -309,6 +314,12 @@ drw_text(Drw *drw, int x, int y, unsigned int w, > unsigned int h, unsigned int lp > x += ew; > w -= ew; > } > + if (utf8err && (!render || invalid_width < w)) { > + if (render) > + drw_text(drw, x, y, w, h, 0, invalid, invert); > + x += invalid_width; > + w -= invalid_width; > + } > if (render && overflow) > drw_text(drw, ellipsis_x, y, ellipsis_w, h, 0, "...", > invert); > > -- > 2.42.0 > -- Kind regards, Hiltjo