Re: [hackers] [PATCH][dmenu/libsl] render invalid utf8 bytes as U+FFFD

Hiltjo Posthuma Sun, 14 Jul 2024 02:48:10 -0700

Hi NRK,

Thanks it looks good, the patches are now pushing into the dmenu repo.


I pushed a very small style patch for the recursion check also.

While reviewing I noticed: in the future we might want to change the check for
ellipsis_width and invalid_width.  I think there might be issues with the
caching if we switch the fontset/fonts (but that would also currently be an
issue).

After some time of crowd-testing I will sync the drw.{c,h} changes to libsl and
dwm.

On Fri, Jul 05, 2024 at 02:40:45AM +0000, NRK wrote:
> This is a follow up to a discussion from the dev list:
> https://lists.suckless.org/dev/2407/35646.html
> 
> I've separated the patch into two halves.
> 
> 1. Replaces the current utf8decoder with a shorter and more direct one.
>    It reports back errors and how much to advance on error (needed for the
>    next patch)
> 2. Detects invalid utf8 bytes and renders them as U+FFFD instead of
>    passing invalid bytes to xft which cuts it off.
> 
> If keeping the current utf8 decoder is important, I can look into how to
> change it so that it can return error information back to the caller.
> But IMHO the current decoder is unnecessarily overcomplicated for
> libsl's need.
> 
> The interaction between invalid utf8 sequence and ellipsis is a bit
> finicky and could use some extra eyes. But so far it has worked well in
> my testing.
> 
> - NRK

> From 7208684792d36efe262d2272f84e7d6e729fd3e7 Mon Sep 17 00:00:00 2001
> From: NRK <[email protected]>
> Date: Thu, 4 Jul 2024 21:25:37 +0000
> Subject: [PATCH 1/2] overhaul utf8decode()
> 
> this changes the utf8decode function to:
> 
> * report when an error occurs
> * report how many bytes to advance on error
> 
> these will be useful in the next commit to render invalid utf8
> sequences.
> 
> the new implementation is also shorter and more direct.
> ---
>  drw.c | 76 ++++++++++++++++++++++++-----------------------------------
>  1 file changed, 31 insertions(+), 45 deletions(-)
> 
> diff --git a/drw.c b/drw.c
> index 78a2b27..eb71da7 100644
> --- a/drw.c
> +++ b/drw.c
> @@ -9,54 +9,40 @@
>  #include "util.h"
>  
>  #define UTF_INVALID 0xFFFD
> -#define UTF_SIZ     4
>  
> -static const unsigned char utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 
> 0xF0};
> -static const unsigned char utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 
> 0xF8};
> -static const long utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  
> 0x10000};
> -static const long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 
> 0x10FFFF};
> -
> -static long
> -utf8decodebyte(const char c, size_t *i)
> -{
> -     for (*i = 0; *i < (UTF_SIZ + 1); ++(*i))
> -             if (((unsigned char)c & utfmask[*i]) == utfbyte[*i])
> -                     return (unsigned char)c & ~utfmask[*i];
> -     return 0;
> -}
> -
> -static size_t
> -utf8validate(long *u, size_t i)
> +static int
> +utf8decode(const char *s_in, long *u, int *err)
>  {
> -     if (!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
> -             *u = UTF_INVALID;
> -     for (i = 1; *u > utfmax[i]; ++i)
> -             ;
> -     return i;
> -}
> -
> -static size_t
> -utf8decode(const char *c, long *u, size_t clen)
> -{
> -     size_t i, j, len, type;
> -     long udecoded;
> -
> +     static const unsigned char lens[] = {
> +             /* 0XXXX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
> +             /* 10XXX */ 0, 0, 0, 0, 0, 0, 0, 0,  /* invalid */
> +             /* 110XX */ 2, 2, 2, 2,
> +             /* 1110X */ 3, 3,
> +             /* 11110 */ 4,
> +             /* 11111 */ 0,  /* invalid */
> +     };
> +     static const unsigned char leading_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
> +     static const unsigned int overlong[] = { 0x0, 0x80, 0x0800, 0x10000 };
> +
> +     const unsigned char *s = (const unsigned char *)s_in;
> +     int len = lens[*s >> 3];
>       *u = UTF_INVALID;
> -     if (!clen)
> -             return 0;
> -     udecoded = utf8decodebyte(c[0], &len);
> -     if (!BETWEEN(len, 1, UTF_SIZ))
> +     *err = 1;
> +     if (len == 0)
>               return 1;
> -     for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
> -             udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
> -             if (type)
> -                     return j;
> +
> +     long cp = s[0] & leading_mask[len - 1];
> +     for (int i = 1; i < len; ++i) {
> +             if (s[i] == '\0' || (s[i] & 0xC0) != 0x80)
> +                     return i;
> +             cp = (cp << 6) | (s[i] & 0x3F);
>       }
> -     if (j < len)
> -             return 0;
> -     *u = udecoded;
> -     utf8validate(u, len);
> +     /* out of range, surrogate, overlong encoding */
> +     if (cp > 0x10FFFF || (cp >> 11) == 0x1B || cp < overlong[len - 1])
> +             return len;
>  
> +     *err = 0;
> +     *u = cp;
>       return len;
>  }
>  
> @@ -242,7 +228,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
> int h, unsigned int lp
>       unsigned int tmpw, ew, ellipsis_w = 0, ellipsis_len, hash, h0, h1;
>       XftDraw *d = NULL;
>       Fnt *usedfont, *curfont, *nextfont;
> -     int utf8strlen, utf8charlen, render = x || y || w || h;
> +     int utf8strlen, utf8charlen, utf8err, render = x || y || w || h;
>       long utf8codepoint = 0;
>       const char *utf8str;
>       FcCharSet *fccharset;
> @@ -272,11 +258,11 @@ drw_text(Drw *drw, int x, int y, unsigned int w, 
> unsigned int h, unsigned int lp
>       if (!ellipsis_width && render)
>               ellipsis_width = drw_fontset_getwidth(drw, "...");
>       while (1) {
> -             ew = ellipsis_len = utf8strlen = 0;
> +             ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
>               utf8str = text;
>               nextfont = NULL;
>               while (*text) {
> -                     utf8charlen = utf8decode(text, &utf8codepoint, UTF_SIZ);
> +                     utf8charlen = utf8decode(text, &utf8codepoint, 
> &utf8err);
>                       for (curfont = drw->fonts; curfont; curfont = 
> curfont->next) {
>                               charexists = charexists || 
> XftCharExists(drw->dpy, curfont->xfont, utf8codepoint);
>                               if (charexists) {
> -- 
> 2.42.0
> 

> From 396a21976f08b3c66e521855084685ce93356756 Mon Sep 17 00:00:00 2001
> From: NRK <[email protected]>
> Date: Thu, 4 Jul 2024 21:27:47 +0000
> Subject: [PATCH 2/2] render invalid utf8 sequences as U+FFFD
> 
> previously drw_text would do the width calculations as if
> invalid utf8 sequences were replaced with U+FFFD but would pass
> the invalid utf8 sequence to xft to render where xft would just
> cut it off at the first invalid byte.
> 
> this change makes invalid utf8 render as U+FFFD and avoids
> sending invalid sequences to xft. the following can be used to
> check the behavior before and after the patch:
> 
>       $ printf "0\xef1234567\ntest" | dmenu
> 
> Ref: https://lists.suckless.org/dev/2407/35646.html
> ---
>  drw.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/drw.c b/drw.c
> index eb71da7..f151ae5 100644
> --- a/drw.c
> +++ b/drw.c
> @@ -237,7 +237,8 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
> int h, unsigned int lp
>       XftResult result;
>       int charexists = 0, overflow = 0;
>       /* keep track of a couple codepoints for which we have no match. */
> -     static unsigned int nomatches[128], ellipsis_width;
> +     static unsigned int nomatches[128], ellipsis_width, invalid_width;
> +     static const char invalid[] = "�";
>  
>       if (!drw || (render && (!drw->scheme || !w)) || !text || !drw->fonts)
>               return 0;
> @@ -257,6 +258,10 @@ drw_text(Drw *drw, int x, int y, unsigned int w, 
> unsigned int h, unsigned int lp
>       usedfont = drw->fonts;
>       if (!ellipsis_width && render)
>               ellipsis_width = drw_fontset_getwidth(drw, "...");
> +     if (!invalid_width) {
> +             invalid_width = -1; /* stop infinite recursion */
> +             invalid_width = drw_fontset_getwidth(drw, invalid);
> +     }
>       while (1) {
>               ew = ellipsis_len = utf8err = utf8charlen = utf8strlen = 0;
>               utf8str = text;
> @@ -284,9 +289,9 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
> int h, unsigned int lp
>                                               else
>                                                       utf8strlen = 
> ellipsis_len;
>                                       } else if (curfont == usedfont) {
> -                                             utf8strlen += utf8charlen;
>                                               text += utf8charlen;
> -                                             ew += tmpw;
> +                                             utf8strlen += utf8err ? 0 : 
> utf8charlen;
> +                                             ew += utf8err ? 0 : tmpw;
>                                       } else {
>                                               nextfont = curfont;
>                                       }
> @@ -294,7 +299,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned 
> int h, unsigned int lp
>                               }
>                       }
>  
> -                     if (overflow || !charexists || nextfont)
> +                     if (overflow || !charexists || nextfont || utf8err)
>                               break;
>                       else
>                               charexists = 0;
> @@ -309,6 +314,12 @@ drw_text(Drw *drw, int x, int y, unsigned int w, 
> unsigned int h, unsigned int lp
>                       x += ew;
>                       w -= ew;
>               }
> +             if (utf8err && (!render || invalid_width < w)) {
> +                     if (render)
> +                             drw_text(drw, x, y, w, h, 0, invalid, invert);
> +                     x += invalid_width;
> +                     w -= invalid_width;
> +             }
>               if (render && overflow)
>                       drw_text(drw, ellipsis_x, y, ellipsis_w, h, 0, "...", 
> invert);
>  
> -- 
> 2.42.0
> 


-- 
Kind regards,
Hiltjo

Re: [hackers] [PATCH][dmenu/libsl] render invalid utf8 bytes as U+FFFD

Reply via email to