I've extended old code that uses lex to accept UTF by massaging the
input stream before lex sees it: the UTF is parsed and each non-ASCII
Rune is encoded as '\33' (escape) followed by 4 hex digits.  A simple
lex rule then decodes the escapes for the benefit of yacc.

This encodes:

/*
 * lex can't cope with character sets wider than 8 bits, so convert
 * s to runes and encode non-ascii runes as <esc><hex><hex><hex><hex>.
 * ascii runes are copied through unchanged.
 *
 * s is a NUL-terminated UTF string and is not modified.
 * result is malloced (the caller frees it); 0 is returned if malloc fails.
 *
 * the ascii test is made on the rune value, not on the byte count:
 * per rune(2), chartorune returns 1 for a malformed byte and yields the
 * error rune, which must be escaped like any other non-ascii rune
 * rather than truncated into a single 8-bit char.
 */
char *
utf2lex(char *s)
{
        int nb, bytes;
        Rune r;
        char *news, *p, *ds;

        /* pass 1: count bytes needed by the converted string; watch for UTF */
        for (p = s, nb = 0; *p != '\0'; p += bytes, nb++) {
                bytes = chartorune(&r, p);
                if (r >= 0x80)
                        nb += 4;        /* <esc> + 4 hex digits replaces 1 char */
        }
        news = malloc(nb+1);
        if (news == 0)
                return 0;

        /* pass 2: convert s into new string */
        for (p = s, ds = news; *p != '\0'; p += bytes) {
                bytes = chartorune(&r, p);
                if (r < 0x80)
                        *ds++ = r;
                else {
                        /*
                         * the escape is exactly 4 hex digits wide, which is
                         * what pass 1 budgeted for and what the decoding lex
                         * rule matches; %.4ux is only a minimum width, so a
                         * rune above 0xFFFF would overrun the buffer.
                         * substitute U+FFFD (the Unicode replacement
                         * character) for anything that doesn't fit.
                         */
                        if (r > 0xFFFF)
                                r = 0xFFFD;
                        ds += sprint(ds, "\33%.4ux", (int)r);
                }
        }
        *ds = '\0';
        return news;
}

and this lex code decodes:

%{
/* decodes 4 hex digits into *rp; returns the rune's UTF form (defined below) */
char *lex2rune(Rune *rp, char *s);
/* defined elsewhere; presumably a strdup that does not return on failure -- confirm */
char *estrdup(char *);

/* rune decoded by the most recent escape match; returned to yacc as the token */
static Rune inrune;
%}
E       \33
%%
{E}....                 {
                        /*
                         * an escape followed by four characters, as produced
                         * by utf2lex: skip the escape (yytext+1), decode the
                         * hex digits into inrune, hand yacc a private copy of
                         * the rune's UTF string in yylval.charp, and return
                         * the rune value itself as the token.
                         */
                        yylval.charp = estrdup(lex2rune(&inrune, yytext+1));
                        return inrune;
                        }
%%
/*
 * parse the hex digits at s into *rp, then convert that rune to
 * a NUL-terminated UTF string.  the string lives in a static buffer
 * that is overwritten by the next call, so copy it to keep it.
 */
char *
lex2rune(Rune *rp, char *s)
{
        static char buf[UTFmax+1];
        int len;

        *rp = strtoul(s, 0, 16);
        len = runetochar(buf, rp);
        buf[len] = '\0';
        return buf;
}


Reply via email to