[PATCH] sed: don't escape delimiters in bracket expression

Yao Zi Sun, 01 Sep 2024 09:57:37 -0700

As specified in the POSIX standard[1], delimiters in a bracket expression
always keep their original meaning, so the bracket expression in
's/[\/]//' matches either '\' or '/'.

This commit skips occurrences of "\DELIM" inside a bracket expression
when parsing escape sequences in a regular expression, following the
specification and the behavior of other implementations (GNU sed,
NetBSD). A corresponding test is added as well.

[1]: "Regular Expressions in sed" 
https://pubs.opengroup.org/onlinepubs/9799919799/utilities/sed.html
Closes: http://lists.busybox.net/pipermail/busybox/2024-July/090844.html
Fixes: e998c7c03 ("sed: fix handling of escaped delimiters in s/// search pattern, closes 14541")
Signed-off-by: Yao Zi <zi...@disroot.org>
---
 editors/sed.c       | 41 +++++++++++++++++++++++++++--------------
 testsuite/sed.tests |  2 ++
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/editors/sed.c b/editors/sed.c
index 6179c5e80..0f58a7807 100644
--- a/editors/sed.c
+++ b/editors/sed.c
@@ -254,18 +254,22 @@ static void cleanup_outname(void)
        if (G.outname) unlink(G.outname);
 }
 
-/* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 
'any' */
-static unsigned parse_escapes(char *dest, const char *string, int len, char 
from, char to)
+/* strcpy, replacing "\from" with 'to'.
+ * If to is NUL, replacing "\any" with 'any'.
+ * If re is 1, '\from' in bracket expression is not treated as escape sequence.
+ */
+static unsigned parse_escapes(char *dest, const char *string, int len,
+                             char from, char to, int re)
 {
        char *d = dest;
-       int i = 0;
+       int i = 0, bracket = 0;
 
        if (len == -1)
                len = strlen(string);
 
        while (i < len) {
                if (string[i] == '\\') {
-                       if (!to || string[i+1] == from) {
+                       if (!bracket && (!to || string[i+1] == from)) {
                                if ((*d = to ? to : string[i+1]) == '\0')
                                        return d - dest;
                                i += 2;
@@ -275,7 +279,12 @@ static unsigned parse_escapes(char *dest, const char 
*string, int len, char from
                        i++; /* skip backslash in string[] */
                        *d++ = '\\';
                        /* fall through: copy next char verbatim */
+               } else if (re && string[i] == '[') {
+                       bracket++;
+               } else if (re && string[i] == ']') {
+                       bracket--;
                }
+
                if ((*d = string[i++]) == '\0')
                        return d - dest;
                d++;
@@ -284,7 +293,8 @@ static unsigned parse_escapes(char *dest, const char 
*string, int len, char from
        return d - dest;
 }
 
-static char *copy_parsing_escapes(const char *string, int len, char delim)
+static char *copy_parsing_escapes(const char *string, int len, char delim,
+                                 int re)
 {
        const char *s;
        char *dest = xmalloc(len + 1);
@@ -292,14 +302,15 @@ static char *copy_parsing_escapes(const char *string, int 
len, char delim)
        /* sed recognizes \n */
        /* GNU sed also recognizes \t and \r */
        for (s = "\nn\tt\rr"; *s; s += 2) {
-               len = parse_escapes(dest, string, len, s[1], s[0]);
+               len = parse_escapes(dest, string, len, s[1], s[0],
+                                   re && delim == s[1]);
                string = dest;
        }
        if (delim) {
                /* we additionally unescape any instances of escaped delimiter.
                 * For example, in 's+9\++X+' the pattern is "9+", not "9\+".
                 */
-               len = parse_escapes(dest, string, len, delim, delim);
+               len = parse_escapes(dest, string, len, delim, delim, re);
        }
        return dest;
 }
@@ -360,14 +371,15 @@ static int parse_regex_delim(const char *cmdstr, char 
**match, char **replace)
 
        /* save the match string */
        idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
-       *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter);
+       *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter, 1);
+       dbg("sed match: %s", *match);
        /* save the replacement string */
        cmdstr_ptr += idx + 1;
        idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, 
cmdstr_ptr);
 //GNU sed 4.8:
 // echo 789 | sed 's&8&\&&'       - 7&9  ("\&" remained "\&")
 // echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" become "11")
-       *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? 
delimiter : 0);
+       *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? 
delimiter : 0, 0);
 
        return ((cmdstr_ptr - cmdstr) + idx);
 }
@@ -395,7 +407,7 @@ static int get_address(const char *my_str, int *linenum, 
regex_t ** regex)
                        delimiter = *++pos;
                next = index_of_next_unescaped_regexp_delim(delimiter, ++pos);
                if (next != 0) {
-                       temp = copy_parsing_escapes(pos, next, 0);
+                       temp = copy_parsing_escapes(pos, next, 0, 0);
                        G.previous_regex_ptr = *regex = 
xzalloc(sizeof(regex_t));
                        xregcomp(*regex, temp, G.regex_type);
                        free(temp);
@@ -590,10 +602,11 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, 
const char *cmdstr)
                        cmdstr++;
                }
                len = strlen(cmdstr);
-               sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0);
+               sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0, 0);
                cmdstr += len;
                /* "\anychar" -> "anychar" */
-               parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0');
+               parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0',
+                             0);
        }
        /* handle file cmds: (r)ead */
        else if (idx <= IDX_w) { /* r,w */
@@ -625,8 +638,8 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const 
char *cmdstr)
 
                cmdstr += parse_regex_delim(cmdstr, &match, &replace)+1;
                /* \n already parsed, but \delimiter needs unescaping. */
-               parse_escapes(match,   match,   -1, i, i);
-               parse_escapes(replace, replace, -1, i, i);
+               parse_escapes(match,   match,   -1, i, i, 1);
+               parse_escapes(replace, replace, -1, i, i, 0);
 
                sed_cmd->string = xzalloc((strlen(match) + 1) * 2);
                for (i = 0; match[i] && replace[i]; i++) {
diff --git a/testsuite/sed.tests b/testsuite/sed.tests
index 626542e33..1992b9de6 100755
--- a/testsuite/sed.tests
+++ b/testsuite/sed.tests
@@ -428,6 +428,8 @@ testing "sed understands duplicate file name" \
        "" \
        "a\nb\nc\n"
 
+testing "sed don't escape delimiter in bracket expressions" \
+       "sed 's/[\/]//'" '/' "" '\/'
 
 # testing "description" "commands" "result" "infile" "stdin"
 
-- 
2.46.0

_______________________________________________
busybox mailing list
busybox@busybox.net
http://lists.busybox.net/mailman/listinfo/busybox

[PATCH] sed: don't escape delimiters in bracket expression

Reply via email to