As specified in the POSIX standard [1], delimiters inside a bracket expression always keep their original (literal) meaning, thus 's/[\/]//' matches either '\' or '/'.
This commit no longer unescapes occurrences of "\DELIM" inside a bracket expression when parsing escape sequences in a regular expression, following the specification and the behavior of other implementations (GNU sed, NetBSD). A corresponding test is added as well. [1]: "Regular Expressions in sed" https://pubs.opengroup.org/onlinepubs/9799919799/utilities/sed.html Closes: http://lists.busybox.net/pipermail/busybox/2024-July/090844.html Fixes: e998c7c03 ("sed: fix handling of escaped delimiters in s/// search pattern, closes 14541") Signed-off-by: Yao Zi <zi...@disroot.org> --- editors/sed.c | 41 +++++++++++++++++++++++++++-------------- testsuite/sed.tests | 2 ++ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/editors/sed.c b/editors/sed.c index 6179c5e80..0f58a7807 100644 --- a/editors/sed.c +++ b/editors/sed.c @@ -254,18 +254,22 @@ static void cleanup_outname(void) if (G.outname) unlink(G.outname); } -/* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */ -static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to) +/* strcpy, replacing "\from" with 'to'. + * If to is NUL, replacing "\any" with 'any'. + * If re is 1, '\from' in bracket expression is not treated as escape sequence. + */ +static unsigned parse_escapes(char *dest, const char *string, int len, + char from, char to, int re) { char *d = dest; - int i = 0; + int i = 0, bracket = 0; if (len == -1) len = strlen(string); while (i < len) { if (string[i] == '\\') { - if (!to || string[i+1] == from) { + if (!bracket && (!to || string[i+1] == from)) { if ((*d = to ? 
to : string[i+1]) == '\0') return d - dest; i += 2; @@ -275,7 +279,12 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from i++; /* skip backslash in string[] */ *d++ = '\\'; /* fall through: copy next char verbatim */ + } else if (re && string[i] == '[') { + bracket++; + } else if (re && string[i] == ']') { + bracket--; } + if ((*d = string[i++]) == '\0') return d - dest; d++; @@ -284,7 +293,8 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from return d - dest; } -static char *copy_parsing_escapes(const char *string, int len, char delim) +static char *copy_parsing_escapes(const char *string, int len, char delim, + int re) { const char *s; char *dest = xmalloc(len + 1); @@ -292,14 +302,15 @@ static char *copy_parsing_escapes(const char *string, int len, char delim) /* sed recognizes \n */ /* GNU sed also recognizes \t and \r */ for (s = "\nn\tt\rr"; *s; s += 2) { - len = parse_escapes(dest, string, len, s[1], s[0]); + len = parse_escapes(dest, string, len, s[1], s[0], + re && delim == s[1]); string = dest; } if (delim) { /* we additionally unescape any instances of escaped delimiter. * For example, in 's+9\++X+' the pattern is "9+", not "9\+". 
*/ - len = parse_escapes(dest, string, len, delim, delim); + len = parse_escapes(dest, string, len, delim, delim, re); } return dest; } @@ -360,14 +371,15 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace) /* save the match string */ idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr); - *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter); + *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter, 1); + dbg("sed match: %s", *match); /* save the replacement string */ cmdstr_ptr += idx + 1; idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); //GNU sed 4.8: // echo 789 | sed 's&8&\&&' - 7&9 ("\&" remained "\&") // echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" become "11") - *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? delimiter : 0); + *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? delimiter : 0, 0); return ((cmdstr_ptr - cmdstr) + idx); } @@ -395,7 +407,7 @@ static int get_address(const char *my_str, int *linenum, regex_t ** regex) delimiter = *++pos; next = index_of_next_unescaped_regexp_delim(delimiter, ++pos); if (next != 0) { - temp = copy_parsing_escapes(pos, next, 0); + temp = copy_parsing_escapes(pos, next, 0, 0); G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t)); xregcomp(*regex, temp, G.regex_type); free(temp); @@ -590,10 +602,11 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr) cmdstr++; } len = strlen(cmdstr); - sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0); + sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0, 0); cmdstr += len; /* "\anychar" -> "anychar" */ - parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0'); + parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0', + 0); } /* handle file cmds: (r)ead */ else if (idx <= IDX_w) { /* r,w */ @@ -625,8 +638,8 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr) cmdstr += parse_regex_delim(cmdstr, 
&match, &replace)+1; /* \n already parsed, but \delimiter needs unescaping. */ - parse_escapes(match, match, -1, i, i); - parse_escapes(replace, replace, -1, i, i); + parse_escapes(match, match, -1, i, i, 1); + parse_escapes(replace, replace, -1, i, i, 0); sed_cmd->string = xzalloc((strlen(match) + 1) * 2); for (i = 0; match[i] && replace[i]; i++) { diff --git a/testsuite/sed.tests b/testsuite/sed.tests index 626542e33..1992b9de6 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests @@ -428,6 +428,8 @@ testing "sed understands duplicate file name" \ "" \ "a\nb\nc\n" +testing "sed don't escape delimiter in bracket expressions" \ + "sed 's/[\/]//'" '/' "" '\/' # testing "description" "commands" "result" "infile" "stdin" -- 2.46.0 _______________________________________________ busybox mailing list busybox@busybox.net http://lists.busybox.net/mailman/listinfo/busybox