Changeset: 431bfeaaa76b for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/431bfeaaa76b Branch: default Log Message:
Merges regexp branch diffs (101 lines): diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -733,6 +733,7 @@ single_replace(pcre *pcre_code, pcre_ext int offset = 0; int len_result = 0; int addlen; + int empty_match_correction = 0; char *tmp; do { @@ -740,7 +741,12 @@ single_replace(pcre *pcre_code, pcre_ext exec_options, ovector, ovecsize); if (j <= 0) break; - addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0); + + empty_match_correction = ovector[0] == ovector[1] ? 1 : 0; + + // calculate the length of the string that will be appended to result + addlen = ovector[0] - offset + + (nbackrefs == 0 ? len_replacement : 0) + empty_match_correction; if (len_result + addlen >= *max_result) { tmp = GDKrealloc(result, len_result + addlen + 1); if (tmp == NULL) { @@ -750,11 +756,13 @@ single_replace(pcre *pcre_code, pcre_ext result = tmp; *max_result = len_result + addlen + 1; } + // append to the result the parts of the original string that are left unchanged if (ovector[0] > offset) { strncpy(result + len_result, origin_str + offset, ovector[0] - offset); len_result += ovector[0] - offset; } + // append to the result the replacement of the matched string if (nbackrefs == 0) { strncpy(result + len_result, replacement, len_replacement); len_result += len_replacement; @@ -807,8 +815,18 @@ single_replace(pcre *pcre_code, pcre_ext len_result += addlen; } } - offset = ovector[1]; - } while (offset < len_origin_str && global); + // In case of an empty match just advance the offset by 1 + offset = ovector[1] + empty_match_correction; + // and copy the character that we just advanced over + if (empty_match_correction) { + strncpy(result + len_result, origin_str + ovector[1], 1); + ++len_result; + } + // before we loop around check with the offset - 1 if we had an empty match + // since we manually advanced the offset by one. 
otherwise we would skip a + // replacement at the end of the string + } while ((offset - empty_match_correction) < len_origin_str && global); + if (offset < len_origin_str) { addlen = len_origin_str - offset; if (len_result + addlen >= *max_result) { diff --git a/sql/test/Tests/regexp.test b/sql/test/Tests/regexp.test --- a/sql/test/Tests/regexp.test +++ b/sql/test/Tests/regexp.test @@ -54,9 +54,36 @@ select regexp_replace('foo', 'f o o', 'X ---- foo -# regex option - not extended +# regex option - extended query T rowsort select regexp_replace('foo', 'f o o', 'XYZ', 'x') ---- XYZ +# regex option - not empty match +query T rowsort +select regexp_replace('foobar', 'k?', 'XY') +---- +foobar + +# regex option - empty match +query T rowsort +select regexp_replace('foobar', 'k?', '-', 'e') +---- +-f-o-o-b-a-r- + +# regex option - empty match w alternative v1 +query T rowsort +select regexp_replace('abc', 'b|k?', '-', 'e') +---- +-a--c- + +# regex option - empty match w alternative v2 +# even though you would expect -a--c- the pcre lib does not return +# the longest match for this particular pattern at offset 1 ('b') but +# an empty string match ¯\_(ツ)_/¯ +query T rowsort +select regexp_replace('abc', 'k?|b', '-', 'e') +---- +-a-b-c- + _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org