On Mon, 10 Jan 2005, Matt Diephouse wrote:
> # New Ticket Created by Matt Diephouse
> # Please include the string: [perl #33747]
> # in the subject line of all future correspondence about this issue.
> # <URL: http://rt.perl.org:80/rt3/Ticket/Display.html?id=33747 >
>
> I'm not sure what this should print, but I'm pretty sure the right
> answer isn't "8".
>
> .sub main @MAIN
> $S0 = ""
> $S1 = substr $S0, -1, 1
> print $S1
> print "\n"
> end
> .end
>
> If you s/-1, 1/-1/, it doesn't print the 8.
Well, you're trying to take a non-zero length substring from out of a
zero-length string, so I would contend that Parrot ought to throw an
exception. The reason that it doesn't is that this code in
string_substr() in string.c does not act as intended for a zero-length
string:
...
true_offset = (UINTVAL)offset;
/* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
if (offset == (INTVAL)string_length(interpreter, src) || length < 1) {
return string_make_empty(interpreter, enum_stringrep_one, 0);
}
if (offset < 0) {
true_offset = (UINTVAL)(src->strlen + offset);
}
if (true_offset > src->strlen - 1) { /* 0 based... */
internal_exception(SUBSTR_OUT_OF_STRING,
"Cannot take substr outside string");
}
...
As best I can tell, the intent of the code is this:
1) If you ask for a substring (of arbitrary length) starting at one
character position past the end of the string, then return an
empty string.
2) If you ask for a zero-length substring, then return an empty string,
regardless of the supplied offset
3) If the supplied offset is before the start or after the end of the
string, then throw an exception.
4) Otherwise, continue.
The problem here is with step three, and in particular the comparison
between true_offset and src->strlen - 1. This does not work as intended
for a zero-length string, because src->strlen is an _unsigned_ int, and
so if src->strlen = 0, then src->strlen - 1 is very very large. The
patch below fixes this bug while otherwise keeping the behaviour the
same, but I'm posting it to the list rather than simply applying it
because it is not clear to me that the behaviour described above is
precisely what we want. Specifically, should this code:
set S0, ""
substr S2, S0, 0, 1
throw an exception or not? At the moment, it does not (even with my
patch), because the offset == the string length. On the other hand,
this code:
set S0, ""
substr S2, S0, -1, 1
will throw an exception. Is this behaviour correct?
Incidentally, the reason that:
set S0, ""
substr S2, S0, -1
works is that the code passes in the length of S0 as the length
parameter, and since this is zero, string_substr returns an empty
string, without any reference to the offset parameter.
Simon
PS I've also included a patch to t/op/string.t that adds tests which
feed zero length strings to the various forms of the substr op. The
tests assume that the behaviour described above is correct.
----------------------------------------------------------------------
Index: src/string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.229
diff -u -r1.229 string.c
--- src/string.c 3 Nov 2004 21:44:39 -0000 1.229
+++ src/string.c 10 Jan 2005 19:05:09 -0000
@@ -1421,7 +1421,7 @@
true_offset = (UINTVAL)(src->strlen + offset);
}
- if (true_offset > src->strlen - 1) { /* 0 based... */
+ if (src->strlen == 0 || true_offset > src->strlen - 1) { /* 0 based... */
internal_exception(SUBSTR_OUT_OF_STRING,
"Cannot take substr outside string");
}
Index: t/op/string.t
===================================================================
RCS file: /cvs/public/parrot/t/op/string.t,v
retrieving revision 1.79
diff -u -r1.79 string.t
--- t/op/string.t 2 Jan 2005 11:34:55 -0000 1.79
+++ t/op/string.t 10 Jan 2005 19:06:18 -0000
@@ -16,7 +16,7 @@
=cut
-use Parrot::Test tests => 135;
+use Parrot::Test tests => 144;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "set_s_s|sc" );
@@ -512,6 +512,127 @@
end
CODE
+output_like( <<'CODE', <<'OUTPUT', "substr, +ve offset, zero-length string" );
+ set S0, ""
+ substr S1, S0, 10, 3
+ print S1
+ end
+CODE
+/Cannot take substr outside string/
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "substr, offset 0, zero-length string" );
+ set S0, ""
+ substr S1, S0, 0, 1
+ print S1
+ print "_\n"
+ end
+CODE
+_
+OUTPUT
+
+output_like( <<'CODE', <<'OUTPUT', "substr, offset -1, zero-length string" );
+ set S0, ""
+ substr S1, S0, -1, 1
+ print S1
+ end
+CODE
+/Cannot take substr outside string/
+OUTPUT
+
+output_like( <<'CODE', <<'OUTPUT', "substr, -ve offset, zero-length string" );
+ set S0, ""
+ substr S1, S0, -10, 5
+ print S1
+ end
+CODE
+/Cannot take substr outside string/
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "zero-length substr, zero-length string" );
+ set S0, ""
+ substr S1, S0, 10, 0
+ print S1
+ print "_\n"
+ end
+CODE
+_
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "zero-length substr, zero-length string" );
+ set S0, ""
+ substr S1, S0, -10, 0
+ print S1
+ print "_\n"
+ end
+CODE
+_
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "3-arg substr, zero-length string" );
+ set S0, ""
+ substr S1, S0, 2
+ print S1
+ print "_\n"
+ end
+CODE
+_
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "5 arg substr, zero-length string" );
+ set S0, ""
+ set S1, "xyz"
+ substr S2, S0, 0, 3, S1
+ print S0
+ print "\n"
+ print S1
+ print "\n"
+ print S2
+ print "\n"
+
+ set S3, ""
+ set S4, "abcde"
+ substr S5, S3, 0, 0, S4
+ print S3
+ print "\n"
+ print S4
+ print "\n"
+ print S5
+ print "\n"
+ end
+CODE
+xyz
+xyz
+
+abcde
+abcde
+
+OUTPUT
+
+output_is( <<'CODE', <<'OUTPUT', "4 arg substr replace, zero-length string" );
+ set S0, ""
+ set S1, "xyz"
+ substr S0, 0, 3, S1
+ print S0
+ print "\n"
+ print S1
+ print "\n"
+
+ set S2, ""
+ set S3, "abcde"
+ substr S2, 0, 0, S3
+ print S2
+ print "\n"
+ print S3
+ print "\n"
+ end
+CODE
+xyz
+xyz
+abcde
+abcde
+OUTPUT
+
output_is( <<'CODE', '<><', "concat_s_s|sc, null onto null" );
print "<>"
concat S0, S0