Hello, Internals!
I was fixing #72152 when it became apparent that the base64_decode
function is very buggy.
- Null byte ends processing.
- "V" produces empty result, while "V=" fails. Not very logical.
- Too short padding is allowed, e.g. "VV=" works like "VV==".
- Extra padding is allowed (like "V=====").
- Invalid padding is allowed ("=VVV=", "VV=V=", "VVV==") except on the
second place of a 24-bit run ("V=VV=" fails).
- In strict mode, space between padding fails:
"V V==" and "VV ==" and "VV== " are allowed,
"VV= =" fails.
- In strict mode, after a padding, one character is skipped, so "VVV=V"
decodes to "UU" (should be "UUU"), and "VVVV=*" decodes to "UUU" instead
of failing.
For each of the above, what would be the preferred behaviour in default
mode and strict mode?
Affected existing tests:
- ext/openssl/tests/bug61124.phpt uses "kzo w2RMExUTYQXW2Xzxmg==" as an
invalid base64 string, based on the invalid padding.
- ext/standard/tests/file/stream_rfc2397_006.phpt tests
"#Zm9vYmFyIGZvb2Jhcg==" and excepts this to be valid, while "#" is
clearly not valid base64. This also raises a question whether fragments
should be skipped in data uri handling.
Suggestions?
I've created a bug-for-bug compatible rewrite of base64_decode [1], with
all the bugs neatly and specifically implemented and missing features
commented out, so it's now very simple to fix them one by one.
I've also attached a test script that tests "all" possible combinations
of data, padding, NUL and other invalid characters, and my first patch
indeed provides identical results to the old implementation.
Currently interesting lines in the test results:
'base64' 'default' 'strict'
'V' '' ''
'V=' (false) (false)
'VV=' 'U' 'U'
'VV==' 'U' 'U'
'V=====' (false) (false)
'=VVV=' 'UU' (false)
'VV=V=' 'UU' (false)
'VVV==' 'UU' 'UU'
'V=VV=' (false) (false)
'V V==' 'U' 'U'
'VV ==' 'U' 'U'
'VV== ' 'U' 'U'
'VV= =' 'U' (false)
'VVV=V' 'UUU' 'UU'
'VVVV=*' 'UUU' 'UUU'
'VVVVVV=V' 'UUUUU' 'UUUU'
'VVVVVV=*' 'UUUU' 'UUUU'
'VVVV===*' 'UUU' 'UUU'
'VVV====V' 'UUU' 'UU'
'VVV====*' 'UU' 'UU'
'VV=====V' 'UU' 'U'
'VV=====*' 'U' 'U'
'=======*' '' ''
--
Lauri Kenttä
<?php
function str_format($s) {
return is_string($s) ? "'$s'" : ($s === false ? "(false)" : "(".gettype($s)." ???)");
}
function line($a, $b, $c) {
#if ($c !== false && ($b !== $c || strpos($a, "*") !== false || strpos($a, "\0") !== false))
printf("%-8s \t %-8s \t %-8s\n", str_format($a), str_format($b), str_format($c));
}
line("base64", "default", "strict");
foreach (["V", "V=", "VV=", "VV==", "V=====", "=VVV=", "VV=V=", "VVV==", "V=VV=", "V V==", "VV ==", "VV== ", "VV= =", "VVV=V", "VVVV=*"] as $v) {
line($v, base64_decode($v), base64_decode($v, true));
}
$t = "V=*\0";
for ($i = 0; $i < (1<<16); ++$i) {
$v = ltrim($t[($i>>14)&3].$t[($i>>12)&3].$t[($i>>10)&3].$t[($i>>8)&3].$t[($i>>6)&3].$t[($i>>4)&3].$t[($i>>2)&3].$t[($i>>0)&3], "\0");
line($v, base64_decode($v), base64_decode($v, true));
}
commit a38510eac7b852bdb8d580184802f0d5090b35fd
Author: Lauri Kenttä <lauri.ken...@gmail.com>
Date: Sun May 22 13:11:47 2016 +0300
base64_decode: reimplement cleanly (bug-for-bug)
diff --git a/ext/standard/base64.c b/ext/standard/base64.c
index 81f826c..bc63329 100644
--- a/ext/standard/base64.c
+++ b/ext/standard/base64.c
@@ -135,75 +135,99 @@ PHPAPI zend_string *php_base64_decode(const unsigned char *str, size_t length) /
PHPAPI zend_string *php_base64_decode_ex(const unsigned char *str, size_t length, zend_bool strict) /* {{{ */
{
- const unsigned char *current = str;
- int ch, i = 0, j = 0, k;
- /* this sucks for threaded environments */
+ int val_b64, padding = 0;
+ size_t i, n_in = 0, n_out = 0;
zend_string *result;
- result = zend_string_alloc(length, 0);
+ result = zend_string_alloc((length + 3) / 4 * 3, 0);
/* run through the whole string, converting as we go */
- while ((ch = *current++) != '\0' && length-- > 0) {
- if (ch == base64_pad) {
- if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) {
- if ((i % 4) != 1) {
- while (isspace(*(++current))) {
- continue;
- }
- if (*current == '\0') {
- continue;
- }
+ for (i = 0; i < length; ++i) {
+ /* stop on null byte */
+ /* FIXME: this is wrong behaviour, remove this! */
+ if (str[i] == 0) {
+ break;
+ }
+ /* count padding characters */
+ if (str[i] == base64_pad) {
+ /* fail if the padding character is second in a group (like A===) */
+ /* FIXME: why we still allow invalid padding in other places in the middle of the string? */
+ if (n_in % 4 == 1) {
+ goto fail;
+ }
+ /* in strict mode, when the padding ends, skip one (any) character, skip whitespaces,
+ * and return FALSE if the next character is not NUL, otherwise return the current decoded string */
+ /* FIXME: this is wrong behaviour and may read past-the-end, remove this! */
+ if (strict && i != length - 1 && str[i+1] != base64_pad) {
+ i += 2;
+ while (isspace(str[i])) {
+ i += 1;
}
- zend_string_free(result);
- return NULL;
+ if (str[i] == 0) {
+ break;
+ }
+ goto fail;
+ }
+ /* strict: fail if there is a space between padding characters */
+ /* FIXME: this is wrong behaviour, remove this! */
+ if (strict && padding && str[i-1] != base64_pad) {
+ goto fail;
}
+ /* strict: maximum padding is two characters */
+ /* FIXME: enable this!
+ if (strict && padding == 2) {
+ goto fail;
+ }
+ */
+ ++padding;
continue;
}
-
- ch = base64_reverse_table[ch];
- if ((!strict && ch < 0) || ch == -1) { /* a space or some other separator character, we simply skip over */
+ val_b64 = base64_reverse_table[str[i]];
+ /* spaces and unknown characters */
+ if (val_b64 < 0) {
+ /* strict: fail on unknown characters */
+ if (strict && val_b64 == -2) {
+ goto fail;
+ }
continue;
- } else if (ch == -2) {
- zend_string_free(result);
- return NULL;
}
+ /* strict: fail if data follows padding */
+ if (strict && padding) {
+ goto fail;
+ }
+ /* forget invalid padding */
+ padding = 0;
- switch(i % 4) {
+ switch (n_in++ % 4) {
case 0:
- ZSTR_VAL(result)[j] = ch << 2;
+ ZSTR_VAL(result)[n_out] = val_b64 << 2;
break;
case 1:
- ZSTR_VAL(result)[j++] |= ch >> 4;
- ZSTR_VAL(result)[j] = (ch & 0x0f) << 4;
+ ZSTR_VAL(result)[n_out++] |= val_b64 >> 4;
+ ZSTR_VAL(result)[n_out] = (val_b64 & 0x0f) << 4;
break;
case 2:
- ZSTR_VAL(result)[j++] |= ch >>2;
- ZSTR_VAL(result)[j] = (ch & 0x03) << 6;
+ ZSTR_VAL(result)[n_out++] |= val_b64 >> 2;
+ ZSTR_VAL(result)[n_out] = (val_b64 & 0x03) << 6;
break;
case 3:
- ZSTR_VAL(result)[j++] |= ch;
+ ZSTR_VAL(result)[n_out++] |= val_b64;
break;
}
- i++;
}
-
- k = j;
- /* mop things up if we ended on a boundary */
- if (ch == base64_pad) {
- switch(i % 4) {
- case 1:
- zend_string_free(result);
- return NULL;
- case 2:
- k++;
- case 3:
- ZSTR_VAL(result)[k] = 0;
- }
+ /* FIXME: fail if the last 24-bit sequence had only 6 bits set (like A===)
+ if (n_in % 4 == 1) {
+ goto fail;
}
- ZSTR_LEN(result) = j;
+ */
+
+ ZSTR_LEN(result) = n_out;
ZSTR_VAL(result)[ZSTR_LEN(result)] = '\0';
return result;
+fail:
+ zend_string_free(result);
+ return NULL;
}
/* }}} */
--
PHP Internals - PHP Runtime Development Mailing List
To unsubscribe, visit: http://www.php.net/unsub.php