Serhiy Storchaka <storch...@gmail.com> added the comment:

I thought it was one error, and not two.

The updated patch adds tests and fixes minor mistake. 2.7 is not
affected by main security issue, but it contains one of mentioned bugs
(read 1 byte outside of the input array). A patch for 2.7 fixes this bug
and also includes tests.

----------
Added file: http://bugs.python.org/file25366/utf16_error_handling-3.2_4.patch
Added file: http://bugs.python.org/file25367/utf16_error_handling-2.7.patch

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue14579>
_______________________________________
diff -r b07488490001 Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py   Fri Apr 20 14:36:47 2012 +0200
+++ b/Lib/test/test_codecs.py   Wed Apr 25 20:08:37 2012 +0300
@@ -540,8 +540,19 @@
         )
 
     def test_errors(self):
-        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
-                          b"\xff", "strict", True)
+        tests = [
+            (b'\xff', '\ufffd'),
+            (b'A\x00Z', 'A\ufffd'),
+            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
+            (b'\x00\xd8', '\ufffd'),
+            (b'\x00\xd8A', '\ufffd'),
+            (b'\x00\xd8A\x00', '\ufffdA'),
+            (b'\x00\xdcA\x00', '\ufffdA'),
+        ]
+        for raw, expected in tests:
+            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
+                              raw, 'strict', True)
+            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
 
     def test_nonbmp(self):
         self.assertEqual("\U00010203".encode(self.encoding),
@@ -568,8 +579,19 @@
         )
 
     def test_errors(self):
-        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
-                          b"\xff", "strict", True)
+        tests = [
+            (b'\xff', '\ufffd'),
+            (b'\x00A\xff', 'A\ufffd'),
+            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
+            (b'\xd8\x00', '\ufffd'),
+            (b'\xd8\x00\xdc', '\ufffd'),
+            (b'\xd8\x00\x00A', '\ufffdA'),
+            (b'\xdc\x00\x00A', '\ufffdA'),
+        ]
+        for raw, expected in tests:
+            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
+                              raw, 'strict', True)
+            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
 
     def test_nonbmp(self):
         self.assertEqual("\U00010203".encode(self.encoding),
diff -r b07488490001 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c   Fri Apr 20 14:36:47 2012 +0200
+++ b/Objects/unicodeobject.c   Wed Apr 25 20:08:37 2012 +0300
@@ -3425,7 +3425,7 @@
     /* Unpack UTF-16 encoded data */
     p = unicode->str;
     q = (unsigned char *)s;
-    e = q + size - 1;
+    e = q + size;
 
     if (byteorder)
         bo = *byteorder;
@@ -3476,8 +3476,20 @@
 #endif
 
     aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
-    while (q < e) {
+    while (1) {
         Py_UNICODE ch;
+        if (e - q < 2) {
+            /* remaining byte at the end? (size should be even) */
+            if (q == e || consumed)
+                break;
+            errmsg = "truncated data";
+            startinpos = ((const char *)q) - starts;
+            endinpos = ((const char *)e) - starts;
+            outpos = p - PyUnicode_AS_UNICODE(unicode);
+            goto utf16Error;
+            /* The remaining input chars are ignored if the callback
+               chooses to skip the input */
+        }
         /* First check for possible aligned read of a C 'long'. Unaligned
            reads are more expensive, better to defer to another iteration. */
         if (!((size_t) q & LONG_PTR_MASK)) {
@@ -3546,8 +3558,8 @@
             }
             p = _p;
             q = _q;
-            if (q >= e)
-                break;
+            if (e - q < 2)
+                continue;
         }
         ch = (q[ihi] << 8) | q[ilo];
 
@@ -3559,10 +3571,10 @@
         }
 
         /* UTF-16 code pair: */
-        if (q > e) {
+        if (e - q < 2) {
             errmsg = "unexpected end of data";
             startinpos = (((const char *)q) - 2) - starts;
-            endinpos = ((const char *)e) + 1 - starts;
+            endinpos = ((const char *)e) - starts;
             goto utf16Error;
         }
         if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -3606,31 +3618,9 @@
                 &outpos,
                 &p))
             goto onError;
-    }
-    /* remaining byte at the end? (size should be even) */
-    if (e == q) {
-        if (!consumed) {
-            errmsg = "truncated data";
-            startinpos = ((const char *)q) - starts;
-            endinpos = ((const char *)e) + 1 - starts;
-            outpos = p - PyUnicode_AS_UNICODE(unicode);
-            if (unicode_decode_call_errorhandler(
-                    errors,
-                    &errorHandler,
-                    "utf16", errmsg,
-                    &starts,
-                    (const char **)&e,
-                    &startinpos,
-                    &endinpos,
-                    &exc,
-                    (const char **)&q,
-                    &unicode,
-                    &outpos,
-                    &p))
-                goto onError;
-            /* The remaining input chars are ignored if the callback
-               chooses to skip the input */
-        }
+        /* Update data because unicode_decode_call_errorhandler might have
+           changed the input object. */
+        aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
     }
 
     if (byteorder)
diff -r ab9d6c4907e7 Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py   Thu Apr 19 23:55:01 2012 +0200
+++ b/Lib/test/test_codecs.py   Wed Apr 25 20:08:19 2012 +0300
@@ -495,7 +495,20 @@
         )
 
     def test_errors(self):
-        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", 
"strict", True)
+        tests = [
+            (b'\xff', u'\ufffd'),
+            (b'A\x00Z', u'A\ufffd'),
+            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
+            (b'\x00\xd8', u'\ufffd'),
+            (b'\x00\xd8A', u'\ufffd'),
+            (b'\x00\xd8A\x00', u'\ufffdA'),
+            (b'\x00\xdcA\x00', u'\ufffdA'),
+        ]
+        for raw, expected in tests:
+            print('*****', raw, expected)
+            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
+                              raw, 'strict', True)
+            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
 
 class UTF16BETest(ReadTest):
     encoding = "utf-16-be"
@@ -516,7 +529,20 @@
         )
 
     def test_errors(self):
-        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", 
"strict", True)
+        tests = [
+            (b'\xff', u'\ufffd'),
+            (b'\x00A\xff', u'A\ufffd'),
+            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
+            (b'\xd8\x00', u'\ufffd'),
+            (b'\xd8\x00\xdc', u'\ufffd'),
+            (b'\xd8\x00\x00A', u'\ufffdA'),
+            (b'\xdc\x00\x00A', u'\ufffdA'),
+        ]
+        for raw, expected in tests:
+            print('*****', raw, expected)
+            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
+                              raw, 'strict', True)
+            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
 
 class UTF8Test(ReadTest):
     encoding = "utf-8"
diff -r ab9d6c4907e7 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c   Thu Apr 19 23:55:01 2012 +0200
+++ b/Objects/unicodeobject.c   Wed Apr 25 20:08:19 2012 +0300
@@ -2564,7 +2564,7 @@
         }
 
         /* UTF-16 code pair: */
-        if (q >= e) {
+        if (e - q < 2) {
             errmsg = "unexpected end of data";
             startinpos = (((const char *)q)-2)-starts;
             endinpos = ((const char *)e)-starts;
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to