Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: py3.6
Changeset: r97468:bd340e819dcd
Date: 2019-09-12 18:48 +0200
http://bitbucket.org/pypy/pypy/changeset/bd340e819dcd/

Log:    now I get to what I actually wanted to achieve: a fast path in
        utf8_encode_utf_8 for the common case where no surrogates are
        present

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -212,27 +212,38 @@
         self.old = old
 
 def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False):
-    assert isinstance(s, str)
     size = len(s)
     if size == 0:
         return ''
+
+    # two fast paths
+    if allow_surrogates:
+        # already valid utf-8 with surrogates, surrogates are allowed, so just
+        # return
+        return s
+    if not rutf8.has_surrogates(s):
+        # already valid utf-8 and doesn't contain surrogates, so we don't need
+        # to do anything
+        return s
+    # annoying slow path
+    return _utf8_encode_utf_8_deal_with_surrogates(s, errors, errorhandler)
+
+def _utf8_encode_utf_8_deal_with_surrogates(s, errors, errorhandler):
     pos = 0
     upos = 0
+    size = len(s)
     result = StringBuilder(size)
     while pos < size:
         try:
-            lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates, start=pos)
-            if pos == 0:
-                # fast path
-                return s
-            for ch in s[pos:]:
-                result.append(ch)
+            rutf8.check_utf8(s, allow_surrogates=False, start=pos)
+            # otherwise the fast path above would have triggered
+            assert pos != 0
+            result.append_slice(s, pos, len(s))
             break
         except rutf8.CheckError as e:
             end = e.pos
             assert end >= 0
-            for ch in s[pos:end]:
-                result.append(ch)
+            result.append_slice(s, pos, end)
             upos += rutf8.codepoints_in_utf8(s, start=pos, end=end)
             pos = end
             # Try to get collect surrogates in one pass
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to