Author: Carl Friedrich Bolz-Tereick <[email protected]>
Branch: py3.6
Changeset: r97468:bd340e819dcd
Date: 2019-09-12 18:48 +0200
http://bitbucket.org/pypy/pypy/changeset/bd340e819dcd/
Log: now I get to what I actually wanted to achieve: a fast path in
utf8_encode_utf_8 for the common case where no surrogates are
present
diff --git a/pypy/interpreter/unicodehelper.py
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -212,27 +212,38 @@
self.old = old
def utf8_encode_utf_8(s, errors, errorhandler, allow_surrogates=False):
- assert isinstance(s, str)
size = len(s)
if size == 0:
return ''
+
+ # two fast paths
+ if allow_surrogates:
+ # already valid utf-8 with surrogates, surrogates are allowed, so just
+ # return
+ return s
+ if not rutf8.has_surrogates(s):
+ # already valid utf-8 and doesn't contain surrogates, so we don't need
+ # to do anything
+ return s
+ # annoying slow path
+ return _utf8_encode_utf_8_deal_with_surrogates(s, errors, errorhandler)
+
+def _utf8_encode_utf_8_deal_with_surrogates(s, errors, errorhandler):
pos = 0
upos = 0
+ size = len(s)
result = StringBuilder(size)
while pos < size:
try:
- lgt = rutf8.check_utf8(s, allow_surrogates=allow_surrogates,
start=pos)
- if pos == 0:
- # fast path
- return s
- for ch in s[pos:]:
- result.append(ch)
+ rutf8.check_utf8(s, allow_surrogates=False, start=pos)
+ # otherwise the fast path above would have triggered
+ assert pos != 0
+ result.append_slice(s, pos, len(s))
break
except rutf8.CheckError as e:
end = e.pos
assert end >= 0
- for ch in s[pos:end]:
- result.append(ch)
+ result.append_slice(s, pos, end)
upos += rutf8.codepoints_in_utf8(s, start=pos, end=end)
pos = end
# Try to get collect surrogates in one pass
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit