On 04/03/2013 07:05 AM, Neil Hodgson wrote:
Dave Angel:
That would seem to imply that the speed regression on your data is NOT
caused by the differing size encodings. Perhaps it is the difference in
MSC compiler version, or other changes made between 3.2 and 3.3
Its not caused by there actually being different size encodings but
that the code is checking encoding size 2-4 times for each character.
Back in 3.2 the comparison loop looked like:
while (len1 > 0 && len2 > 0) {
Py_UNICODE c1, c2;
c1 = *s1++;
c2 = *s2++;
if (c1 != c2)
return (c1 < c2) ? -1 : 1;
len1--; len2--;
}
For 3.3 this has changed to
for (i = 0; i < len1 && i < len2; ++i) {
Py_UCS4 c1, c2;
c1 = PyUnicode_READ(kind1, data1, i);
c2 = PyUnicode_READ(kind2, data2, i);
if (c1 != c2)
return (c1 < c2) ? -1 : 1;
}
with PyUnicode_READ being
#define PyUnicode_READ(kind, data, index) \
((Py_UCS4) \
((kind) == PyUnicode_1BYTE_KIND ? \
((const Py_UCS1 *)(data))[(index)] : \
((kind) == PyUnicode_2BYTE_KIND ? \
((const Py_UCS2 *)(data))[(index)] : \
((const Py_UCS4 *)(data))[(index)] \
) \
))
There are either 1 or 2 kind checks in each call to PyUnicode_READ
and 2 calls to PyUnicode_READ inside the loop. A compiler may decide to
move the kind checks out of the loop and specialize the loop but MSVC
2010 appears to not do so.
I don't know how good MSC's template logic is, but it seems this would
be a good case for an explicit template, typed on the 'kind's values.
Or are all C++ features disabled when compiling Python? Failing that,
just code up 9 cases, and do a switch on the kinds.
I'm also puzzled. I thought that the sort algorithm used a hash of all
the items to be sorted, and only reverted to a raw comparison of the
original values when the hash collided. Is that not the case? Or is
the code you post here only used when the hash collides?
The assembler (32-bit build) for each
PyUnicode_READ looks like
mov ecx, DWORD PTR _kind1$[ebp]
cmp ecx, 1
jne SHORT $LN17@unicode_co@2
lea ecx, DWORD PTR [ebx+eax]
movzx edx, BYTE PTR [ecx+edx]
jmp SHORT $LN16@unicode_co@2
$LN17@unicode_co@2:
cmp ecx, 2
jne SHORT $LN15@unicode_co@2
movzx edx, WORD PTR [ebx+edi]
jmp SHORT $LN16@unicode_co@2
$LN15@unicode_co@2:
mov edx, DWORD PTR [ebx+esi]
$LN16@unicode_co@2:
It appears that the compiler is keeping the three pointers in three
separate registers (eax, esi and edi) even though those are 3 aliases
for the same pointer. This is preventing it from putting other values
in those registers.
It'd probably do better if the C code manipulated the pointers, rather
than using an index i each time. But if it did, perhaps gcc would
generate worse code.
If I were coding the assembler by hand (Intel only), I'd be able to
avoid the multiple cmp operations, simply by comparing first to 2, then
doing a jne and a ja. I dunno whether the compiler would notice if I
coded the equivalent in C. (make both comparisons to 2, one for less,
and one for more)
The kind1/kind2 variables aren't even going into registers and at
least one test+branch and a jump are executed for every character. Two
tests for 2 and 4 byte kinds. len1 and len2 don't get to go into
registers either.
Here's the full assembler output for unicode_compare:
; COMDAT _unicode_compare
_TEXT SEGMENT
_kind2$ = -20 ; size = 4
_kind1$ = -16 ; size = 4
_len2$ = -12 ; size = 4
_len1$ = -8 ; size = 4
_data2$ = -4 ; size = 4
_unicode_compare PROC ; COMDAT
; _str1$ = ecx
; _str2$ = eax
; 10417: {
push ebp
mov ebp, esp
sub esp, 20 ; 00000014H
push ebx
push esi
mov esi, eax
; 10418: int kind1, kind2;
; 10419: void *data1, *data2;
; 10420: Py_ssize_t len1, len2, i;
; 10421:
; 10422: kind1 = PyUnicode_KIND(str1);
mov eax, DWORD PTR [ecx+16]
mov edx, eax
shr edx, 2
and edx, 7
push edi
mov DWORD PTR _kind1$[ebp], edx
; 10423: kind2 = PyUnicode_KIND(str2);
mov edx, DWORD PTR [esi+16]
mov edi, edx
shr edi, 2
and edi, 7
mov DWORD PTR _kind2$[ebp], edi
; 10424: data1 = PyUnicode_DATA(str1);
test al, 32 ; 00000020H
je SHORT $LN9@unicode_co@2
test al, 64 ; 00000040H
je SHORT $LN7@unicode_co@2
lea ebx, DWORD PTR [ecx+24]
jmp SHORT $LN10@unicode_co@2
$LN7@unicode_co@2:
lea ebx, DWORD PTR [ecx+36]
jmp SHORT $LN10@unicode_co@2
$LN9@unicode_co@2:
mov ebx, DWORD PTR [ecx+36]
$LN10@unicode_co@2:
; 10425: data2 = PyUnicode_DATA(str2);
test dl, 32 ; 00000020H
je SHORT $LN13@unicode_co@2
test dl, 64 ; 00000040H
je SHORT $LN11@unicode_co@2
lea edx, DWORD PTR [esi+24]
jmp SHORT $LN30@unicode_co@2
$LN11@unicode_co@2:
lea eax, DWORD PTR [esi+36]
mov DWORD PTR _data2$[ebp], eax
mov edx, eax
jmp SHORT $LN14@unicode_co@2
$LN13@unicode_co@2:
mov edx, DWORD PTR [esi+36]
$LN30@unicode_co@2:
mov DWORD PTR _data2$[ebp], edx
$LN14@unicode_co@2:
; 10426: len1 = PyUnicode_GET_LENGTH(str1);
mov edi, DWORD PTR [ecx+8]
; 10427: len2 = PyUnicode_GET_LENGTH(str2);
mov ecx, DWORD PTR [esi+8]
; 10428:
; 10429: for (i = 0; i < len1 && i < len2; ++i) {
xor eax, eax
mov DWORD PTR _len1$[ebp], edi
mov DWORD PTR _len2$[ebp], ecx
test edi, edi
jle SHORT $LN2@unicode_co@2
; 10426: len1 = PyUnicode_GET_LENGTH(str1);
mov esi, edx
mov edi, edx
; 10428:
; 10429: for (i = 0; i < len1 && i < len2; ++i) {
sub ebx, edx
jmp SHORT $LN4@unicode_co@2
$LL28@unicode_co@2:
mov edx, DWORD PTR _data2$[ebp]
$LN4@unicode_co@2:
cmp eax, ecx
jge SHORT $LN29@unicode_co@2
; 10430: Py_UCS4 c1, c2;
; 10431: c1 = PyUnicode_READ(kind1, data1, i);
mov ecx, DWORD PTR _kind1$[ebp]
cmp ecx, 1
jne SHORT $LN17@unicode_co@2
lea ecx, DWORD PTR [ebx+eax]
movzx edx, BYTE PTR [ecx+edx]
jmp SHORT $LN16@unicode_co@2
$LN17@unicode_co@2:
cmp ecx, 2
jne SHORT $LN15@unicode_co@2
movzx edx, WORD PTR [ebx+edi]
jmp SHORT $LN16@unicode_co@2
$LN15@unicode_co@2:
mov edx, DWORD PTR [ebx+esi]
$LN16@unicode_co@2:
; 10432: c2 = PyUnicode_READ(kind2, data2, i);
mov ecx, DWORD PTR _kind2$[ebp]
cmp ecx, 1
jne SHORT $LN21@unicode_co@2
mov ecx, DWORD PTR _data2$[ebp]
movzx ecx, BYTE PTR [eax+ecx]
jmp SHORT $LN20@unicode_co@2
$LN21@unicode_co@2:
cmp ecx, 2
jne SHORT $LN19@unicode_co@2
movzx ecx, WORD PTR [edi]
jmp SHORT $LN20@unicode_co@2
$LN19@unicode_co@2:
mov ecx, DWORD PTR [esi]
$LN20@unicode_co@2:
; 10433:
; 10434: if (c1 != c2)
cmp edx, ecx
jne SHORT $LN31@unicode_co@2
mov ecx, DWORD PTR _len2$[ebp]
inc eax
add edi, 2
add esi, 4
cmp eax, DWORD PTR _len1$[ebp]
jl SHORT $LL28@unicode_co@2
$LN29@unicode_co@2:
mov edi, DWORD PTR _len1$[ebp]
$LN2@unicode_co@2:
; 10436: }
; 10437:
; 10438: return (len1 < len2) ? -1 : (len1 != len2);
cmp edi, ecx
jge SHORT $LN23@unicode_co@2
pop edi
pop esi
or eax, -1
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
$LN31@unicode_co@2:
; 10435: return (c1 < c2) ? -1 : 1;
sbb eax, eax
pop edi
and eax, -2 ; fffffffeH
pop esi
inc eax
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
$LN23@unicode_co@2:
; 10436: }
; 10437:
; 10438: return (len1 < len2) ? -1 : (len1 != len2);
xor eax, eax
cmp edi, ecx
pop edi
pop esi
setne al
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
_unicode_compare ENDP
Neil
--
DaveA
--
http://mail.python.org/mailman/listinfo/python-list