Richard Sandiford <richard.sandif...@arm.com> writes:
> Richard Biener <richard.guent...@gmail.com> writes:
>> On Mon, Jul 29, 2019 at 11:11 AM Richard Sandiford
>> <richard.sandif...@arm.com> wrote:
>>>
>>> inchash::hash::add_wide_int operated directly on the raw encoding
>>> of the wide_int, including any redundant upper bits.  The problem
>>> with that is that the upper bits are only defined for some wide-int
>>> storage types (including wide_int itself).  wi::to_wide(tree) instead
>>> returns a value that is extended according to the signedness of the
>>> type (so that wi::to_widest can use the same encoding) while rtxes
>>> have the awkward special case of BI, which can be zero-extended
>>> rather than sign-extended.
>>>
>>> In the PR, we computed a hash for a "normal" sign-extended wide_int
>>> while the existing entries hashed wi::to_wide(tree).  This gives
>>> different results for unsigned types that have the top bit set.
>>>
>>> The patch fixes that by hashing the canonical sign-extended form even
>>> if the raw encoding happens to be different.
>>>
>>> Tested on aarch64-linux-gnu (with and without SVE), armeb-eabi
>>> and x86_64-linux-gnu.  OK to install?
>>
>> But only the most significant elt needs this treatment, no?  So
>> we can save some compile-time in the hashing by not doing
>> it for all elts.
>
> Yeah, we only need it for the most significant elt.  But the vast
> majority of constants need 64 bits or fewer of encoding anyway
> (even if the precision is greater than 64 bits), so that's usually the
> only one we need to process.  I'm not convinced handling multiple elts
> specially would pay off.
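To make the change concrete: the patched function hashes the
sign-extended form of every element instead of the raw encoding, along
the lines of the sketch below (a sketch of the approach rather than a
quote of the committed patch; sext_elt stands for an accessor that
returns elt (i) sign-extended according to the precision, and that is
a no-op for is_sign_extended storage):

  /* Add wide_int-based value X, hashing the canonical sign-extended
     form of each element rather than the raw encoding.  */
  template<typename T>
  void add_wide_int (const generic_wide_int<T> &x)
  {
    add_int (x.get_len ());
    for (unsigned i = 0; i < x.get_len (); i++)
      add_hwi (x.sext_elt (i));
  }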
To go into a bit more detail, the code for:

void
f1 (inchash::hash &h, const wide_int &x)
{
  h.add_wide_int (x);
}

doesn't change with the patch, because is_sign_extended is true for
wide_int.  The code on x86_64 from a bootstrap compiler is:

------------------------------------------------------------------------
        .cfi_startproc
        // Hash the length.
        movl    24(%rsi), %edx
        movl    (%rdi), %ecx
        movl    $-1640531527, %r8d
        subl    %edx, %r8d
        subl    %ecx, %edx
        movl    %r8d, %eax
        movl    %ecx, %r8d
        subl    %ecx, %eax
        shrl    $13, %r8d
        xorl    %eax, %r8d
        movl    %edx, %eax
        movl    %r8d, %edx
        subl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $8, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r8d
        shrl    $13, %ecx
        xorl    %eax, %ecx
        movl    %r8d, %eax
        movl    %ecx, %r8d
        subl    %ecx, %eax
        subl    %ecx, %edx
        shrl    $12, %r8d
        xorl    %eax, %r8d
        movl    %edx, %eax
        movl    %r8d, %edx
        subl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $16, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r8d
        shrl    $5, %ecx
        xorl    %eax, %ecx
        movl    %ecx, %eax
        subl    %ecx, %r8d
        subl    %ecx, %edx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %edx, %eax
        movl    %r8d, %edx
        subl    %r8d, %eax
        sall    $10, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        subl    %r8d, %eax
        xorl    %r8d, %r8d
        subl    %edx, %eax
        shrl    $15, %edx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        // Hash the elements.
        movl    24(%rsi), %edx
        testl   %edx, %edx
        je      .L444
        .p2align 4,,10
        .p2align 3
.L446:
        movl    %r8d, %edx
        movq    (%rsi,%rdx,8), %r9
        movq    %r9, %rdx
        subl    %eax, %r9d
        sarq    $32, %rdx
        movl    %r9d, %ecx
        movl    %eax, %r9d
        subl    %edx, %ecx
        shrl    $13, %r9d
        subl    %eax, %edx
        xorl    %ecx, %r9d
        movl    %edx, %ecx
        movl    %r9d, %edx
        subl    %r9d, %ecx
        subl    %r9d, %eax
        sall    $8, %edx
        xorl    %ecx, %edx
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r9d
        shrl    $13, %ecx
        xorl    %ecx, %eax
        movl    %r9d, %ecx
        movl    %eax, %r9d
        subl    %eax, %ecx
        subl    %eax, %edx
        shrl    $12, %r9d
        xorl    %ecx, %r9d
        movl    %edx, %ecx
        movl    %r9d, %edx
        subl    %r9d, %ecx
        subl    %r9d, %eax
        sall    $16, %edx
        xorl    %ecx, %edx
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r9d
        shrl    $5, %ecx
        xorl    %eax, %ecx
        movl    %ecx, %eax
        subl    %ecx, %r9d
        subl    %ecx, %edx
        shrl    $3, %eax
        xorl    %eax, %r9d
        movl    %edx, %eax
        movl    %r9d, %edx
        subl    %r9d, %eax
        sall    $10, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        addl    $1, %r8d
        subl    %r9d, %eax
        subl    %edx, %eax
        shrl    $15, %edx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        cmpl    %r8d, 24(%rsi)
        ja      .L446
.L444:
        ret
------------------------------------------------------------------------

The code for:

void
f2 (inchash::hash &h, const wide_int_ref &x)
{
  h.add_wide_int (x);
}

does change, and becomes:

------------------------------------------------------------------------
        .cfi_startproc
        // Hash the length.
        movl    8(%rsi), %ecx
        movl    (%rdi), %edx
        movl    $-1640531527, %r8d
        subl    %ecx, %r8d
        subl    %edx, %ecx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        shrl    $13, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $8, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $13, %edx
        xorl    %eax, %edx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        subl    %edx, %ecx
        shrl    $12, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $16, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        // Hash the elements.
        movl    8(%rsi), %edx
        testl   %edx, %edx
        je      .L457
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movq    (%rsi), %r11
        xorl    %r9d, %r9d
        movl    $64, %r10d
        .p2align 4,,10
        .p2align 3
.L453:
        movl    %r9d, %edx
        movl    12(%rsi), %ecx
        movq    (%r11,%rdx,8), %r8
        movl    %r9d, %edx
        sall    $6, %edx
        movl    %ecx, %ebx
        subl    %edx, %ebx
        cmpl    $63, %ebx
        ja      .L452
        movl    %r10d, %ebx
        subl    %ecx, %ebx
        movl    %ebx, %ecx
        addl    %edx, %ecx
        salq    %cl, %r8
        sarq    %cl, %r8
.L452:
        movq    %r8, %rcx
        movl    %eax, %edx
        subl    %eax, %r8d
        sarq    $32, %rcx
        shrl    $13, %edx
        subl    %ecx, %r8d
        subl    %eax, %ecx
        xorl    %edx, %r8d
        movl    %r8d, %edx
        subl    %r8d, %ecx
        subl    %r8d, %eax
        sall    $8, %edx
        xorl    %edx, %ecx
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $13, %edx
        xorl    %edx, %eax
        movl    %eax, %edx
        subl    %eax, %r8d
        subl    %eax, %ecx
        shrl    $12, %edx
        xorl    %edx, %r8d
        movl    %r8d, %edx
        subl    %r8d, %ecx
        subl    %r8d, %eax
        sall    $16, %edx
        xorl    %edx, %ecx
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        addl    $1, %r9d
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        cmpl    %r9d, 8(%rsi)
        ja      .L453
        popq    %rbx
        .cfi_def_cfa_offset 8
        ret
------------------------------------------------------------------------
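The code that the patch adds to the .L453 loop (the salq/sarq pair
behind the cmpl $63 check) is a conditional sign extension of the
element being hashed.  In isolation it is the usual shift idiom, as in
this standalone simplification for illustration (cf. sext_hwi in
hwint.h):

  #include <cstdint>

  /* Sign-extend the low PREC bits of SRC to 64 bits using the
     shift-left/arithmetic-shift-right idiom seen above.  */
  static inline int64_t
  sext_hwi (int64_t src, unsigned int prec)
  {
    if (prec == 64)
      return src;
    unsigned int shift = 64 - prec;
    /* Do the left shift in unsigned arithmetic to avoid signed
       overflow; the final right shift is the arithmetic one.  */
    return (int64_t) ((uint64_t) src << shift) >> shift;
  }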
If I instead change the function to something like:

  /* Add wide_int-based value X.  */
  template<typename T>
  void add_wide_int (const generic_wide_int<T> &x)
  {
    add_int (x.get_len ());
    for (unsigned i = 1; i < x.get_len (); i++)
      add_hwi (x.elt (i - 1));
    add_hwi (x.sext_elt (x.get_len () - 1));
  }

then the function becomes much bigger for both f1 and f2.
The f1 version is:

------------------------------------------------------------------------
        .cfi_startproc
        // Hash the length.
        movl    24(%rsi), %ecx
        movl    (%rdi), %edx
        movl    $-1640531527, %r8d
        subl    %ecx, %r8d
        subl    %edx, %ecx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        shrl    $13, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $8, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $13, %edx
        xorl    %eax, %edx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        subl    %edx, %ecx
        shrl    $12, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $16, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        // Hash the elements using elt().
        movl    24(%rsi), %edx
        cmpl    $1, %edx
        jbe     .L445
        xorl    %r9d, %r9d
        jmp     .L448
        .p2align 4,,10
        .p2align 3
.L454:
        subl    $1, %edx
        movq    (%rsi,%rdx,8), %r8
        sarq    $63, %r8
.L447:
        movq    %r8, %rcx
        subl    %eax, %r8d
        sarq    $32, %rcx
        movl    %r8d, %edx
        movl    %eax, %r8d
        subl    %ecx, %edx
        shrl    $13, %r8d
        subl    %eax, %ecx
        xorl    %edx, %r8d
        movl    %ecx, %edx
        movl    %r8d, %ecx
        subl    %r8d, %edx
        subl    %r8d, %eax
        sall    $8, %ecx
        xorl    %edx, %ecx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        movl    %eax, %edx
        movl    %ecx, %eax
        shrl    $13, %eax
        xorl    %edx, %eax
        movl    %r8d, %edx
        movl    %eax, %r8d
        subl    %eax, %edx
        subl    %eax, %ecx
        shrl    $12, %r8d
        xorl    %edx, %r8d
        subl    %r8d, %ecx
        subl    %r8d, %eax
        movl    %ecx, %edx
        movl    %r8d, %ecx
        sall    $16, %ecx
        xorl    %edx, %ecx
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        addq    $1, %r9
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        leal    1(%r9), %ecx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        movl    24(%rsi), %edx
        cmpl    %ecx, %edx
        jbe     .L445
.L448:
        cmpl    %r9d, %edx
        jbe     .L454
        movq    (%rsi,%r9,8), %r8
        jmp     .L447
        .p2align 4,,10
        .p2align 3
.L445:
        // Hash the final element using sext_elt().
        addl    $-1, %edx
        jc      .L450
        movabsq $34359738360, %rdx
        movq    (%rsi,%rdx), %rcx
        sarq    $63, %rcx
.L452:
        movq    %rcx, %rdx
        subl    %eax, %ecx
        sarq    $32, %rdx
        movl    %ecx, %esi
        movl    %eax, %ecx
        subl    %edx, %esi
        shrl    $13, %ecx
        subl    %eax, %edx
        xorl    %esi, %ecx
        movl    %edx, %esi
        movl    %ecx, %edx
        subl    %ecx, %esi
        subl    %ecx, %eax
        sall    $8, %edx
        xorl    %esi, %edx
        movl    %edx, %esi
        subl    %edx, %eax
        subl    %edx, %ecx
        shrl    $13, %esi
        xorl    %esi, %eax
        movl    %ecx, %esi
        movl    %eax, %ecx
        subl    %eax, %esi
        subl    %eax, %edx
        shrl    $12, %ecx
        xorl    %esi, %ecx
        movl    %edx, %esi
        movl    %ecx, %edx
        subl    %ecx, %esi
        subl    %ecx, %eax
        sall    $16, %edx
        xorl    %esi, %edx
        subl    %edx, %eax
        subl    %edx, %ecx
        movl    %eax, %esi
        movl    %edx, %eax
        shrl    $5, %eax
        xorl    %esi, %eax
        movl    %eax, %esi
        subl    %eax, %ecx
        subl    %eax, %edx
        shrl    $3, %esi
        xorl    %esi, %ecx
        movl    %ecx, %esi
        subl    %ecx, %edx
        sall    $10, %esi
        xorl    %esi, %edx
        subl    %ecx, %eax
        subl    %edx, %eax
        shrl    $15, %edx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        ret
.L450:
        movl    %edx, %edx
        movq    (%rsi,%rdx,8), %rcx
        jmp     .L452
        .cfi_endproc
------------------------------------------------------------------------
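For reference, the diamond from .L445 onwards comes from the bounds
check in the accessor itself.  Roughly, and as a simplified paraphrase
of the wide-int accessors rather than the exact source, elt() is shaped
like:

  /* Approximate shape of generic_wide_int::elt: an index at or beyond
     the encoded length reads as the sign mask (0 or -1, derived from
     the top stored element) rather than as a stored element.  */
  HOST_WIDE_INT
  elt (unsigned int i) const
  {
    if (i >= this->get_len ())
      return sign_mask ();         /* out-of-range read */
    return this->get_val ()[i];    /* normal read */
  }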
That's a lot of extra code. :-)  And by splitting the final element
out, we lose the optimisation that avoids the sign_mask path in elt(),
so that we end up with an if-then-else diamond that wasn't there
before:

        // Hash the final element using sext_elt().
        addl    $-1, %edx
        jc      .L450
        // sign_mask
        movabsq $34359738360, %rdx
        movq    (%rsi,%rdx), %rcx
        sarq    $63, %rcx
.L452:
        ...
.L450:
        // normal read
        movl    %edx, %edx
        movq    (%rsi,%rdx,8), %rcx
        jmp     .L452

The branch should of course be predictable.  And the branch around the
elt() handling should be predictable if it really is the case that most
wide ints don't need it.  But it's still a lot of code that isn't
marked cold.  (And marking it cold would make the optimisation
pointless, since the optimisation is targeting the case in which the
code is executed.)

Thanks,
Richard