On Mon, Jul 29, 2019 at 3:06 PM Richard Sandiford <richard.sandif...@arm.com> wrote:
>
> Richard Sandiford <richard.sandif...@arm.com> writes:
> > Richard Biener <richard.guent...@gmail.com> writes:
> >> On Mon, Jul 29, 2019 at 11:11 AM Richard Sandiford
> >> <richard.sandif...@arm.com> wrote:
> >>>
> >>> inchash::hash::add_wide_int operated directly on the raw encoding
> >>> of the wide_int, including any redundant upper bits. The problem
> >>> with that is that the upper bits are only defined for some wide-int
> >>> storage types (including wide_int itself). wi::to_wide(tree) instead
> >>> returns a value that is extended according to the signedness of the
> >>> type (so that wi::to_widest can use the same encoding) while rtxes
> >>> have the awkward special case of BI, which can be zero-extended
> >>> rather than sign-extended.
> >>>
> >>> In the PR, we computed a hash for a "normal" sign-extended wide_int
> >>> while the existing entries hashed wi::to_wide(tree). This gives
> >>> different results for unsigned types that have the top bit set.
> >>>
> >>> The patch fixes that by hashing the canonical sign-extended form even
> >>> if the raw encoding happens to be different.
> >>>
> >>> Tested on aarch64-linux-gnu (with and without SVE), armeb-eabi
> >>> and x86_64-linux-gnu. OK to install?
> >>
> >> But only the most significant elt needs this treatment, no? So
> >> we can save some compile-time in the hashing by not doing
> >> it for all elts.
> >
> > Yeah, we only need it for the most significant elt. But the vast
> > majority of constants need 64 bits or fewer of encoding anyway
> > (even if the precision is greater than 64 bits), so that's usually the
> > only one we need to process. I'm not convinced handling multiple elts
> > specially would pay off.
>
> To go into a bit more detail, the code for:
>
> void
> f1 (inchash::hash &h, const wide_int &x)
> {
>   h.add_wide_int (x);
> }
>
> doesn't change with the patch, because is_sign_extended is true for
> wide_int. The code on x86_64 from a bootstrap compiler is:
>
> ------------------------------------------------------------------------
> .cfi_startproc
> // Hash the length.
> movl 24(%rsi), %edx
> movl (%rdi), %ecx
> movl $-1640531527, %r8d
> subl %edx, %r8d
> subl %ecx, %edx
> movl %r8d, %eax
> movl %ecx, %r8d
> subl %ecx, %eax
> shrl $13, %r8d
> xorl %eax, %r8d
> movl %edx, %eax
> movl %r8d, %edx
> subl %r8d, %eax
> subl %r8d, %ecx
> sall $8, %edx
> xorl %eax, %edx
> movl %ecx, %eax
> movl %edx, %ecx
> subl %edx, %eax
> subl %edx, %r8d
> shrl $13, %ecx
> xorl %eax, %ecx
> movl %r8d, %eax
> movl %ecx, %r8d
> subl %ecx, %eax
> subl %ecx, %edx
> shrl $12, %r8d
> xorl %eax, %r8d
> movl %edx, %eax
> movl %r8d, %edx
> subl %r8d, %eax
> subl %r8d, %ecx
> sall $16, %edx
> xorl %eax, %edx
> movl %ecx, %eax
> movl %edx, %ecx
> subl %edx, %eax
> subl %edx, %r8d
> shrl $5, %ecx
> xorl %eax, %ecx
> movl %ecx, %eax
> subl %ecx, %r8d
> subl %ecx, %edx
> shrl $3, %eax
> xorl %eax, %r8d
> movl %edx, %eax
> movl %r8d, %edx
> subl %r8d, %eax
> sall $10, %edx
> xorl %eax, %edx
> movl %ecx, %eax
> subl %r8d, %eax
> xorl %r8d, %r8d
> subl %edx, %eax
> shrl $15, %edx
> xorl %edx, %eax
> movl %eax, (%rdi)
> // Hash the elements.
> movl 24(%rsi), %edx
> testl %edx, %edx
> je .L444
> .p2align 4,,10
> .p2align 3
> .L446:
> movl %r8d, %edx
> movq (%rsi,%rdx,8), %r9
> movq %r9, %rdx
> subl %eax, %r9d
> sarq $32, %rdx
> movl %r9d, %ecx
> movl %eax, %r9d
> subl %edx, %ecx
> shrl $13, %r9d
> subl %eax, %edx
> xorl %ecx, %r9d
> movl %edx, %ecx
> movl %r9d, %edx
> subl %r9d, %ecx
> subl %r9d, %eax
> sall $8, %edx
> xorl %ecx, %edx
> movl %edx, %ecx
> subl %edx, %eax
> subl %edx, %r9d
> shrl $13, %ecx
> xorl %ecx, %eax
> movl %r9d, %ecx
> movl %eax, %r9d
> subl %eax, %ecx
> subl %eax, %edx
> shrl $12, %r9d
> xorl %ecx, %r9d
> movl %edx, %ecx
> movl %r9d, %edx
> subl %r9d, %ecx
> subl %r9d, %eax
> sall $16, %edx
> xorl %ecx, %edx
> movl %edx, %ecx
> subl %edx, %eax
> subl %edx, %r9d
> shrl $5, %ecx
> xorl %eax, %ecx
> movl %ecx, %eax
> subl %ecx, %r9d
> subl %ecx, %edx
> shrl $3, %eax
> xorl %eax, %r9d
> movl %edx, %eax
> movl %r9d, %edx
> subl %r9d, %eax
> sall $10, %edx
> xorl %eax, %edx
> movl %ecx, %eax
> addl $1, %r8d
> subl %r9d, %eax
> subl %edx, %eax
> shrl $15, %edx
> xorl %edx, %eax
> movl %eax, (%rdi)
> cmpl %r8d, 24(%rsi)
> ja .L446
> .L444:
> ret
> ------------------------------------------------------------------------
>
> The code for:
>
> void
> f2 (inchash::hash &h, const wide_int_ref &x)
> {
>   h.add_wide_int (x);
> }
>
> does change, and becomes:
>
> ------------------------------------------------------------------------
> .cfi_startproc
> // Hash the length.
> movl 8(%rsi), %ecx
> movl (%rdi), %edx
> movl $-1640531527, %r8d
> subl %ecx, %r8d
> subl %edx, %ecx
> movl %r8d, %eax
> movl %edx, %r8d
> subl %edx, %eax
> shrl $13, %r8d
> xorl %eax, %r8d
> movl %ecx, %eax
> movl %r8d, %ecx
> subl %r8d, %eax
> subl %r8d, %edx
> sall $8, %ecx
> xorl %eax, %ecx
> movl %edx, %eax
> movl %ecx, %edx
> subl %ecx, %eax
> subl %ecx, %r8d
> shrl $13, %edx
> xorl %eax, %edx
> movl %r8d, %eax
> movl %edx, %r8d
> subl %edx, %eax
> subl %edx, %ecx
> shrl $12, %r8d
> xorl %eax, %r8d
> movl %ecx, %eax
> movl %r8d, %ecx
> subl %r8d, %eax
> subl %r8d, %edx
> sall $16, %ecx
> xorl %eax, %ecx
> movl %edx, %eax
> movl %ecx, %edx
> subl %ecx, %eax
> subl %ecx, %r8d
> shrl $5, %edx
> xorl %eax, %edx
> movl %edx, %eax
> subl %edx, %r8d
> subl %edx, %ecx
> shrl $3, %eax
> xorl %eax, %r8d
> movl %r8d, %eax
> subl %r8d, %ecx
> sall $10, %eax
> xorl %eax, %ecx
> subl %r8d, %edx
> subl %ecx, %edx
> shrl $15, %ecx
> movl %ecx, %eax
> xorl %edx, %eax
> movl %eax, (%rdi)
> // Hash the elements.
> movl 8(%rsi), %edx
> testl %edx, %edx
> je .L457
> pushq %rbx
> .cfi_def_cfa_offset 16
> .cfi_offset 3, -16
> movq (%rsi), %r11
> xorl %r9d, %r9d
> movl $64, %r10d
> .p2align 4,,10
> .p2align 3
> .L453:
> movl %r9d, %edx
> movl 12(%rsi), %ecx
> movq (%r11,%rdx,8), %r8
> movl %r9d, %edx
> sall $6, %edx
> movl %ecx, %ebx
> subl %edx, %ebx
> cmpl $63, %ebx
> ja .L452
> movl %r10d, %ebx
> subl %ecx, %ebx
> movl %ebx, %ecx
> addl %edx, %ecx
> salq %cl, %r8
> sarq %cl, %r8
> .L452:
> movq %r8, %rcx
> movl %eax, %edx
> subl %eax, %r8d
> sarq $32, %rcx
> shrl $13, %edx
> subl %ecx, %r8d
> subl %eax, %ecx
> xorl %edx, %r8d
> movl %r8d, %edx
> subl %r8d, %ecx
> subl %r8d, %eax
> sall $8, %edx
> xorl %edx, %ecx
> movl %ecx, %edx
> subl %ecx, %eax
> subl %ecx, %r8d
> shrl $13, %edx
> xorl %edx, %eax
> movl %eax, %edx
> subl %eax, %r8d
> subl %eax, %ecx
> shrl $12, %edx
> xorl %edx, %r8d
> movl %r8d, %edx
> subl %r8d, %ecx
> subl %r8d, %eax
> sall $16, %edx
> xorl %edx, %ecx
> movl %ecx, %edx
> subl %ecx, %eax
> subl %ecx, %r8d
> shrl $5, %edx
> xorl %eax, %edx
> movl %edx, %eax
> subl %edx, %r8d
> subl %edx, %ecx
> shrl $3, %eax
> xorl %eax, %r8d
> movl %r8d, %eax
> subl %r8d, %ecx
> sall $10, %eax
> xorl %eax, %ecx
> subl %r8d, %edx
> addl $1, %r9d
> subl %ecx, %edx
> shrl $15, %ecx
> movl %ecx, %eax
> xorl %edx, %eax
> movl %eax, (%rdi)
> cmpl %r9d, 8(%rsi)
> ja .L453
> popq %rbx
> .cfi_def_cfa_offset 8
> ret
> ------------------------------------------------------------------------
>
> If I instead change the function to something like:
>
> /* Add wide_int-based value V. */
> template<typename T>
> void add_wide_int (const generic_wide_int<T> &x)
> {
>   add_int (x.get_len ());
>   for (unsigned i = 1; i < x.get_len (); i++)
>     add_hwi (x.elt (i - 1));
>   add_hwi (x.sext_elt (x.get_len () - 1));
> }
>
> then the function becomes much bigger for both f1 and f2. The f1
> version is:
>
> ------------------------------------------------------------------------
> .cfi_startproc
> // Hash the length.
> movl 24(%rsi), %ecx
> movl (%rdi), %edx
> movl $-1640531527, %r8d
> subl %ecx, %r8d
> subl %edx, %ecx
> movl %r8d, %eax
> movl %edx, %r8d
> subl %edx, %eax
> shrl $13, %r8d
> xorl %eax, %r8d
> movl %ecx, %eax
> movl %r8d, %ecx
> subl %r8d, %eax
> subl %r8d, %edx
> sall $8, %ecx
> xorl %eax, %ecx
> movl %edx, %eax
> movl %ecx, %edx
> subl %ecx, %eax
> subl %ecx, %r8d
> shrl $13, %edx
> xorl %eax, %edx
> movl %r8d, %eax
> movl %edx, %r8d
> subl %edx, %eax
> subl %edx, %ecx
> shrl $12, %r8d
> xorl %eax, %r8d
> movl %ecx, %eax
> movl %r8d, %ecx
> subl %r8d, %eax
> subl %r8d, %edx
> sall $16, %ecx
> xorl %eax, %ecx
> movl %edx, %eax
> movl %ecx, %edx
> subl %ecx, %eax
> subl %ecx, %r8d
> shrl $5, %edx
> xorl %eax, %edx
> movl %edx, %eax
> subl %edx, %r8d
> subl %edx, %ecx
> shrl $3, %eax
> xorl %eax, %r8d
> movl %r8d, %eax
> subl %r8d, %ecx
> sall $10, %eax
> xorl %eax, %ecx
> subl %r8d, %edx
> subl %ecx, %edx
> shrl $15, %ecx
> movl %ecx, %eax
> xorl %edx, %eax
> movl %eax, (%rdi)
> // Hash the elements using elt().
> movl 24(%rsi), %edx
> cmpl $1, %edx
> jbe .L445
> xorl %r9d, %r9d
> jmp .L448
> .p2align 4,,10
> .p2align 3
> .L454:
> subl $1, %edx
> movq (%rsi,%rdx,8), %r8
> sarq $63, %r8
> .L447:
> movq %r8, %rcx
> subl %eax, %r8d
> sarq $32, %rcx
> movl %r8d, %edx
> movl %eax, %r8d
> subl %ecx, %edx
> shrl $13, %r8d
> subl %eax, %ecx
> xorl %edx, %r8d
> movl %ecx, %edx
> movl %r8d, %ecx
> subl %r8d, %edx
> subl %r8d, %eax
> sall $8, %ecx
> xorl %edx, %ecx
> subl %ecx, %eax
> subl %ecx, %r8d
> movl %eax, %edx
> movl %ecx, %eax
> shrl $13, %eax
> xorl %edx, %eax
> movl %r8d, %edx
> movl %eax, %r8d
> subl %eax, %edx
> subl %eax, %ecx
> shrl $12, %r8d
> xorl %edx, %r8d
> subl %r8d, %ecx
> subl %r8d, %eax
> movl %ecx, %edx
> movl %r8d, %ecx
> sall $16, %ecx
> xorl %edx, %ecx
> movl %ecx, %edx
> subl %ecx, %eax
> subl %ecx, %r8d
> shrl $5, %edx
> xorl %eax, %edx
> movl %edx, %eax
> subl %edx, %r8d
> subl %edx, %ecx
> shrl $3, %eax
> xorl %eax, %r8d
> movl %r8d, %eax
> subl %r8d, %ecx
> sall $10, %eax
> xorl %eax, %ecx
> subl %r8d, %edx
> addq $1, %r9
> subl %ecx, %edx
> shrl $15, %ecx
> movl %ecx, %eax
> leal 1(%r9), %ecx
> xorl %edx, %eax
> movl %eax, (%rdi)
> movl 24(%rsi), %edx
> cmpl %ecx, %edx
> jbe .L445
> .L448:
> cmpl %r9d, %edx
> jbe .L454
> movq (%rsi,%r9,8), %r8
> jmp .L447
> .p2align 4,,10
> .p2align 3
> .L445:
> // Hash the final element using sext_elt().
> addl $-1, %edx
> jc .L450
> movabsq $34359738360, %rdx
> movq (%rsi,%rdx), %rcx
> sarq $63, %rcx
> .L452:
> movq %rcx, %rdx
> subl %eax, %ecx
> sarq $32, %rdx
> movl %ecx, %esi
> movl %eax, %ecx
> subl %edx, %esi
> shrl $13, %ecx
> subl %eax, %edx
> xorl %esi, %ecx
> movl %edx, %esi
> movl %ecx, %edx
> subl %ecx, %esi
> subl %ecx, %eax
> sall $8, %edx
> xorl %esi, %edx
> movl %edx, %esi
> subl %edx, %eax
> subl %edx, %ecx
> shrl $13, %esi
> xorl %esi, %eax
> movl %ecx, %esi
> movl %eax, %ecx
> subl %eax, %esi
> subl %eax, %edx
> shrl $12, %ecx
> xorl %esi, %ecx
> movl %edx, %esi
> movl %ecx, %edx
> subl %ecx, %esi
> subl %ecx, %eax
> sall $16, %edx
> xorl %esi, %edx
> subl %edx, %eax
> subl %edx, %ecx
> movl %eax, %esi
> movl %edx, %eax
> shrl $5, %eax
> xorl %esi, %eax
> movl %eax, %esi
> subl %eax, %ecx
> subl %eax, %edx
> shrl $3, %esi
> xorl %esi, %ecx
> movl %ecx, %esi
> subl %ecx, %edx
> sall $10, %esi
> xorl %esi, %edx
> subl %ecx, %eax
> subl %edx, %eax
> shrl $15, %edx
> xorl %edx, %eax
> movl %eax, (%rdi)
> ret
> .L450:
> movl %edx, %edx
> movq (%rsi,%rdx,8), %rcx
> jmp .L452
> .cfi_endproc
> ------------------------------------------------------------------------
>
> That's a lot of extra code. :-) And by splitting the final element out,
> we lose the optimisation that avoids the sign_mask path in elt(), so
> that we end up with an if-then-else diamond that wasn't there before:
>
> // Hash the final element using sext_elt().
> addl $-1, %edx
> jc .L450
> // sign_mask
> movabsq $34359738360, %rdx
> movq (%rsi,%rdx), %rcx
> sarq $63, %rcx
> .L452:
> ...
> .L450:
> // normal read
> movl %edx, %edx
> movq (%rsi,%rdx,8), %rcx
> jmp .L452
>
> The branch should of course be predictable. And the branch around
> the elt() handling should be predictable if it really is the case
> that most wide ints don't need it. But it's still a lot of code
> that isn't marked cold. (And marking it cold would make the
> optimisation pointless, since the optimisation is targeting the
> case in which the code is executed.)
I see. The patch is OK as-is then.

Thanks,
Richard.

> Thanks,
> Richard
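
To make the encoding issue discussed above concrete, here is a small
standalone sketch (an illustration only, not the GCC patch itself;
sext_top is a hypothetical stand-in for a sext_hwi-style helper, and the
16-bit example value is made up). It shows why hashing the raw most
significant element of an unsigned value with the top bit set can differ
from hashing the canonical sign-extended form, and how sign-extending
that element first makes the two agree:

------------------------------------------------------------------------
#include <stdint.h>
#include <stdio.h>

// Sign-extend the low PREC bits of RAW (the usual shift-based idiom).
static int64_t
sext_top (int64_t raw, unsigned prec)
{
  if (prec >= 64)
    return raw;
  unsigned shift = 64 - prec;
  return (int64_t) ((uint64_t) raw << shift) >> shift;
}

int
main ()
{
  // A 16-bit unsigned constant with the top bit set, e.g. 0x8000.
  // A zero-extended encoding stores the element as 0x0000000000008000;
  // the canonical sign-extended form is 0xffffffffffff8000.
  int64_t zero_extended = 0x8000;
  int64_t sign_extended = sext_top (0x8000, 16);

  // Feeding the raw elements to a hash gives different inputs...
  printf ("raw:  %016llx vs %016llx\n",
          (unsigned long long) zero_extended,
          (unsigned long long) sign_extended);

  // ...but sign-extending the most significant element before hashing
  // makes both encodings contribute the same value, which is what
  // hashing the canonical sign-extended form achieves.
  printf ("sext: %016llx vs %016llx\n",
          (unsigned long long) sext_top (zero_extended, 16),
          (unsigned long long) sign_extended);
  return 0;
}
------------------------------------------------------------------------

For values wider than 64 bits only the most significant element needs
this fix-up, since the lower elements are always full 64-bit chunks;
that matches the observation in the thread that the vast majority of
constants need 64 bits or fewer of encoding anyway.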