On Mon, Jul 29, 2019 at 3:06 PM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Richard Sandiford <richard.sandif...@arm.com> writes:
> > Richard Biener <richard.guent...@gmail.com> writes:
> >> On Mon, Jul 29, 2019 at 11:11 AM Richard Sandiford
> >> <richard.sandif...@arm.com> wrote:
> >>>
> >>> inchash::hash::add_wide_int operated directly on the raw encoding
> >>> of the wide_int, including any redundant upper bits.  The problem
> >>> with that is that the upper bits are only defined for some wide-int
> >>> storage types (including wide_int itself).  wi::to_wide(tree) instead
> >>> returns a value that is extended according to the signedness of the
> >>> type (so that wi::to_widest can use the same encoding) while rtxes
> >>> have the awkward special case of BI, which can be zero-extended
> >>> rather than sign-extended.
> >>>
> >>> In the PR, we computed a hash for a "normal" sign-extended wide_int
> >>> while the existing entries hashed wi::to_wide(tree).  This gives
> >>> different results for unsigned types that have the top bit set.
> >>>
> >>> The patch fixes that by hashing the canonical sign-extended form even
> >>> if the raw encoding happens to be different.
> >>>
> >>> Tested on aarch64-linux-gnu (with and without SVE), armeb-eabi
> >>> and x86_64-linux-gnu.  OK to install?
> >>
> >> But only the most significant elt needs this treatment, no?  So
> >> we can save some compile-time in the hasing by not doing
> >> it for all elts.
> >
> > Yeah, we only need it for the most significant elt.  But the vast
> > majority of constants need 64 bits or fewer of encoding anyway
> > (even if the precision is greater than 64 bits), so that's usually the
> > only one we need to process.  I'm not convinced handling multiple elts
> > specially would pay off.
>
> To go into a bit more detail, the code for:
>
> void
> f1 (inchash::hash &h, const wide_int &x)
> {
>   h.add_wide_int (x);
> }
>
> doesn't change with the patch, because is_sign_extended is true for
> wide_int.  The code on x86_64 from a bootstrap compiler is:
>
> ------------------------------------------------------------------------
>         .cfi_startproc
>         // Hash the length.
>         movl    24(%rsi), %edx
>         movl    (%rdi), %ecx
>         movl    $-1640531527, %r8d
>         subl    %edx, %r8d
>         subl    %ecx, %edx
>         movl    %r8d, %eax
>         movl    %ecx, %r8d
>         subl    %ecx, %eax
>         shrl    $13, %r8d
>         xorl    %eax, %r8d
>         movl    %edx, %eax
>         movl    %r8d, %edx
>         subl    %r8d, %eax
>         subl    %r8d, %ecx
>         sall    $8, %edx
>         xorl    %eax, %edx
>         movl    %ecx, %eax
>         movl    %edx, %ecx
>         subl    %edx, %eax
>         subl    %edx, %r8d
>         shrl    $13, %ecx
>         xorl    %eax, %ecx
>         movl    %r8d, %eax
>         movl    %ecx, %r8d
>         subl    %ecx, %eax
>         subl    %ecx, %edx
>         shrl    $12, %r8d
>         xorl    %eax, %r8d
>         movl    %edx, %eax
>         movl    %r8d, %edx
>         subl    %r8d, %eax
>         subl    %r8d, %ecx
>         sall    $16, %edx
>         xorl    %eax, %edx
>         movl    %ecx, %eax
>         movl    %edx, %ecx
>         subl    %edx, %eax
>         subl    %edx, %r8d
>         shrl    $5, %ecx
>         xorl    %eax, %ecx
>         movl    %ecx, %eax
>         subl    %ecx, %r8d
>         subl    %ecx, %edx
>         shrl    $3, %eax
>         xorl    %eax, %r8d
>         movl    %edx, %eax
>         movl    %r8d, %edx
>         subl    %r8d, %eax
>         sall    $10, %edx
>         xorl    %eax, %edx
>         movl    %ecx, %eax
>         subl    %r8d, %eax
>         xorl    %r8d, %r8d
>         subl    %edx, %eax
>         shrl    $15, %edx
>         xorl    %edx, %eax
>         movl    %eax, (%rdi)
>         // Hash the elements.
>         movl    24(%rsi), %edx
>         testl   %edx, %edx
>         je      .L444
>         .p2align 4,,10
>         .p2align 3
> .L446:
>         movl    %r8d, %edx
>         movq    (%rsi,%rdx,8), %r9
>         movq    %r9, %rdx
>         subl    %eax, %r9d
>         sarq    $32, %rdx
>         movl    %r9d, %ecx
>         movl    %eax, %r9d
>         subl    %edx, %ecx
>         shrl    $13, %r9d
>         subl    %eax, %edx
>         xorl    %ecx, %r9d
>         movl    %edx, %ecx
>         movl    %r9d, %edx
>         subl    %r9d, %ecx
>         subl    %r9d, %eax
>         sall    $8, %edx
>         xorl    %ecx, %edx
>         movl    %edx, %ecx
>         subl    %edx, %eax
>         subl    %edx, %r9d
>         shrl    $13, %ecx
>         xorl    %ecx, %eax
>         movl    %r9d, %ecx
>         movl    %eax, %r9d
>         subl    %eax, %ecx
>         subl    %eax, %edx
>         shrl    $12, %r9d
>         xorl    %ecx, %r9d
>         movl    %edx, %ecx
>         movl    %r9d, %edx
>         subl    %r9d, %ecx
>         subl    %r9d, %eax
>         sall    $16, %edx
>         xorl    %ecx, %edx
>         movl    %edx, %ecx
>         subl    %edx, %eax
>         subl    %edx, %r9d
>         shrl    $5, %ecx
>         xorl    %eax, %ecx
>         movl    %ecx, %eax
>         subl    %ecx, %r9d
>         subl    %ecx, %edx
>         shrl    $3, %eax
>         xorl    %eax, %r9d
>         movl    %edx, %eax
>         movl    %r9d, %edx
>         subl    %r9d, %eax
>         sall    $10, %edx
>         xorl    %eax, %edx
>         movl    %ecx, %eax
>         addl    $1, %r8d
>         subl    %r9d, %eax
>         subl    %edx, %eax
>         shrl    $15, %edx
>         xorl    %edx, %eax
>         movl    %eax, (%rdi)
>         cmpl    %r8d, 24(%rsi)
>         ja      .L446
> .L444:
>         ret
> ------------------------------------------------------------------------
>
> The code for:
>
> void
> f2 (inchash::hash &h, const wide_int_ref &x)
> {
>   h.add_wide_int (x);
> }
>
> does change, and becomes:
>
> ------------------------------------------------------------------------
>         .cfi_startproc
>         // Hash the length.
>         movl    8(%rsi), %ecx
>         movl    (%rdi), %edx
>         movl    $-1640531527, %r8d
>         subl    %ecx, %r8d
>         subl    %edx, %ecx
>         movl    %r8d, %eax
>         movl    %edx, %r8d
>         subl    %edx, %eax
>         shrl    $13, %r8d
>         xorl    %eax, %r8d
>         movl    %ecx, %eax
>         movl    %r8d, %ecx
>         subl    %r8d, %eax
>         subl    %r8d, %edx
>         sall    $8, %ecx
>         xorl    %eax, %ecx
>         movl    %edx, %eax
>         movl    %ecx, %edx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         shrl    $13, %edx
>         xorl    %eax, %edx
>         movl    %r8d, %eax
>         movl    %edx, %r8d
>         subl    %edx, %eax
>         subl    %edx, %ecx
>         shrl    $12, %r8d
>         xorl    %eax, %r8d
>         movl    %ecx, %eax
>         movl    %r8d, %ecx
>         subl    %r8d, %eax
>         subl    %r8d, %edx
>         sall    $16, %ecx
>         xorl    %eax, %ecx
>         movl    %edx, %eax
>         movl    %ecx, %edx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         shrl    $5, %edx
>         xorl    %eax, %edx
>         movl    %edx, %eax
>         subl    %edx, %r8d
>         subl    %edx, %ecx
>         shrl    $3, %eax
>         xorl    %eax, %r8d
>         movl    %r8d, %eax
>         subl    %r8d, %ecx
>         sall    $10, %eax
>         xorl    %eax, %ecx
>         subl    %r8d, %edx
>         subl    %ecx, %edx
>         shrl    $15, %ecx
>         movl    %ecx, %eax
>         xorl    %edx, %eax
>         movl    %eax, (%rdi)
>         // Hash the elements.
>         movl    8(%rsi), %edx
>         testl   %edx, %edx
>         je      .L457
>         pushq   %rbx
>         .cfi_def_cfa_offset 16
>         .cfi_offset 3, -16
>         movq    (%rsi), %r11
>         xorl    %r9d, %r9d
>         movl    $64, %r10d
>         .p2align 4,,10
>         .p2align 3
> .L453:
>         movl    %r9d, %edx
>         movl    12(%rsi), %ecx
>         movq    (%r11,%rdx,8), %r8
>         movl    %r9d, %edx
>         sall    $6, %edx
>         movl    %ecx, %ebx
>         subl    %edx, %ebx
>         cmpl    $63, %ebx
>         ja      .L452
>         movl    %r10d, %ebx
>         subl    %ecx, %ebx
>         movl    %ebx, %ecx
>         addl    %edx, %ecx
>         salq    %cl, %r8
>         sarq    %cl, %r8
> .L452:
>         movq    %r8, %rcx
>         movl    %eax, %edx
>         subl    %eax, %r8d
>         sarq    $32, %rcx
>         shrl    $13, %edx
>         subl    %ecx, %r8d
>         subl    %eax, %ecx
>         xorl    %edx, %r8d
>         movl    %r8d, %edx
>         subl    %r8d, %ecx
>         subl    %r8d, %eax
>         sall    $8, %edx
>         xorl    %edx, %ecx
>         movl    %ecx, %edx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         shrl    $13, %edx
>         xorl    %edx, %eax
>         movl    %eax, %edx
>         subl    %eax, %r8d
>         subl    %eax, %ecx
>         shrl    $12, %edx
>         xorl    %edx, %r8d
>         movl    %r8d, %edx
>         subl    %r8d, %ecx
>         subl    %r8d, %eax
>         sall    $16, %edx
>         xorl    %edx, %ecx
>         movl    %ecx, %edx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         shrl    $5, %edx
>         xorl    %eax, %edx
>         movl    %edx, %eax
>         subl    %edx, %r8d
>         subl    %edx, %ecx
>         shrl    $3, %eax
>         xorl    %eax, %r8d
>         movl    %r8d, %eax
>         subl    %r8d, %ecx
>         sall    $10, %eax
>         xorl    %eax, %ecx
>         subl    %r8d, %edx
>         addl    $1, %r9d
>         subl    %ecx, %edx
>         shrl    $15, %ecx
>         movl    %ecx, %eax
>         xorl    %edx, %eax
>         movl    %eax, (%rdi)
>         cmpl    %r9d, 8(%rsi)
>         ja      .L453
>         popq    %rbx
>         .cfi_def_cfa_offset 8
>         ret
> ------------------------------------------------------------------------
>
> If I instead change the function to something like:
>
>   /* Add wide_int-based value V.  */
>   template<typename T>
>   void add_wide_int (const generic_wide_int<T> &x)
>   {
>     add_int (x.get_len ());
>     for (unsigned i = 1; i < x.get_len (); i++)
>       add_hwi (x.elt (i - 1));
>     add_hwi (x.sext_elt (x.get_len () - 1));
>   }
>
> then the function becomes much bigger for both f1 and f2.  The f1
> version is:
>
> ------------------------------------------------------------------------
>         .cfi_startproc
>         // Hash the length.
>         movl    24(%rsi), %ecx
>         movl    (%rdi), %edx
>         movl    $-1640531527, %r8d
>         subl    %ecx, %r8d
>         subl    %edx, %ecx
>         movl    %r8d, %eax
>         movl    %edx, %r8d
>         subl    %edx, %eax
>         shrl    $13, %r8d
>         xorl    %eax, %r8d
>         movl    %ecx, %eax
>         movl    %r8d, %ecx
>         subl    %r8d, %eax
>         subl    %r8d, %edx
>         sall    $8, %ecx
>         xorl    %eax, %ecx
>         movl    %edx, %eax
>         movl    %ecx, %edx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         shrl    $13, %edx
>         xorl    %eax, %edx
>         movl    %r8d, %eax
>         movl    %edx, %r8d
>         subl    %edx, %eax
>         subl    %edx, %ecx
>         shrl    $12, %r8d
>         xorl    %eax, %r8d
>         movl    %ecx, %eax
>         movl    %r8d, %ecx
>         subl    %r8d, %eax
>         subl    %r8d, %edx
>         sall    $16, %ecx
>         xorl    %eax, %ecx
>         movl    %edx, %eax
>         movl    %ecx, %edx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         shrl    $5, %edx
>         xorl    %eax, %edx
>         movl    %edx, %eax
>         subl    %edx, %r8d
>         subl    %edx, %ecx
>         shrl    $3, %eax
>         xorl    %eax, %r8d
>         movl    %r8d, %eax
>         subl    %r8d, %ecx
>         sall    $10, %eax
>         xorl    %eax, %ecx
>         subl    %r8d, %edx
>         subl    %ecx, %edx
>         shrl    $15, %ecx
>         movl    %ecx, %eax
>         xorl    %edx, %eax
>         movl    %eax, (%rdi)
>         // Hash the elements using elt().
>         movl    24(%rsi), %edx
>         cmpl    $1, %edx
>         jbe     .L445
>         xorl    %r9d, %r9d
>         jmp     .L448
>         .p2align 4,,10
>         .p2align 3
> .L454:
>         subl    $1, %edx
>         movq    (%rsi,%rdx,8), %r8
>         sarq    $63, %r8
> .L447:
>         movq    %r8, %rcx
>         subl    %eax, %r8d
>         sarq    $32, %rcx
>         movl    %r8d, %edx
>         movl    %eax, %r8d
>         subl    %ecx, %edx
>         shrl    $13, %r8d
>         subl    %eax, %ecx
>         xorl    %edx, %r8d
>         movl    %ecx, %edx
>         movl    %r8d, %ecx
>         subl    %r8d, %edx
>         subl    %r8d, %eax
>         sall    $8, %ecx
>         xorl    %edx, %ecx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         movl    %eax, %edx
>         movl    %ecx, %eax
>         shrl    $13, %eax
>         xorl    %edx, %eax
>         movl    %r8d, %edx
>         movl    %eax, %r8d
>         subl    %eax, %edx
>         subl    %eax, %ecx
>         shrl    $12, %r8d
>         xorl    %edx, %r8d
>         subl    %r8d, %ecx
>         subl    %r8d, %eax
>         movl    %ecx, %edx
>         movl    %r8d, %ecx
>         sall    $16, %ecx
>         xorl    %edx, %ecx
>         movl    %ecx, %edx
>         subl    %ecx, %eax
>         subl    %ecx, %r8d
>         shrl    $5, %edx
>         xorl    %eax, %edx
>         movl    %edx, %eax
>         subl    %edx, %r8d
>         subl    %edx, %ecx
>         shrl    $3, %eax
>         xorl    %eax, %r8d
>         movl    %r8d, %eax
>         subl    %r8d, %ecx
>         sall    $10, %eax
>         xorl    %eax, %ecx
>         subl    %r8d, %edx
>         addq    $1, %r9
>         subl    %ecx, %edx
>         shrl    $15, %ecx
>         movl    %ecx, %eax
>         leal    1(%r9), %ecx
>         xorl    %edx, %eax
>         movl    %eax, (%rdi)
>         movl    24(%rsi), %edx
>         cmpl    %ecx, %edx
>         jbe     .L445
> .L448:
>         cmpl    %r9d, %edx
>         jbe     .L454
>         movq    (%rsi,%r9,8), %r8
>         jmp     .L447
>         .p2align 4,,10
>         .p2align 3
> .L445:
>         // Hash the final element using sext_elt().
>         addl    $-1, %edx
>         jc      .L450
>         movabsq $34359738360, %rdx
>         movq    (%rsi,%rdx), %rcx
>         sarq    $63, %rcx
> .L452:
>         movq    %rcx, %rdx
>         subl    %eax, %ecx
>         sarq    $32, %rdx
>         movl    %ecx, %esi
>         movl    %eax, %ecx
>         subl    %edx, %esi
>         shrl    $13, %ecx
>         subl    %eax, %edx
>         xorl    %esi, %ecx
>         movl    %edx, %esi
>         movl    %ecx, %edx
>         subl    %ecx, %esi
>         subl    %ecx, %eax
>         sall    $8, %edx
>         xorl    %esi, %edx
>         movl    %edx, %esi
>         subl    %edx, %eax
>         subl    %edx, %ecx
>         shrl    $13, %esi
>         xorl    %esi, %eax
>         movl    %ecx, %esi
>         movl    %eax, %ecx
>         subl    %eax, %esi
>         subl    %eax, %edx
>         shrl    $12, %ecx
>         xorl    %esi, %ecx
>         movl    %edx, %esi
>         movl    %ecx, %edx
>         subl    %ecx, %esi
>         subl    %ecx, %eax
>         sall    $16, %edx
>         xorl    %esi, %edx
>         subl    %edx, %eax
>         subl    %edx, %ecx
>         movl    %eax, %esi
>         movl    %edx, %eax
>         shrl    $5, %eax
>         xorl    %esi, %eax
>         movl    %eax, %esi
>         subl    %eax, %ecx
>         subl    %eax, %edx
>         shrl    $3, %esi
>         xorl    %esi, %ecx
>         movl    %ecx, %esi
>         subl    %ecx, %edx
>         sall    $10, %esi
>         xorl    %esi, %edx
>         subl    %ecx, %eax
>         subl    %edx, %eax
>         shrl    $15, %edx
>         xorl    %edx, %eax
>         movl    %eax, (%rdi)
>         ret
> .L450:
>         movl    %edx, %edx
>         movq    (%rsi,%rdx,8), %rcx
>         jmp     .L452
>         .cfi_endproc
> ------------------------------------------------------------------------
>
> That's a lot of extra code. :-) And by splitting the final element out,
> we lose the optimisation that avoids the sign_mask path in elt(), so
> that we end up with an if-then-else diamond that wasn't there before:
>
>         // Hash the final element using sext_elt().
>         addl    $-1, %edx
>         jc      .L450
>         // sign_mask
>         movabsq $34359738360, %rdx
>         movq    (%rsi,%rdx), %rcx
>         sarq    $63, %rcx
> .L452:
>         ...
> .L450:
>         // normal read
>         movl    %edx, %edx
>         movq    (%rsi,%rdx,8), %rcx
>         jmp     .L452
>
> The branch should of course be predictable.  And the branch around
> the elt() handling should be predictable if it really is the case
> that most wide ints don't need it.  But it's still a lot of code
> that isn't marked cold.  (And marking it cold would make the
> optimisation pointless, since the optimisation is targetting the
> case in which the code is executed.)

I see.  The patch is OK as-is then.

Thanks,
Richard.

> Thanks,
> Richard

Reply via email to