Richard Sandiford <richard.sandif...@arm.com> writes:
> Richard Biener <richard.guent...@gmail.com> writes:
>> On Mon, Jul 29, 2019 at 11:11 AM Richard Sandiford
>> <richard.sandif...@arm.com> wrote:
>>>
>>> inchash::hash::add_wide_int operated directly on the raw encoding
>>> of the wide_int, including any redundant upper bits.  The problem
>>> with that is that the upper bits are only defined for some wide-int
>>> storage types (including wide_int itself).  wi::to_wide(tree) instead
>>> returns a value that is extended according to the signedness of the
>>> type (so that wi::to_widest can use the same encoding) while rtxes
>>> have the awkward special case of BI, which can be zero-extended
>>> rather than sign-extended.
>>>
>>> In the PR, we computed a hash for a "normal" sign-extended wide_int
>>> while the existing entries hashed wi::to_wide(tree).  This gives
>>> different results for unsigned types that have the top bit set.
>>>
>>> The patch fixes that by hashing the canonical sign-extended form even
>>> if the raw encoding happens to be different.
>>>
>>> Tested on aarch64-linux-gnu (with and without SVE), armeb-eabi
>>> and x86_64-linux-gnu.  OK to install?
>>
>> But only the most significant elt needs this treatment, no?  So
>> we can save some compile-time in the hasing by not doing
>> it for all elts.
>
> Yeah, we only need it for the most significant elt.  But the vast
> majority of constants need 64 bits or fewer of encoding anyway
> (even if the precision is greater than 64 bits), so that's usually the
> only one we need to process.  I'm not convinced handling multiple elts
> specially would pay off.

To go into a bit more detail, the code for:

void
f1 (inchash::hash &h, const wide_int &x)
{
  h.add_wide_int (x);
}

doesn't change with the patch, because is_sign_extended is true for
wide_int.  The code on x86_64 from a bootstrap compiler is:

------------------------------------------------------------------------
        .cfi_startproc
        // Hash the length.
        movl    24(%rsi), %edx
        movl    (%rdi), %ecx
        movl    $-1640531527, %r8d
        subl    %edx, %r8d
        subl    %ecx, %edx
        movl    %r8d, %eax
        movl    %ecx, %r8d
        subl    %ecx, %eax
        shrl    $13, %r8d
        xorl    %eax, %r8d
        movl    %edx, %eax
        movl    %r8d, %edx
        subl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $8, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r8d
        shrl    $13, %ecx
        xorl    %eax, %ecx
        movl    %r8d, %eax
        movl    %ecx, %r8d
        subl    %ecx, %eax
        subl    %ecx, %edx
        shrl    $12, %r8d
        xorl    %eax, %r8d
        movl    %edx, %eax
        movl    %r8d, %edx
        subl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $16, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r8d
        shrl    $5, %ecx
        xorl    %eax, %ecx
        movl    %ecx, %eax
        subl    %ecx, %r8d
        subl    %ecx, %edx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %edx, %eax
        movl    %r8d, %edx
        subl    %r8d, %eax
        sall    $10, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        subl    %r8d, %eax
        xorl    %r8d, %r8d
        subl    %edx, %eax
        shrl    $15, %edx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        // Hash the elements.
        movl    24(%rsi), %edx
        testl   %edx, %edx
        je      .L444
        .p2align 4,,10
        .p2align 3
.L446:
        movl    %r8d, %edx
        movq    (%rsi,%rdx,8), %r9
        movq    %r9, %rdx
        subl    %eax, %r9d
        sarq    $32, %rdx
        movl    %r9d, %ecx
        movl    %eax, %r9d
        subl    %edx, %ecx
        shrl    $13, %r9d
        subl    %eax, %edx
        xorl    %ecx, %r9d
        movl    %edx, %ecx
        movl    %r9d, %edx
        subl    %r9d, %ecx
        subl    %r9d, %eax
        sall    $8, %edx
        xorl    %ecx, %edx
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r9d
        shrl    $13, %ecx
        xorl    %ecx, %eax
        movl    %r9d, %ecx
        movl    %eax, %r9d
        subl    %eax, %ecx
        subl    %eax, %edx
        shrl    $12, %r9d
        xorl    %ecx, %r9d
        movl    %edx, %ecx
        movl    %r9d, %edx
        subl    %r9d, %ecx
        subl    %r9d, %eax
        sall    $16, %edx
        xorl    %ecx, %edx
        movl    %edx, %ecx
        subl    %edx, %eax
        subl    %edx, %r9d
        shrl    $5, %ecx
        xorl    %eax, %ecx
        movl    %ecx, %eax
        subl    %ecx, %r9d
        subl    %ecx, %edx
        shrl    $3, %eax
        xorl    %eax, %r9d
        movl    %edx, %eax
        movl    %r9d, %edx
        subl    %r9d, %eax
        sall    $10, %edx
        xorl    %eax, %edx
        movl    %ecx, %eax
        addl    $1, %r8d
        subl    %r9d, %eax
        subl    %edx, %eax
        shrl    $15, %edx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        cmpl    %r8d, 24(%rsi)
        ja      .L446
.L444:
        ret
------------------------------------------------------------------------

The code for:

void
f2 (inchash::hash &h, const wide_int_ref &x)
{
  h.add_wide_int (x);
}

does change, and becomes:

------------------------------------------------------------------------
        .cfi_startproc
        // Hash the length.
        movl    8(%rsi), %ecx
        movl    (%rdi), %edx
        movl    $-1640531527, %r8d
        subl    %ecx, %r8d
        subl    %edx, %ecx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        shrl    $13, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $8, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $13, %edx
        xorl    %eax, %edx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        subl    %edx, %ecx
        shrl    $12, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $16, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        // Hash the elements.
        movl    8(%rsi), %edx
        testl   %edx, %edx
        je      .L457
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movq    (%rsi), %r11
        xorl    %r9d, %r9d
        movl    $64, %r10d
        .p2align 4,,10
        .p2align 3
.L453:
        movl    %r9d, %edx
        movl    12(%rsi), %ecx
        movq    (%r11,%rdx,8), %r8
        movl    %r9d, %edx
        sall    $6, %edx
        movl    %ecx, %ebx
        subl    %edx, %ebx
        cmpl    $63, %ebx
        ja      .L452
        movl    %r10d, %ebx
        subl    %ecx, %ebx
        movl    %ebx, %ecx
        addl    %edx, %ecx
        salq    %cl, %r8
        sarq    %cl, %r8
.L452:
        movq    %r8, %rcx
        movl    %eax, %edx
        subl    %eax, %r8d
        sarq    $32, %rcx
        shrl    $13, %edx
        subl    %ecx, %r8d
        subl    %eax, %ecx
        xorl    %edx, %r8d
        movl    %r8d, %edx
        subl    %r8d, %ecx
        subl    %r8d, %eax
        sall    $8, %edx
        xorl    %edx, %ecx
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $13, %edx
        xorl    %edx, %eax
        movl    %eax, %edx
        subl    %eax, %r8d
        subl    %eax, %ecx
        shrl    $12, %edx
        xorl    %edx, %r8d
        movl    %r8d, %edx
        subl    %r8d, %ecx
        subl    %r8d, %eax
        sall    $16, %edx
        xorl    %edx, %ecx
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        addl    $1, %r9d
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        cmpl    %r9d, 8(%rsi)
        ja      .L453
        popq    %rbx
        .cfi_def_cfa_offset 8
        ret
------------------------------------------------------------------------

If I instead change the function to something like:

  /* Add wide_int-based value V.  */
  template<typename T>
  void add_wide_int (const generic_wide_int<T> &x)
  {
    add_int (x.get_len ());
    for (unsigned i = 1; i < x.get_len (); i++)
      add_hwi (x.elt (i - 1));
    add_hwi (x.sext_elt (x.get_len () - 1));
  }

then the function becomes much bigger for both f1 and f2.  The f1
version is:

------------------------------------------------------------------------
        .cfi_startproc
        // Hash the length.
        movl    24(%rsi), %ecx
        movl    (%rdi), %edx
        movl    $-1640531527, %r8d
        subl    %ecx, %r8d
        subl    %edx, %ecx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        shrl    $13, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $8, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $13, %edx
        xorl    %eax, %edx
        movl    %r8d, %eax
        movl    %edx, %r8d
        subl    %edx, %eax
        subl    %edx, %ecx
        shrl    $12, %r8d
        xorl    %eax, %r8d
        movl    %ecx, %eax
        movl    %r8d, %ecx
        subl    %r8d, %eax
        subl    %r8d, %edx
        sall    $16, %ecx
        xorl    %eax, %ecx
        movl    %edx, %eax
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        // Hash the elements using elt().
        movl    24(%rsi), %edx
        cmpl    $1, %edx
        jbe     .L445
        xorl    %r9d, %r9d
        jmp     .L448
        .p2align 4,,10
        .p2align 3
.L454:
        subl    $1, %edx
        movq    (%rsi,%rdx,8), %r8
        sarq    $63, %r8
.L447:
        movq    %r8, %rcx
        subl    %eax, %r8d
        sarq    $32, %rcx
        movl    %r8d, %edx
        movl    %eax, %r8d
        subl    %ecx, %edx
        shrl    $13, %r8d
        subl    %eax, %ecx
        xorl    %edx, %r8d
        movl    %ecx, %edx
        movl    %r8d, %ecx
        subl    %r8d, %edx
        subl    %r8d, %eax
        sall    $8, %ecx
        xorl    %edx, %ecx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        movl    %eax, %edx
        movl    %ecx, %eax
        shrl    $13, %eax
        xorl    %edx, %eax
        movl    %r8d, %edx
        movl    %eax, %r8d
        subl    %eax, %edx
        subl    %eax, %ecx
        shrl    $12, %r8d
        xorl    %edx, %r8d
        subl    %r8d, %ecx
        subl    %r8d, %eax
        movl    %ecx, %edx
        movl    %r8d, %ecx
        sall    $16, %ecx
        xorl    %edx, %ecx
        movl    %ecx, %edx
        subl    %ecx, %eax
        subl    %ecx, %r8d
        shrl    $5, %edx
        xorl    %eax, %edx
        movl    %edx, %eax
        subl    %edx, %r8d
        subl    %edx, %ecx
        shrl    $3, %eax
        xorl    %eax, %r8d
        movl    %r8d, %eax
        subl    %r8d, %ecx
        sall    $10, %eax
        xorl    %eax, %ecx
        subl    %r8d, %edx
        addq    $1, %r9
        subl    %ecx, %edx
        shrl    $15, %ecx
        movl    %ecx, %eax
        leal    1(%r9), %ecx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        movl    24(%rsi), %edx
        cmpl    %ecx, %edx
        jbe     .L445
.L448:
        cmpl    %r9d, %edx
        jbe     .L454
        movq    (%rsi,%r9,8), %r8
        jmp     .L447
        .p2align 4,,10
        .p2align 3
.L445:
        // Hash the final element using sext_elt().
        addl    $-1, %edx
        jc      .L450
        movabsq $34359738360, %rdx
        movq    (%rsi,%rdx), %rcx
        sarq    $63, %rcx
.L452:
        movq    %rcx, %rdx
        subl    %eax, %ecx
        sarq    $32, %rdx
        movl    %ecx, %esi
        movl    %eax, %ecx
        subl    %edx, %esi
        shrl    $13, %ecx
        subl    %eax, %edx
        xorl    %esi, %ecx
        movl    %edx, %esi
        movl    %ecx, %edx
        subl    %ecx, %esi
        subl    %ecx, %eax
        sall    $8, %edx
        xorl    %esi, %edx
        movl    %edx, %esi
        subl    %edx, %eax
        subl    %edx, %ecx
        shrl    $13, %esi
        xorl    %esi, %eax
        movl    %ecx, %esi
        movl    %eax, %ecx
        subl    %eax, %esi
        subl    %eax, %edx
        shrl    $12, %ecx
        xorl    %esi, %ecx
        movl    %edx, %esi
        movl    %ecx, %edx
        subl    %ecx, %esi
        subl    %ecx, %eax
        sall    $16, %edx
        xorl    %esi, %edx
        subl    %edx, %eax
        subl    %edx, %ecx
        movl    %eax, %esi
        movl    %edx, %eax
        shrl    $5, %eax
        xorl    %esi, %eax
        movl    %eax, %esi
        subl    %eax, %ecx
        subl    %eax, %edx
        shrl    $3, %esi
        xorl    %esi, %ecx
        movl    %ecx, %esi
        subl    %ecx, %edx
        sall    $10, %esi
        xorl    %esi, %edx
        subl    %ecx, %eax
        subl    %edx, %eax
        shrl    $15, %edx
        xorl    %edx, %eax
        movl    %eax, (%rdi)
        ret
.L450:
        movl    %edx, %edx
        movq    (%rsi,%rdx,8), %rcx
        jmp     .L452
        .cfi_endproc
------------------------------------------------------------------------

That's a lot of extra code. :-) And by splitting the final element out,
we lose the optimisation that avoids the sign_mask path in elt(), so
that we end up with an if-then-else diamond that wasn't there before:

        // Hash the final element using sext_elt().
        addl    $-1, %edx
        jc      .L450
        // sign_mask
        movabsq $34359738360, %rdx
        movq    (%rsi,%rdx), %rcx
        sarq    $63, %rcx
.L452:
        ...
.L450:
        // normal read
        movl    %edx, %edx
        movq    (%rsi,%rdx,8), %rcx
        jmp     .L452

The branch should of course be predictable.  And the branch around
the elt() handling should be predictable if it really is the case
that most wide ints don't need it.  But it's still a lot of code
that isn't marked cold.  (And marking it cold would make the
optimisation pointless, since the optimisation is targetting the
case in which the code is executed.)

Thanks,
Richard

Reply via email to