Richard Biener <rguent...@suse.de> writes:
> On Thu, 15 Jul 2021, Hongtao Liu wrote:
>
>> On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>> >
>> > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguent...@suse.de> wrote:
>> > >
>> > > The following extends the existing loop masking support using
>> > > SVE WHILE_ULT to x86 by providing an alternate way to produce the
>> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
>> > > you can now enable masked vectorized epilogues (=1) or fully
>> > > masked vector loops (=2).
>> > >
>> > > What's missing is using a scalar IV for the loop control
>> > > (but in principle AVX512 can use the mask here - just the patch
>> > > doesn't seem to work for AVX512 yet for some reason - likely
>> > > expand_vec_cond_expr_p doesn't work there).  What's also missing
>> > > is providing more support for predicated operations in the case
>> > > of reductions either via VEC_COND_EXPRs or via implementing
>> > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
>> > > to masked AVX512 operations.
>> > >
>> > > For AVX2 and
>> > >
>> > > int foo (unsigned *a, unsigned * __restrict b, int n)
>> > > {
>> > >   unsigned sum = 1;
>> > >   for (int i = 0; i < n; ++i)
>> > >     b[i] += a[i];
>> > >   return sum;
>> > > }
>> > >
>> > > we get
>> > >
>> > > .L3:
>> > > vpmaskmovd (%rsi,%rax), %ymm0, %ymm3
>> > > vpmaskmovd (%rdi,%rax), %ymm0, %ymm1
>> > > addl $8, %edx
>> > > vpaddd %ymm3, %ymm1, %ymm1
>> > > vpmaskmovd %ymm1, %ymm0, (%rsi,%rax)
>> > > vmovd %edx, %xmm1
>> > > vpsubd %ymm15, %ymm2, %ymm0
>> > > addq $32, %rax
>> > > vpbroadcastd %xmm1, %ymm1
>> > > vpaddd %ymm4, %ymm1, %ymm1
>> > > vpsubd %ymm15, %ymm1, %ymm1
>> > > vpcmpgtd %ymm1, %ymm0, %ymm0
>> > > vptest %ymm0, %ymm0
>> > > jne .L3
>> > >
>> > > for the fully masked loop body and for the masked epilogue
>> > > we see
>> > >
>> > > .L4:
>> > > vmovdqu (%rsi,%rax), %ymm3
>> > > vpaddd (%rdi,%rax), %ymm3, %ymm0
>> > > vmovdqu %ymm0, (%rsi,%rax)
>> > > addq $32, %rax
>> > > cmpq %rax, %rcx
>> > > jne .L4
>> > > movl %edx, %eax
>> > > andl $-8, %eax
>> > > testb $7, %dl
>> > > je .L11
>> > > .L3:
>> > > subl %eax, %edx
>> > > vmovdqa .LC0(%rip), %ymm1
>> > > salq $2, %rax
>> > > vmovd %edx, %xmm0
>> > > movl $-2147483648, %edx
>> > > addq %rax, %rsi
>> > > vmovd %edx, %xmm15
>> > > vpbroadcastd %xmm0, %ymm0
>> > > vpbroadcastd %xmm15, %ymm15
>> > > vpsubd %ymm15, %ymm1, %ymm1
>> > > vpsubd %ymm15, %ymm0, %ymm0
>> > > vpcmpgtd %ymm1, %ymm0, %ymm0
>> > > vpmaskmovd (%rsi), %ymm0, %ymm1
>> > > vpmaskmovd (%rdi,%rax), %ymm0, %ymm2
>> > > vpaddd %ymm2, %ymm1, %ymm1
>> > > vpmaskmovd %ymm1, %ymm0, (%rsi)
>> > > .L11:
>> > > vzeroupper
>> > >
>> > > compared to
>> > >
>> > > .L3:
>> > > movl %edx, %r8d
>> > > subl %eax, %r8d
>> > > leal -1(%r8), %r9d
>> > > cmpl $2, %r9d
>> > > jbe .L6
>> > > leaq (%rcx,%rax,4), %r9
>> > > vmovdqu (%rdi,%rax,4), %xmm2
>> > > movl %r8d, %eax
>> > > andl $-4, %eax
>> > > vpaddd (%r9), %xmm2, %xmm0
>> > > addl %eax, %esi
>> > > andl $3, %r8d
>> > > vmovdqu %xmm0, (%r9)
>> > > je .L2
>> > > .L6:
>> > > movslq %esi, %r8
>> > > leaq 0(,%r8,4), %rax
>> > > movl (%rdi,%r8,4), %r8d
>> > > addl %r8d, (%rcx,%rax)
>> > > leal 1(%rsi), %r8d
>> > > cmpl %r8d, %edx
>> > > jle .L2
>> > > addl $2, %esi
>> > > movl 4(%rdi,%rax), %r8d
>> > > addl %r8d, 4(%rcx,%rax)
>> > > cmpl %esi, %edx
>> > > jle .L2
>> > > movl 8(%rdi,%rax), %edx
>> > > addl %edx, 8(%rcx,%rax)
>> > > .L2:
>> > >
>> > > I'm giving this a little testing right now but will dig on why
>> > > I don't get masked loops when AVX512 is enabled.
>> >
>> > Ah, a simple thinko - rgroup_controls vectypes seem to be
>> > always VECTOR_BOOLEAN_TYPE_P and thus we can
>> > use expand_vec_cmp_expr_p.  The AVX512 fully masked
>> > loop then looks like
>> >
>> > .L3:
>> > vmovdqu32 (%rsi,%rax,4), %ymm2{%k1}
>> > vmovdqu32 (%rdi,%rax,4), %ymm1{%k1}
>> > vpaddd %ymm2, %ymm1, %ymm0
>> > vmovdqu32 %ymm0, (%rsi,%rax,4){%k1}
>> > addq $8, %rax
>> > vpbroadcastd %eax, %ymm0
>> > vpaddd %ymm4, %ymm0, %ymm0
>> > vpcmpud $6, %ymm0, %ymm3, %k1
>> > kortestb %k1, %k1
>> > jne .L3
>> >
>> > I guess for x86 it's not worth preserving the VEC_COND_EXPR
>> > mask generation but other archs may not provide all required vec_cmp
>> > expanders.
>>
>> For the main loop, the full-masked loop's codegen seems much worse.
>> Basically, we need at least 4 instructions to do what while_ult in arm does.
>>
>> vpbroadcastd %eax, %ymm0
>> vpaddd %ymm4, %ymm0, %ymm0
>> vpcmpud $6, %ymm0, %ymm3, %k1
>> kortestb %k1, %k1
>> vs
>> whilelo(or some other while<op>)
>>
>> more instructions are needed for avx2 since there's no direct
>> instruction for .COND_{ADD,SUB..}
>>
>> original
>> .L4:
>> vmovdqu (%rcx,%rax), %ymm1
>> vpaddd (%rdi,%rax), %ymm1, %ymm0
>> vmovdqu %ymm0, (%rcx,%rax)
>> addq $32, %rax
>> cmpq %rax, %rsi
>> jne .L4
>>
>> vs
>> avx512 full-masked loop
>> .L3:
>> vmovdqu32 (%rsi,%rax,4), %ymm2{%k1}
>> vmovdqu32 (%rdi,%rax,4), %ymm1{%k1}
>> vpaddd %ymm2, %ymm1, %ymm0
>> vmovdqu32 %ymm0, (%rsi,%rax,4){%k1}
>> addq $8, %rax
>> vpbroadcastd %eax, %ymm0
>> vpaddd %ymm4, %ymm0, %ymm0
>> vpcmpud $6, %ymm0, %ymm3, %k1
>> kortestb %k1, %k1
>> jne .L3
>>
>> vs
>> avx2 full-masked loop
>> .L3:
>> vpmaskmovd (%rsi,%rax), %ymm0, %ymm3
>> vpmaskmovd (%rdi,%rax), %ymm0, %ymm1
>> addl $8, %edx
>> vpaddd %ymm3, %ymm1, %ymm1
>> vpmaskmovd %ymm1, %ymm0, (%rsi,%rax)
>> vmovd %edx, %xmm1
>> vpsubd %ymm15, %ymm2, %ymm0
>> addq $32, %rax
>> vpbroadcastd %xmm1, %ymm1
>> vpaddd %ymm4, %ymm1, %ymm1
>> vpsubd %ymm15, %ymm1, %ymm1
>> vpcmpgtd %ymm1, %ymm0, %ymm0
>> vptest %ymm0, %ymm0
>> jne .L3
>>
>> vs arm64's code
>>
>> .L3:
>> ld1w z1.s, p0/z, [x1, x3, lsl 2]
>> ld1w z0.s, p0/z, [x0, x3, lsl 2]
>> add z0.s, z0.s, z1.s
>> st1w z0.s, p0, [x1, x3, lsl 2]
>> add x3, x3, x4
>> whilelo p0.s, w3, w2
>> b.any .L3
>
> Yes, that's true - it might still be OK for vectorizing epilogues
> and thus --param vect-partial-vector-usage=1
>
> Can AVX512 do any better than this?
>
> vpbroadcastd %eax, %ymm0
> vpaddd %ymm4, %ymm0, %ymm0
> vpcmpud $6, %ymm0, %ymm3, %k1
>
> Note with multiple types involved things get even worse
> since you need masks for each vector mode.  But as far as
> I can see that's the same for SVE (but there we have the
> single-instruction whilelo).  I guess we'll also generate
> wrong code at the moment for the case where we need
> multiple vectors to hold the full mask - vect_gen_while
> doesn't seem to be prepared for this?
>
> So with
>
> int foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     {
>       b[i] += a[i];
>     }
>   return sum;
> }
>
> SVE uses
>
> .L3:
> ld1d z0.d, p0/z, [x1, x3, lsl 3]
> ld1d z1.d, p0/z, [x0, x3, lsl 3]
> adr z0.d, [z0.d, z1.d, lsl 2]
> st1d z0.d, p0, [x1, x3, lsl 3]
> add x3, x3, x4
> whilelo p0.d, w3, w2
> b.any .L3
>
> so p0 vs. p0/z, whatever that means and it looks like
> the vector add can somehow concatenate z0.d and z1.d.
> Truly fascinating ;)
It looks like this is from a version in which "b" was also long.
For the loop above I get:

.L3:
        ld1d z0.d, p0/z, [x0, x3, lsl 3]
        ld1w z1.d, p0/z, [x1, x3, lsl 2]
        add z0.s, z0.s, z1.s
        st1w z0.d, p0, [x1, x3, lsl 2]
        add x3, x3, x4
        whilelo p0.d, w3, w2
        b.any .L3

where the ld1w is an extending load and the st1w is a truncating store.

But yeah, there's a hard-coded assumption that a mask for 2 SIs
can also control 1 DI, etc.  For example:

void foo (unsigned long *a, unsigned * __restrict b, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 2] += 1;
      a[i * 2 + 1] += 2;
      b[i * 4] += 1;
      b[i * 4 + 1] += 2;
      b[i * 4 + 2] += 3;
      b[i * 4 + 3] += 4;
    }
}

becomes:

.L3:
        ld1d z0.d, p0/z, [x0]
        add z0.d, z0.d, z2.d
        st1d z0.d, p0, [x0]
        ld1w z0.s, p0/z, [x1, x3, lsl 2]
        add z0.s, z0.s, z1.s
        st1w z0.s, p0, [x1, x3, lsl 2]
        add x0, x0, x4
        add x3, x3, x5
        whilelo p0.s, x3, x2
        b.any .L3

This is explained more in the big comment above rgroup_controls
(guess you've already seen it).

> It looks like --param vect_partial_vector_usage defaults to 2,
> power forces it to 1 (power10) or 0 otherwise.
>
> I think we'd need a target hook that toggles this per mode
> so we could tune this dependent on AVX512 vectorization vs. AVX2.

Yeah, keying it off the architecture made sense for Power because
the only purpose of len_load/store is to support partial vectorisation.
But I agree a hook is needed now that we have MASK_LOAD/STORE for
general use and are emulating WHILE_ULT.

Thanks,
Richard
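
As a footnote, here is a rough intrinsics-level sketch of the AVX2 mask
generation discussed above (the vpsubd/vpcmpgtd/vpmaskmovd sequence): the
per-lane indices are biased by INT_MIN so that the signed vpcmpgtd acts as
the missing unsigned compare.  This is illustrative only - the function
name and layout are invented here, and the patch itself produces the mask
in the vectorizer's IL via VEC_COND_EXPR/vec_cmp rather than intrinsics.
It assumes -mavx2 and, unlike the generated code quoted above, uses a
scalar loop counter.

#include <immintrin.h>
#include <limits.h>

/* b[i] += a[i] for 0 <= i < n, fully masked, no scalar epilogue.  */
void
masked_add (unsigned int *a, unsigned int *__restrict b, int n)
{
  const __m256i lanes = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
  const __m256i bias = _mm256_set1_epi32 (INT_MIN);
  /* n with the sign bit flipped, so the signed compare below acts
     as an unsigned one.  */
  const __m256i limit = _mm256_xor_si256 (_mm256_set1_epi32 (n), bias);

  for (int i = 0; i < n; i += 8)
    {
      /* Per-lane indices {i, i+1, ..., i+7}, biased the same way.  */
      __m256i idx = _mm256_add_epi32 (_mm256_set1_epi32 (i), lanes);
      idx = _mm256_xor_si256 (idx, bias);
      /* A lane is all-ones iff its index is < n (unsigned).  */
      __m256i mask = _mm256_cmpgt_epi32 (limit, idx);

      /* Masked-out lanes are neither loaded nor stored, so the final,
         partial iteration needs no separate epilogue.  */
      __m256i va = _mm256_maskload_epi32 ((const int *) (a + i), mask);
      __m256i vb = _mm256_maskload_epi32 ((const int *) (b + i), mask);
      _mm256_maskstore_epi32 ((int *) (b + i), mask,
                              _mm256_add_epi32 (va, vb));
    }
}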