Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> This adds an RTL pattern for when two NARROWB instructions are being combined
> with a PACK.  The second NARROWB is then transformed into a NARROWT.
>
> For the example:
>
> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> {
>   for (int i = 0; i < (n & -16); i+=1)
>     pixel[i] += (pixel[i] * level) / 0xff;
> }
>
> we generate:
>
>         addhnb  z6.b, z0.h, z4.h
>         addhnb  z5.b, z1.h, z4.h
>         addhnb  z0.b, z0.h, z6.h
>         addhnt  z0.b, z1.h, z5.h
>         add     z0.b, z0.b, z2.b
>
> instead of:
>
>         addhnb  z6.b, z1.h, z4.h
>         addhnb  z5.b, z0.h, z4.h
>         addhnb  z1.b, z1.h, z6.h
>         addhnb  z0.b, z0.h, z5.h
>         uzp1    z0.b, z0.b, z1.b
>         add     z0.b, z0.b, z2.b
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-sve2.md (*aarch64_sve_pack_<sve_int_op><mode>):
>       New.
>       * config/aarch64/iterators.md (binary_top): New.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.dg/vect/vect-div-bitmask-4.c: New test.
>       * gcc.target/aarch64/sve2/div-by-bitmask_2.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98ad3c41e5d05d8cf38 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1600,6 +1600,25 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
>    "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
>  )
>  
> +(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
> +  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
> +     (unspec:<VNARROW>
> +       [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")

"0" would be safer, in case the instruction is only split after RA.

> +        (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
> +          [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> +           (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
> +          SVE2_INT_BINARY_NARROWB) 0)]
> +       UNSPEC_PACK))]

I think ideally this would be the canonical pattern, so that we can
drop the separate top unspecs.  That's more work though, and would
probably make sense to do once we have a generic way of representing
the pack.

So OK with the "0" change above.

Thanks,
Richard

> +  "TARGET_SVE2"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +{
> +  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1], <MODE>mode);
> +  emit_insn (gen_aarch64_sve (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
> +                           operands[0], tmp, operands[2], operands[3]));
> +})
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT] Narrowing right shifts
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82afa67613a087389d4 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
>  
>  (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")])
>  
> +(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
> +                          (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
> +                          (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
> +                          (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
> +
>  (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
>                            (UNSPEC_ADCLT "adclt")
>                            (UNSPEC_ADDHNB "addhnb")
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15c82dd9726897cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdbced36c8038db1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +**   addhnb  z6.b, z0.h, z4.h
> +**   addhnb  z5.b, z1.h, z4.h
> +**   addhnb  z0.b, z0.h, z6.h
> +**   addhnt  z0.b, z1.h, z5.h
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xff;
> +}
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xfe;
> +}
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +**   addhnb  z6.h, z0.s, z4.s
> +**   addhnb  z5.h, z1.s, z4.s
> +**   addhnb  z0.h, z0.s, z6.s
> +**   addhnt  z0.h, z1.s, z5.s
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xffffU;
> +}
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +**   addhnb  z6.s, z0.d, z4.d
> +**   addhnb  z5.s, z1.d, z4.d
> +**   addhnb  z0.s, z0.d, z6.d
> +**   addhnt  z0.s, z1.d, z5.d
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}

Reply via email to