Jennifer Schmitz <jschm...@nvidia.com> writes:
> This patch implements the optabs reduc_and_scal_<mode>,
> reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
> V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise logical
> vector reduction operations.
> Previously, either only vector registers or only general-purpose registers
> (GPRs) were used. Now, vector registers are used for the reduction from
> 128 to 64 bits; 64-bit GPRs are used for the reduction from 64 to 32 bits;
> and 32-bit GPRs are used for the remaining reduction steps.
>
> For example, the test case (V8HI)
> int16_t foo (int16_t *a)
> {
>   int16_t b = -1;
>   for (int i = 0; i < 8; ++i)
>     b &= a[i];
>   return b;
> }
>
> was previously compiled to (-O2):
> foo:
>       ldr     q0, [x0]
>       movi    v30.4s, 0
>       ext     v29.16b, v0.16b, v30.16b, #8
>       and     v29.16b, v29.16b, v0.16b
>       ext     v31.16b, v29.16b, v30.16b, #4
>       and     v31.16b, v31.16b, v29.16b
>       ext     v30.16b, v31.16b, v30.16b, #2
>       and     v30.16b, v30.16b, v31.16b
>       umov    w0, v30.h[0]
>       ret
>
> With the patch, it is compiled to:
> foo:
>       ldr     q31, [x0]
>       ext     v30.16b, v31.16b, v31.16b, #8
>       and     v31.8b, v30.8b, v31.8b
>       fmov    x0, d31
>       and     x0, x0, x0, lsr 32
>       and     w0, w0, w0, lsr 16
>       ret
>
> For modes V4SI and V2DI, the pattern was not implemented, because the
> current codegen (using only base instructions) is already efficient.
>
> Note that the PR initially suggested using SVE reduction ops. However,
> they have higher latency than the proposed sequence, which is why using
> Neon and base instructions is preferable.
>
> Test cases were added for 8/16-bit integers for all implemented modes and all
> three operations to check the produced assembly.
>
> We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> because for aarch64 vector types, either the logical reduction optabs are
> implemented or the codegen for reduction operations is good as it is.
> This was motivated by the failure of a scan-tree-dump directive in the test cases
> gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
>
> gcc/
>       PR target/113816
>       * config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
>       Implement for logical bitwise operations for VDQV_E.
>
> gcc/testsuite/
>       PR target/113816
>       * lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
>       * gcc.target/aarch64/simd/logical_reduc.c: New test.
>       * gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> ---
>  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>  gcc/testsuite/lib/target-supports.exp         |   4 +-
>  4 files changed, 267 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 23c03a96371..00286b8b020 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3608,6 +3608,61 @@
>    }
>  )
>  
> +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
> +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
> +;; on the two halves of the input.
> +;; If the input has 128 bits, the first operation is performed in vector
> +;; registers.  From 64 bits down, the reduction steps are performed in general
> +;; purpose registers.
> +;; For example, for V8HI and operation AND, the intended sequence is:
> +;; EXT      v1.16b, v0.16b, v0.16b, #8
> +;; AND      v0.8b, v1.8b, v0.8b
> +;; FMOV     x0, d0
> +;; AND      x0, x0, x0, lsr 32
> +;; AND      w0, w0, w0, lsr 16
> +;;
> +;; For V8QI and operation AND, the sequence is:
> +;; AND      x0, x0, x0, lsr 32
> +;; AND      w0, w0, w0, lsr 16
> +;; AND      w0, w0, w0, lsr 8
> +
> +(define_expand "reduc_<optab>_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
> +  "TARGET_SIMD"
> +  {
> +    rtx dst = operands[1];
> +    rtx tdi = gen_reg_rtx (DImode);
> +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
> +    rtx op1_lo;
> +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
> +      {
> +     rtx t0 = gen_reg_rtx (<MODE>mode);
> +     rtx t1 = gen_reg_rtx (DImode);
> +     rtx t2 = gen_reg_rtx (DImode);
> +     rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
> +     emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
> +     op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
> +     rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
> +     emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
> +     emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
> +     emit_insn (gen_<optab>di3 (t1, t1, t2));
> +     emit_move_insn (tdi, t1);
> +      }
> +    else
> +      {
> +     op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
> +     emit_move_insn (tdi, op1_lo);
> +      }
> +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
> +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
> +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
> +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
> +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
> +    DONE;
> +  }
> +)

Sorry to be awkward, but I've got mixed feelings about this.
The sequence produced looks good.  But we're not defining this
optab because the target has a special instruction for implementing
the operation.  We're just implementing it to tweak the loop emitted
by vect_create_epilog_for_reduction.  Maybe we should change that loop
instead, perhaps with a target hook?
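
For reference, both the loop emitted there and the new expander open-code
the same halve-and-combine reduction.  A minimal C sketch of that strategy
for the V8HI AND case (purely illustrative -- the function below is made up
and is not part of the patch or of GCC):

#include <stdint.h>
#include <string.h>

static inline int16_t
and_reduce_v8hi (const int16_t *a)
{
  uint64_t lo, hi;
  memcpy (&lo, a, 8);      /* lanes 0-3 */
  memcpy (&hi, a + 4, 8);  /* lanes 4-7 */
  uint64_t x = lo & hi;    /* 128 -> 64 bits */
  x &= x >> 32;            /* 64 -> 32 bits */
  x &= x >> 16;            /* 32 -> 16 bits */
  return (int16_t) x;
}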

Also, this seems related to the fact that:

typedef unsigned char v8qi __attribute__((vector_size(8)));
unsigned char foo(v8qi x) {
  x &= __builtin_shufflevector (x, x, 1, 2, 3, 4, 5, 6, 7, 0);
  return x[0];
}

generates:

        ext     v31.8b, v0.8b, v0.8b, #1
        and     v31.8b, v31.8b, v0.8b
        umov    x0, v31.d[0]
        ret

instead of something like:

        umov    w0, v0.h[0]
        and     w0, w0, w0, lsr #8
        ret

So an alternative might be to do some post-vectorisation simplification,
perhaps in isel?
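
To make the target of such a simplification concrete: lane 0 of
(x & rotate-by-one (x)) is just x[0] & x[1], so the example above could be
folded to something equivalent to this C (purely illustrative; the function
name is made up):

typedef unsigned char v8qi __attribute__((vector_size(8)));

unsigned char
foo_folded (v8qi x)
{
  /* Extract the first two lanes and combine them with a single scalar AND,
     which is what the two-instruction sequence above computes.  */
  return x[0] & x[1];
}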

cc:ing Richi for his thoughts.

Thanks,
Richard

> +
>  (define_insn "aarch64_reduc_<optab>_internal<mode>"
>   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
>         (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> new file mode 100644
> index 00000000000..9508288b218
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> @@ -0,0 +1,208 @@
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** fv16qi_and:
> +**   ldr     q([0-9]+), \[x0\]
> +**   ext     v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**   and     v\1\.8b, v\2\.8b, v\1\.8b
> +**   fmov    x0, d\1
> +**   and     x0, x0, x0, lsr 32
> +**   and     w0, w0, w0, lsr 16
> +**   and     w0, w0, w0, lsr 8
> +**   ret
> +*/
> +int8_t fv16qi_and (int8_t *a)
> +{
> +  int8_t b = -1;
> +  for (int i = 0; i < 16; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_and:
> +**   ldr     q([0-9]+), \[x0\]
> +**   ext     v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**   and     v\1\.8b, v\2\.8b, v\1\.8b
> +**   fmov    x0, d\1
> +**   and     x0, x0, x0, lsr 32
> +**   and     w0, w0, w0, lsr 16
> +**   ret
> +*/
> +int16_t fv8hi_and (int16_t *a)
> +{
> +  int16_t b = -1;
> +  for (int i = 0; i < 8; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv16qi_or:
> +**   ldr     q([0-9]+), \[x0\]
> +**   ext     v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**   orr     v\1\.8b, v\2\.8b, v\1\.8b
> +**   fmov    x0, d\1
> +**   orr     x0, x0, x0, lsr 32
> +**   orr     w0, w0, w0, lsr 16
> +**   orr     w0, w0, w0, lsr 8
> +**   ret
> +*/
> +int8_t fv16qi_or (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 16; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_or:
> +**   ldr     q([0-9]+), \[x0\]
> +**   ext     v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**   orr     v\1\.8b, v\2\.8b, v\1\.8b
> +**   fmov    x0, d\1
> +**   orr     x0, x0, x0, lsr 32
> +**   orr     w0, w0, w0, lsr 16
> +**   ret
> +*/
> +int16_t fv8hi_or (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv16qi_xor:
> +**   ldr     q([0-9]+), \[x0\]
> +**   ext     v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**   eor     v\1\.8b, v\2\.8b, v\1\.8b
> +**   fmov    x0, d\1
> +**   eor     x0, x0, x0, lsr 32
> +**   eor     w0, w0, w0, lsr 16
> +**   eor     w0, w0, w0, lsr 8
> +**   ret
> +*/
> +int8_t fv16qi_xor (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 16; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_xor:
> +**   ldr     q([0-9]+), \[x0\]
> +**   ext     v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**   eor     v\1\.8b, v\2\.8b, v\1\.8b
> +**   fmov    x0, d\1
> +**   eor     x0, x0, x0, lsr 32
> +**   eor     w0, w0, w0, lsr 16
> +**   ret
> +*/
> +int16_t fv8hi_xor (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_and:
> +**   ldr     x0, \[x0\]
> +**   and     x0, x0, x0, lsr 32
> +**   and     w0, w0, w0, lsr 16
> +**   and     w0, w0, w0, lsr 8
> +**   ret
> +*/
> +int8_t fv8qi_and (int8_t *a)
> +{
> +  int8_t b = -1;
> +  for (int i = 0; i < 8; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_and:
> +**   ldr     x0, \[x0\]
> +**   and     x0, x0, x0, lsr 32
> +**   and     w0, w0, w0, lsr 16
> +**   ret
> +*/
> +int16_t fv4hi_and (int16_t *a)
> +{
> +  int16_t b = -1;
> +  for (int i = 0; i < 4; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_or:
> +**   ldr     x0, \[x0\]
> +**   orr     x0, x0, x0, lsr 32
> +**   orr     w0, w0, w0, lsr 16
> +**   orr     w0, w0, w0, lsr 8
> +**   ret
> +*/
> +int8_t fv8qi_or (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_or:
> +**   ldr     x0, \[x0\]
> +**   orr     x0, x0, x0, lsr 32
> +**   orr     w0, w0, w0, lsr 16
> +**   ret
> +*/
> +int16_t fv4hi_or (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 4; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_xor:
> +**   ldr     x0, \[x0\]
> +**   eor     x0, x0, x0, lsr 32
> +**   eor     w0, w0, w0, lsr 16
> +**   eor     w0, w0, w0, lsr 8
> +**   ret
> +*/
> +int8_t fv8qi_xor (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_xor:
> +**   ldr     x0, \[x0\]
> +**   eor     x0, x0, x0, lsr 32
> +**   eor     w0, w0, w0, lsr 16
> +**   ret
> +*/
> +int16_t fv4hi_xor (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 4; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> index 918822a7d00..70c4ca18094 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> @@ -32,4 +32,4 @@ main (unsigned char argc, char **argv)
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
> +/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index 8f2afe866c7..44f737f15d0 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -9564,7 +9564,9 @@ proc check_effective_target_vect_logical_reduc { } {
>                  || [istarget amdgcn-*-*]
>                  || [check_effective_target_riscv_v]
>                  || [check_effective_target_loongarch_sx]
> -                || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
> +                || [istarget i?86-*-*]
> +                || [istarget x86_64-*-*]
> +                || [istarget aarch64*-*-*]}]
>  }
>  
>  # Return 1 if the target supports the fold_extract_last optab.
