On Tue, Aug 27, 2024 at 4:03 PM Robin Dapp <rdapp....@gmail.com> wrote:
>
> Hi,
>
> this is a hopefully better way to solve the "subreg problem" by first,
> in the generic case, have the RA go via memory and second, providing a
> vector-vector extract that deals with it in an optimized way.
>
> When the source mode is potentially larger than one vector (e.g. an
> LMUL2 mode for VLEN=128) we don't know which vector the subreg actually
> refers to.  For zvl128b and LMUL=2 the subreg in (subreg:V2DI (reg:V4DI))
> could actually be the a full (high) vector register of a two-register
> group (at VLEN=128) or the higher part of a single register (at VLEN>128).

Hmm - but how can you call this ambiguous?  VLEN and LMUL is a runtime
property(?), so unknown to the compiler(?) - as you do below the only
way to code generate would be a agnostic way such as with a slide-down.
But can't you always to this, for all subregs of this sort (even with offset)?

> As the subreg is statically ambiguous we prevent such situations in
> can_change_mode_class.
>
> The culprit in PR116086 is
>
>  _12 = BIT_FIELD_REF <vect_cst__42, 128, 128>;
>
> which can be expanded with a vector-vector extract (from V4DI to V2DI).
> This patch adds a VLS-mode vector-vector extract that handles "halving"
> cases like this one by sliding down the source vector, thus making sure
> the correct part is used.
>
> Regtested on rv64gcv_zvfh_zvbb.
>
> Regards
>  Robin
>
>         PR target/116086
>
> gcc/ChangeLog:
>
>         * config/riscv/autovec.md (vec_extract<mode><v_half>): Add
>         vector-vector extract for VLS modes.
>         * config/riscv/riscv.cc (riscv_can_change_mode_class): Forbid
>         VLS modes larger than one vector.
>         * config/riscv/vector-iterators.md: Add vector-vector extract
>         iterators.
>
> gcc/testsuite/ChangeLog:
>
>         * lib/target-supports.exp: Add effective target checks for
>         zvl256b and zvl512b.
>         * gcc.target/riscv/rvv/autovec/pr116086-2-run.c: New test.
>         * gcc.target/riscv/rvv/autovec/pr116086-2.c: New test.
>         * gcc.target/riscv/rvv/autovec/pr116086.c: New test.
> ---
>  gcc/config/riscv/autovec.md                   |  35 +++++
>  gcc/config/riscv/riscv.cc                     |  11 ++
>  gcc/config/riscv/vector-iterators.md          | 147 ++++++++++++++++++
>  .../riscv/rvv/autovec/pr116086-2-run.c        |   6 +
>  .../gcc.target/riscv/rvv/autovec/pr116086-2.c |  18 +++
>  .../gcc.target/riscv/rvv/autovec/pr116086.c   |  76 +++++++++
>  gcc/testsuite/lib/target-supports.exp         |  37 +++++
>  7 files changed, 330 insertions(+)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index f422ec0dc1e..5a7cf3523a7 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -1462,6 +1462,41 @@ (define_expand "vec_extract<mode>bi"
>    DONE;
>  })
>
> +;; -------------------------------------------------------------------------
> +;; ---- [INT,FP] Extract a vector from a vector.
> +;; -------------------------------------------------------------------------
> +;; TODO: This can be extended to allow basically any extract mode.
> +;; For now this helps optimize VLS subregs like (subreg:V2DI (reg:V4DI) 16)
> +;; that would otherwise need to go via memory.
> +
> +(define_expand "vec_extract<mode><v_half>"
> +  [(set (match_operand:<V_HALF>                 0 "nonimmediate_operand")
> +     (vec_select:<V_HALF>
> +       (match_operand:V_HAS_HALF        1 "register_operand")
> +       (parallel
> +        [(match_operand                 2 "immediate_operand")])))]
> +  "TARGET_VECTOR"
> +{
> +  int sz = GET_MODE_NUNITS (<V_HALF>mode).to_constant ();
> +  int part = INTVAL (operands[2]);
> +
> +  rtx start = GEN_INT (part * sz);
> +  rtx tmp = operands[1];
> +
> +  if (part != 0)
> +    {
> +      tmp = gen_reg_rtx (<MODE>mode);
> +
> +      rtx ops[] = {tmp, operands[1], start};
> +      riscv_vector::emit_vlmax_insn
> +       (code_for_pred_slide (UNSPEC_VSLIDEDOWN, <MODE>mode),
> +        riscv_vector::BINARY_OP, ops);
> +    }
> +
> +  emit_move_insn (operands[0], gen_lowpart (<V_HALF>mode, tmp));
> +  DONE;
> +})
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [FP] Binary operations
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 8538d405f50..4b9f3081ac5 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -10630,6 +10630,17 @@ riscv_can_change_mode_class (machine_mode from, 
> machine_mode to,
>    if (reg_classes_intersect_p (V_REGS, rclass)
>        && !ordered_p (GET_MODE_PRECISION (from), GET_MODE_PRECISION (to)))
>      return false;
> +
> +  /* Subregs of modes larger than one vector are ambiguous.
> +     A V4DImode with rv64gcv_zvl128b could, for example, span two 
> registers/one
> +     register group of two at VLEN = 128 or one register at VLEN >= 256 and
> +     we cannot, statically, determine which part of it to extract.
> +     Therefore prevent that.  */
> +  if (reg_classes_intersect_p (V_REGS, rclass)
> +      && riscv_v_ext_vls_mode_p (from)
> +      && !ordered_p (BITS_PER_RISCV_VECTOR, GET_MODE_PRECISION (from)))
> +      return false;
> +
>    return !reg_classes_intersect_p (FP_REGS, rclass);
>  }
>
> diff --git a/gcc/config/riscv/vector-iterators.md 
> b/gcc/config/riscv/vector-iterators.md
> index cbbd248c9bb..d662e6d376e 100644
> --- a/gcc/config/riscv/vector-iterators.md
> +++ b/gcc/config/riscv/vector-iterators.md
> @@ -4126,3 +4126,150 @@ (define_mode_attr VSIX8 [
>  (define_mode_attr VSIX16 [
>    (RVVMF2SI "RVVM8SI")
>  ])
> +
> +(define_mode_iterator V_HAS_HALF [
> +  V2QI V4QI V8QI V16QI V32QI V64QI V128QI V256QI V512QI V1024QI V2048QI 
> V4096QI
> +  V2HI V4HI V8HI V16HI V32HI V64HI V128HI V256HI V512HI V1024HI V2048HI
> +  V2SI V4SI V8SI V16SI V32SI V64SI V128SI V256SI V512SI V1024SI
> +  V2DI V4DI V8DI V16DI V32DI V64DI V128DI V256DI V512DI
> +  V2SF V4SF V8SF V16SF V32SF V64SF V128SF V256SF V512SF V1024SF
> +  V2DF V4DF V8DF V16DF V32DF V64DF V128DF V256DF V512DF
> +])
> +
> +(define_mode_attr V_HALF [
> +  (V2QI "V1QI")
> +  (V4QI "V2QI")
> +  (V8QI "V4QI")
> +  (V16QI "V8QI")
> +  (V32QI "V16QI")
> +  (V64QI "V32QI")
> +  (V128QI "V64QI")
> +  (V256QI "V128QI")
> +  (V512QI "V256QI")
> +  (V1024QI "V512QI")
> +  (V2048QI "V1024QI")
> +  (V4096QI "V2048QI")
> +
> +  (V2HI "V1HI")
> +  (V4HI "V2HI")
> +  (V8HI "V4HI")
> +  (V16HI "V8HI")
> +  (V32HI "V16HI")
> +  (V64HI "V32HI")
> +  (V128HI "V64HI")
> +  (V256HI "V128HI")
> +  (V512HI "V256HI")
> +  (V1024HI "V512HI")
> +  (V2048HI "V1024HI")
> +
> +  (V2SI "V1SI")
> +  (V4SI "V2SI")
> +  (V8SI "V4SI")
> +  (V16SI "V8SI")
> +  (V32SI "V16SI")
> +  (V64SI "V32SI")
> +  (V128SI "V64SI")
> +  (V256SI "V128SI")
> +  (V512SI "V256SI")
> +  (V1024SI "V512SI")
> +
> +  (V2DI "V1DI")
> +  (V4DI "V2DI")
> +  (V8DI "V4DI")
> +  (V16DI "V8DI")
> +  (V32DI "V16DI")
> +  (V64DI "V32DI")
> +  (V128DI "V64DI")
> +  (V256DI "V128DI")
> +  (V512DI "V256DI")
> +
> +  (V2SF "V1SF")
> +  (V4SF "V2SF")
> +  (V8SF "V4SF")
> +  (V16SF "V8SF")
> +  (V32SF "V16SF")
> +  (V64SF "V32SF")
> +  (V128SF "V64SF")
> +  (V256SF "V128SF")
> +  (V512SF "V256SF")
> +  (V1024SF "V512SF")
> +
> +  (V2DF "V1DF")
> +  (V4DF "V2DF")
> +  (V8DF "V4DF")
> +  (V16DF "V8DF")
> +  (V32DF "V16DF")
> +  (V64DF "V32DF")
> +  (V128DF "V64DF")
> +  (V256DF "V128DF")
> +  (V512DF "V256DF")
> +])
> +
> +(define_mode_attr v_half [
> +  (V2QI "v1qi")
> +  (V4QI "v2qi")
> +  (V8QI "v4qi")
> +  (V16QI "v8qi")
> +  (V32QI "v16qi")
> +  (V64QI "v32qi")
> +  (V128QI "v64qi")
> +  (V256QI "v128qi")
> +  (V512QI "v256qi")
> +  (V1024QI "v512qi")
> +  (V2048QI "v1024qi")
> +  (V4096QI "v2048qi")
> +
> +  (V2HI "v1hi")
> +  (V4HI "v2hi")
> +  (V8HI "v4hi")
> +  (V16HI "v8hi")
> +  (V32HI "v16hi")
> +  (V64HI "v32hi")
> +  (V128HI "v64hi")
> +  (V256HI "v128hi")
> +  (V512HI "v256hi")
> +  (V1024HI "v512hi")
> +  (V2048HI "v1024hi")
> +
> +  (V2SI "v1si")
> +  (V4SI "v2si")
> +  (V8SI "v4si")
> +  (V16SI "v8si")
> +  (V32SI "v16si")
> +  (V64SI "v32si")
> +  (V128SI "v64si")
> +  (V256SI "v128si")
> +  (V512SI "v256si")
> +  (V1024SI "v512si")
> +
> +  (V2DI "v1di")
> +  (V4DI "v2di")
> +  (V8DI "v4di")
> +  (V16DI "v8di")
> +  (V32DI "v16di")
> +  (V64DI "v32di")
> +  (V128DI "v64di")
> +  (V256DI "v128di")
> +  (V512DI "v256di")
> +
> +  (V2SF "v1sf")
> +  (V4SF "v2sf")
> +  (V8SF "v4sf")
> +  (V16SF "v8sf")
> +  (V32SF "v16sf")
> +  (V64SF "v32sf")
> +  (V128SF "v64sf")
> +  (V256SF "v128sf")
> +  (V512SF "v256sf")
> +  (V1024SF "v512sf")
> +
> +  (V2DF "v1df")
> +  (V4DF "v2df")
> +  (V8DF "v4df")
> +  (V16DF "v8df")
> +  (V32DF "v16df")
> +  (V64DF "v32df")
> +  (V128DF "v64df")
> +  (V256DF "v128df")
> +  (V512DF "v256df")
> +])
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c
> new file mode 100644
> index 00000000000..87cd3834067
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c
> @@ -0,0 +1,6 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target riscv_v } */
> +/* { dg-require-effective-target rvv_zvl256b_ok } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */
> +
> +#include "pr116086-2.c"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c
> new file mode 100644
> index 00000000000..8b5ea6be955
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */
> +
> +long a;
> +long b;
> +long c[80];
> +int main() {
> +    for (int d = 0; d < 16; d++)
> +      c[d] = a;
> +    for (int d = 16; d < 80; d++)
> +      c[d] = c[d - 2];
> +    for (int d = 0; d < 80; d += 8)
> +      b += c[d];
> +    if (b != 0)
> +      __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-assembler-times "vmv1r" 0 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c
> new file mode 100644
> index 00000000000..0d711f69437
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c
> @@ -0,0 +1,76 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target riscv_v } */
> +/* { dg-require-effective-target rvv_zvl256b_ok } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */
> +
> +typedef unsigned int uint32_t;
> +typedef unsigned long long uint64_t;
> +
> +typedef struct
> +{
> +    uint64_t length;
> +    uint64_t state[8];
> +    uint32_t curlen;
> +    unsigned char buf[128];
> +} sha512_state;
> +
> +static uint64_t load64(const unsigned char* y)
> +{
> +    uint64_t res = 0;
> +    for(int i = 0; i != 8; ++i)
> +        res |= (uint64_t)(y[i]) << ((7-i) * 8);
> +    return res;
> +}
> +
> +static const uint64_t K[80] =
> +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0, 0, 0 };
> +
> +__attribute__ ((noipa))
> +static void sha_compress(sha512_state *md, const unsigned char *buf)
> +{
> +    uint64_t S[8], W[80];
> +
> +    for(int i = 0; i < 8; i++)
> +      S[i] = 0;
> +
> +    // Copy the state into 1024-bits into W[0..15]
> +    for(int i = 0; i < 16; i++)
> +      W[i] = load64(buf + (8*i));
> +
> +    // Fill W[16..79]
> +    for(int i = 16; i < 80; i++)
> +      W[i] = W[i - 2] + W[i - 7] + W[i - 15] + W[i - 16];
> +
> +    S[7] = W[72];
> +
> +     // Feedback
> +    for(int i = 0; i < 8; i++)
> +      md->state[i] = md->state[i] + S[i];
> +}
> +
> +int main ()
> +{
> +  sha512_state md;
> +  md.curlen = 0;
> +  md.length = 0;
> +  md.state[0] = 0;
> +  md.state[1] = 0;
> +  md.state[2] = 0;
> +  md.state[3] = 0;
> +  md.state[4] = 0;
> +  md.state[5] = 0;
> +  md.state[6] = 0;
> +  md.state[7] = 0;
> +
> +  for (int i = 0; i < 128; i++)
> +    md.buf[i] = 0;
> +
> +  md.buf[md.curlen++] = (unsigned char)0x80;
> +
> +  sha_compress (&md, md.buf);
> +
> +  if (md.state[7] != 0x8000000000000000ULL)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/lib/target-supports.exp 
> b/gcc/testsuite/lib/target-supports.exp
> index 3501ce44b76..dab59011b06 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -1955,6 +1955,43 @@ proc check_effective_target_riscv_v { } {
>      }]
>  }
>
> +# Return 1 if the target runtime supports 256-bit vectors, 0 otherwise.
> +# Cache the result.
> +
> +proc check_effective_target_rvv_zvl256b_ok { } {
> +    # Check if the target has a VLENB of 256 bits.
> +    set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &v]
> +    return [check_runtime ${gcc_march}_exec {
> +       int main()
> +       {
> +         int vlenb = 0;
> +         asm ("csrr %0,vlenb" : "=r" (vlenb) : : );
> +         if (vlenb == 32)
> +           return 1;
> +         return 0;
> +       }
> +      } "-march=${gcc_march}"]
> +}
> +
> +# Return 1 if the target runtime supports 512-bit vectors, 0 otherwise.
> +# Cache the result.
> +
> +proc check_effective_target_rvv_zvl512b_ok { } {
> +    # Check if the target has a VLENB of 512 bits.
> +    set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &v]
> +    return [check_runtime ${gcc_march}_exec {
> +       int main()
> +       {
> +         int vlenb = 0;
> +         asm ("csrr %0,vlenb" : "=r" (vlenb) : : );
> +         if (vlenb == 64)
> +           return 1;
> +         return 0;
> +       }
> +      } "-march=${gcc_march}"]
> +}
> +
> +
>  # Return 1 if the target arch supports the Zvfh extension, 0 otherwise.
>  # Cache the result.
>
> --
> 2.46.0
>

Reply via email to