On Tue, Aug 27, 2024 at 4:03 PM Robin Dapp <rdapp....@gmail.com> wrote: > > Hi, > > this is a hopefully better way to solve the "subreg problem" by first, > in the generic case, have the RA go via memory and second, providing a > vector-vector extract that deals with it in an optimized way. > > When the source mode is potentially larger than one vector (e.g. an > LMUL2 mode for VLEN=128) we don't know which vector the subreg actually > refers to. For zvl128b and LMUL=2 the subreg in (subreg:V2DI (reg:V4DI)) > could actually be the a full (high) vector register of a two-register > group (at VLEN=128) or the higher part of a single register (at VLEN>128).
Hmm - but how can you call this ambiguous? VLEN and LMUL is a runtime property(?), so unknown to the compiler(?) - as you do below the only way to code generate would be a agnostic way such as with a slide-down. But can't you always to this, for all subregs of this sort (even with offset)? > As the subreg is statically ambiguous we prevent such situations in > can_change_mode_class. > > The culprit in PR116086 is > > _12 = BIT_FIELD_REF <vect_cst__42, 128, 128>; > > which can be expanded with a vector-vector extract (from V4DI to V2DI). > This patch adds a VLS-mode vector-vector extract that handles "halving" > cases like this one by sliding down the source vector, thus making sure > the correct part is used. > > Regtested on rv64gcv_zvfh_zvbb. > > Regards > Robin > > PR target/116086 > > gcc/ChangeLog: > > * config/riscv/autovec.md (vec_extract<mode><v_half>): Add > vector-vector extract for VLS modes. > * config/riscv/riscv.cc (riscv_can_change_mode_class): Forbid > VLS modes larger than one vector. > * config/riscv/vector-iterators.md: Add vector-vector extract > iterators. > > gcc/testsuite/ChangeLog: > > * lib/target-supports.exp: Add effective target checks for > zvl256b and zvl512b. > * gcc.target/riscv/rvv/autovec/pr116086-2-run.c: New test. > * gcc.target/riscv/rvv/autovec/pr116086-2.c: New test. > * gcc.target/riscv/rvv/autovec/pr116086.c: New test. > --- > gcc/config/riscv/autovec.md | 35 +++++ > gcc/config/riscv/riscv.cc | 11 ++ > gcc/config/riscv/vector-iterators.md | 147 ++++++++++++++++++ > .../riscv/rvv/autovec/pr116086-2-run.c | 6 + > .../gcc.target/riscv/rvv/autovec/pr116086-2.c | 18 +++ > .../gcc.target/riscv/rvv/autovec/pr116086.c | 76 +++++++++ > gcc/testsuite/lib/target-supports.exp | 37 +++++ > 7 files changed, 330 insertions(+) > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c > > diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md > index f422ec0dc1e..5a7cf3523a7 100644 > --- a/gcc/config/riscv/autovec.md > +++ b/gcc/config/riscv/autovec.md > @@ -1462,6 +1462,41 @@ (define_expand "vec_extract<mode>bi" > DONE; > }) > > +;; ------------------------------------------------------------------------- > +;; ---- [INT,FP] Extract a vector from a vector. > +;; ------------------------------------------------------------------------- > +;; TODO: This can be extended to allow basically any extract mode. > +;; For now this helps optimize VLS subregs like (subreg:V2DI (reg:V4DI) 16) > +;; that would otherwise need to go via memory. > + > +(define_expand "vec_extract<mode><v_half>" > + [(set (match_operand:<V_HALF> 0 "nonimmediate_operand") > + (vec_select:<V_HALF> > + (match_operand:V_HAS_HALF 1 "register_operand") > + (parallel > + [(match_operand 2 "immediate_operand")])))] > + "TARGET_VECTOR" > +{ > + int sz = GET_MODE_NUNITS (<V_HALF>mode).to_constant (); > + int part = INTVAL (operands[2]); > + > + rtx start = GEN_INT (part * sz); > + rtx tmp = operands[1]; > + > + if (part != 0) > + { > + tmp = gen_reg_rtx (<MODE>mode); > + > + rtx ops[] = {tmp, operands[1], start}; > + riscv_vector::emit_vlmax_insn > + (code_for_pred_slide (UNSPEC_VSLIDEDOWN, <MODE>mode), > + riscv_vector::BINARY_OP, ops); > + } > + > + emit_move_insn (operands[0], gen_lowpart (<V_HALF>mode, tmp)); > + DONE; > +}) > + > ;; ------------------------------------------------------------------------- > ;; ---- [FP] Binary operations > ;; ------------------------------------------------------------------------- > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc > index 8538d405f50..4b9f3081ac5 100644 > --- a/gcc/config/riscv/riscv.cc > +++ b/gcc/config/riscv/riscv.cc > @@ -10630,6 +10630,17 @@ riscv_can_change_mode_class (machine_mode from, > machine_mode to, > if (reg_classes_intersect_p (V_REGS, rclass) > && !ordered_p (GET_MODE_PRECISION (from), GET_MODE_PRECISION (to))) > return false; > + > + /* Subregs of modes larger than one vector are ambiguous. > + A V4DImode with rv64gcv_zvl128b could, for example, span two > registers/one > + register group of two at VLEN = 128 or one register at VLEN >= 256 and > + we cannot, statically, determine which part of it to extract. > + Therefore prevent that. */ > + if (reg_classes_intersect_p (V_REGS, rclass) > + && riscv_v_ext_vls_mode_p (from) > + && !ordered_p (BITS_PER_RISCV_VECTOR, GET_MODE_PRECISION (from))) > + return false; > + > return !reg_classes_intersect_p (FP_REGS, rclass); > } > > diff --git a/gcc/config/riscv/vector-iterators.md > b/gcc/config/riscv/vector-iterators.md > index cbbd248c9bb..d662e6d376e 100644 > --- a/gcc/config/riscv/vector-iterators.md > +++ b/gcc/config/riscv/vector-iterators.md > @@ -4126,3 +4126,150 @@ (define_mode_attr VSIX8 [ > (define_mode_attr VSIX16 [ > (RVVMF2SI "RVVM8SI") > ]) > + > +(define_mode_iterator V_HAS_HALF [ > + V2QI V4QI V8QI V16QI V32QI V64QI V128QI V256QI V512QI V1024QI V2048QI > V4096QI > + V2HI V4HI V8HI V16HI V32HI V64HI V128HI V256HI V512HI V1024HI V2048HI > + V2SI V4SI V8SI V16SI V32SI V64SI V128SI V256SI V512SI V1024SI > + V2DI V4DI V8DI V16DI V32DI V64DI V128DI V256DI V512DI > + V2SF V4SF V8SF V16SF V32SF V64SF V128SF V256SF V512SF V1024SF > + V2DF V4DF V8DF V16DF V32DF V64DF V128DF V256DF V512DF > +]) > + > +(define_mode_attr V_HALF [ > + (V2QI "V1QI") > + (V4QI "V2QI") > + (V8QI "V4QI") > + (V16QI "V8QI") > + (V32QI "V16QI") > + (V64QI "V32QI") > + (V128QI "V64QI") > + (V256QI "V128QI") > + (V512QI "V256QI") > + (V1024QI "V512QI") > + (V2048QI "V1024QI") > + (V4096QI "V2048QI") > + > + (V2HI "V1HI") > + (V4HI "V2HI") > + (V8HI "V4HI") > + (V16HI "V8HI") > + (V32HI "V16HI") > + (V64HI "V32HI") > + (V128HI "V64HI") > + (V256HI "V128HI") > + (V512HI "V256HI") > + (V1024HI "V512HI") > + (V2048HI "V1024HI") > + > + (V2SI "V1SI") > + (V4SI "V2SI") > + (V8SI "V4SI") > + (V16SI "V8SI") > + (V32SI "V16SI") > + (V64SI "V32SI") > + (V128SI "V64SI") > + (V256SI "V128SI") > + (V512SI "V256SI") > + (V1024SI "V512SI") > + > + (V2DI "V1DI") > + (V4DI "V2DI") > + (V8DI "V4DI") > + (V16DI "V8DI") > + (V32DI "V16DI") > + (V64DI "V32DI") > + (V128DI "V64DI") > + (V256DI "V128DI") > + (V512DI "V256DI") > + > + (V2SF "V1SF") > + (V4SF "V2SF") > + (V8SF "V4SF") > + (V16SF "V8SF") > + (V32SF "V16SF") > + (V64SF "V32SF") > + (V128SF "V64SF") > + (V256SF "V128SF") > + (V512SF "V256SF") > + (V1024SF "V512SF") > + > + (V2DF "V1DF") > + (V4DF "V2DF") > + (V8DF "V4DF") > + (V16DF "V8DF") > + (V32DF "V16DF") > + (V64DF "V32DF") > + (V128DF "V64DF") > + (V256DF "V128DF") > + (V512DF "V256DF") > +]) > + > +(define_mode_attr v_half [ > + (V2QI "v1qi") > + (V4QI "v2qi") > + (V8QI "v4qi") > + (V16QI "v8qi") > + (V32QI "v16qi") > + (V64QI "v32qi") > + (V128QI "v64qi") > + (V256QI "v128qi") > + (V512QI "v256qi") > + (V1024QI "v512qi") > + (V2048QI "v1024qi") > + (V4096QI "v2048qi") > + > + (V2HI "v1hi") > + (V4HI "v2hi") > + (V8HI "v4hi") > + (V16HI "v8hi") > + (V32HI "v16hi") > + (V64HI "v32hi") > + (V128HI "v64hi") > + (V256HI "v128hi") > + (V512HI "v256hi") > + (V1024HI "v512hi") > + (V2048HI "v1024hi") > + > + (V2SI "v1si") > + (V4SI "v2si") > + (V8SI "v4si") > + (V16SI "v8si") > + (V32SI "v16si") > + (V64SI "v32si") > + (V128SI "v64si") > + (V256SI "v128si") > + (V512SI "v256si") > + (V1024SI "v512si") > + > + (V2DI "v1di") > + (V4DI "v2di") > + (V8DI "v4di") > + (V16DI "v8di") > + (V32DI "v16di") > + (V64DI "v32di") > + (V128DI "v64di") > + (V256DI "v128di") > + (V512DI "v256di") > + > + (V2SF "v1sf") > + (V4SF "v2sf") > + (V8SF "v4sf") > + (V16SF "v8sf") > + (V32SF "v16sf") > + (V64SF "v32sf") > + (V128SF "v64sf") > + (V256SF "v128sf") > + (V512SF "v256sf") > + (V1024SF "v512sf") > + > + (V2DF "v1df") > + (V4DF "v2df") > + (V8DF "v4df") > + (V16DF "v8df") > + (V32DF "v16df") > + (V64DF "v32df") > + (V128DF "v64df") > + (V256DF "v128df") > + (V512DF "v256df") > +]) > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c > new file mode 100644 > index 00000000000..87cd3834067 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2-run.c > @@ -0,0 +1,6 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target riscv_v } */ > +/* { dg-require-effective-target rvv_zvl256b_ok } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */ > + > +#include "pr116086-2.c" > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c > new file mode 100644 > index 00000000000..8b5ea6be955 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086-2.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */ > + > +long a; > +long b; > +long c[80]; > +int main() { > + for (int d = 0; d < 16; d++) > + c[d] = a; > + for (int d = 16; d < 80; d++) > + c[d] = c[d - 2]; > + for (int d = 0; d < 80; d += 8) > + b += c[d]; > + if (b != 0) > + __builtin_abort (); > +} > + > +/* { dg-final { scan-assembler-times "vmv1r" 0 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c > new file mode 100644 > index 00000000000..0d711f69437 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116086.c > @@ -0,0 +1,76 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target riscv_v } */ > +/* { dg-require-effective-target rvv_zvl256b_ok } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m2" } */ > + > +typedef unsigned int uint32_t; > +typedef unsigned long long uint64_t; > + > +typedef struct > +{ > + uint64_t length; > + uint64_t state[8]; > + uint32_t curlen; > + unsigned char buf[128]; > +} sha512_state; > + > +static uint64_t load64(const unsigned char* y) > +{ > + uint64_t res = 0; > + for(int i = 0; i != 8; ++i) > + res |= (uint64_t)(y[i]) << ((7-i) * 8); > + return res; > +} > + > +static const uint64_t K[80] = > +{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0, 0, 0 }; > + > +__attribute__ ((noipa)) > +static void sha_compress(sha512_state *md, const unsigned char *buf) > +{ > + uint64_t S[8], W[80]; > + > + for(int i = 0; i < 8; i++) > + S[i] = 0; > + > + // Copy the state into 1024-bits into W[0..15] > + for(int i = 0; i < 16; i++) > + W[i] = load64(buf + (8*i)); > + > + // Fill W[16..79] > + for(int i = 16; i < 80; i++) > + W[i] = W[i - 2] + W[i - 7] + W[i - 15] + W[i - 16]; > + > + S[7] = W[72]; > + > + // Feedback > + for(int i = 0; i < 8; i++) > + md->state[i] = md->state[i] + S[i]; > +} > + > +int main () > +{ > + sha512_state md; > + md.curlen = 0; > + md.length = 0; > + md.state[0] = 0; > + md.state[1] = 0; > + md.state[2] = 0; > + md.state[3] = 0; > + md.state[4] = 0; > + md.state[5] = 0; > + md.state[6] = 0; > + md.state[7] = 0; > + > + for (int i = 0; i < 128; i++) > + md.buf[i] = 0; > + > + md.buf[md.curlen++] = (unsigned char)0x80; > + > + sha_compress (&md, md.buf); > + > + if (md.state[7] != 0x8000000000000000ULL) > + __builtin_abort (); > + > + return 0; > +} > diff --git a/gcc/testsuite/lib/target-supports.exp > b/gcc/testsuite/lib/target-supports.exp > index 3501ce44b76..dab59011b06 100644 > --- a/gcc/testsuite/lib/target-supports.exp > +++ b/gcc/testsuite/lib/target-supports.exp > @@ -1955,6 +1955,43 @@ proc check_effective_target_riscv_v { } { > }] > } > > +# Return 1 if the target runtime supports 256-bit vectors, 0 otherwise. > +# Cache the result. > + > +proc check_effective_target_rvv_zvl256b_ok { } { > + # Check if the target has a VLENB of 256 bits. > + set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &v] > + return [check_runtime ${gcc_march}_exec { > + int main() > + { > + int vlenb = 0; > + asm ("csrr %0,vlenb" : "=r" (vlenb) : : ); > + if (vlenb == 32) > + return 1; > + return 0; > + } > + } "-march=${gcc_march}"] > +} > + > +# Return 1 if the target runtime supports 512-bit vectors, 0 otherwise. > +# Cache the result. > + > +proc check_effective_target_rvv_zvl512b_ok { } { > + # Check if the target has a VLENB of 512 bits. > + set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &v] > + return [check_runtime ${gcc_march}_exec { > + int main() > + { > + int vlenb = 0; > + asm ("csrr %0,vlenb" : "=r" (vlenb) : : ); > + if (vlenb == 64) > + return 1; > + return 0; > + } > + } "-march=${gcc_march}"] > +} > + > + > # Return 1 if the target arch supports the Zvfh extension, 0 otherwise. > # Cache the result. > > -- > 2.46.0 >